Diffstat (limited to 'mm')
98 files changed, 7152 insertions, 4162 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0e26f4fc8717..bd0ea5454af8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -695,15 +695,6 @@ config PCP_BATCH_SCALE_MAX config PHYS_ADDR_T_64BIT def_bool 64BIT -config BOUNCE - bool "Enable bounce buffers" - default y - depends on BLOCK && MMU && HIGHMEM - help - Enable bounce buffers for devices that cannot access the full range of - memory available to the CPU. Enabled by default when HIGHMEM is - selected, but you may say n to override this. - config MMU_NOTIFIER bool select INTERVAL_TREE @@ -749,7 +740,7 @@ config MEMORY_FAILURE depends on MMU depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" - select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running @@ -862,6 +853,97 @@ choice enabled at runtime via sysfs. endchoice +choice + prompt "Shmem hugepage allocation defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER + help + Selects the hugepage allocation policy defaults for + the internal shmem mount. + + The selection made here can be overridden by using the kernel + command line 'transparent_hugepage_shmem=' option. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER + bool "never" + help + Disable hugepage allocation for shmem mount by default. It can + still be enabled with the kernel command line + 'transparent_hugepage_shmem=' option or at runtime via sysfs + knob. Note that madvise(MADV_COLLAPSE) can still cause + transparent huge pages to be obtained even if this mode is + specified. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS + bool "always" + help + Always attempt to allocate hugepage for shmem mount, can + increase the memory footprint of applications without a + guaranteed benefit but it will work automatically for all + applications. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE + bool "within_size" + help + Enable hugepage allocation for shmem mount if the allocation + will be fully within the i_size. This configuration also takes + into account any madvise(MADV_HUGEPAGE) hints that may be + provided by the applications. + + config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE + bool "advise" + help + Enable hugepage allocation for the shmem mount exclusively when + applications supply the madvise(MADV_HUGEPAGE) hint. + This ensures that hugepages are used only in response to explicit + requests from applications. +endchoice + +choice + prompt "Tmpfs hugepage allocation defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER + help + Selects the hugepage allocation policy defaults for + the tmpfs mount. + + The selection made here can be overridden by using the kernel + command line 'transparent_hugepage_tmpfs=' option. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER + bool "never" + help + Disable hugepage allocation for tmpfs mount by default. It can + still be enabled with the kernel command line + 'transparent_hugepage_tmpfs=' option. Note that + madvise(MADV_COLLAPSE) can still cause transparent huge pages + to be obtained even if this mode is specified. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS + bool "always" + help + Always attempt to allocate hugepage for tmpfs mount, can + increase the memory footprint of applications without a + guaranteed benefit but it will work automatically for all + applications. 
+ + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE + bool "within_size" + help + Enable hugepage allocation for tmpfs mount if the allocation + will be fully within the i_size. This configuration also takes + into account any madvise(MADV_HUGEPAGE) hints that may be + provided by the applications. + + config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE + bool "advise" + help + Enable hugepage allocation for the tmpfs mount exclusively when + applications supply the madvise(MADV_HUGEPAGE) hint. + This ensures that hugepages are used only in response to explicit + requests from applications. +endchoice + config THP_SWAP def_bool y depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT @@ -908,6 +990,16 @@ config PAGE_MAPCOUNT config PGTABLE_HAS_HUGE_LEAVES def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE +# +# We can end up creating gigantic folio. +# +config HAVE_GIGANTIC_FOLIOS + def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ + (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + +config ASYNC_KERNEL_PGTABLE_FREE + def_bool n + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 32b65073d0cc..7638d75b27db 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -175,10 +175,10 @@ config DEBUG_PAGE_REF nil until the tracepoints are actually enabled. config DEBUG_RODATA_TEST - bool "Testcase for the marking rodata read-only" - depends on STRICT_KERNEL_RWX + bool "Testcase for the marking rodata read-only" + depends on STRICT_KERNEL_RWX help - This option enables a testcase for the setting rodata read-only. + This option enables a testcase for the setting rodata read-only. config ARCH_HAS_DEBUG_WX bool diff --git a/mm/Makefile b/mm/Makefile index 21abb3353550..00ceb2418b64 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,7 +78,7 @@ endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o -obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o hugetlb_sysfs.o hugetlb_sysctl.o ifdef CONFIG_CMA obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41b6c9386b69..c5740c6d37a2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -72,7 +72,7 @@ static void collect_wb_stats(struct wb_stats *stats, list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) - if (inode->i_state & I_DIRTY_TIME) + if (inode_state_read_once(inode) & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); diff --git a/mm/damon/core.c b/mm/damon/core.c index 109b050c795a..f9fc0375890a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,6 +10,7 @@ #include <linux/damon.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/psi.h> #include <linux/slab.h> @@ -19,11 +20,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/damon.h> -#ifdef CONFIG_DAMON_KUNIT_TEST -#undef DAMON_MIN_REGION -#define DAMON_MIN_REGION 1 -#endif - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; static bool running_exclusive_ctxs; @@ -305,7 +301,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f) if (damos_filter_for_ops(f->type)) list_add_tail(&f->list, &s->ops_filters); else - list_add_tail(&f->list, &s->filters); + list_add_tail(&f->list, &s->core_filters); } static void damos_del_filter(struct damos_filter *f) @@ 
-396,7 +392,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, */ scheme->next_apply_sis = 0; scheme->walk_completed = false; - INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -449,7 +445,7 @@ void damon_destroy_scheme(struct damos *s) damos_for_each_quota_goal_safe(g, g_next, &s->quota) damos_destroy_quota_goal(g); - damos_for_each_filter_safe(f, next, s) + damos_for_each_core_filter_safe(f, next, s) damos_destroy_filter(f); damos_for_each_ops_filter_safe(f, next, s) @@ -478,6 +474,7 @@ struct damon_target *damon_new_target(void) t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); INIT_LIST_HEAD(&t->list); + t->obsolete = false; return t; } @@ -788,6 +785,11 @@ static void damos_commit_quota_goal_union( case DAMOS_QUOTA_NODE_MEM_FREE_BP: dst->nid = src->nid; break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: + dst->nid = src->nid; + dst->memcg_id = src->memcg_id; + break; default: break; } @@ -857,12 +859,12 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) return 0; } -static struct damos_filter *damos_nth_filter(int n, struct damos *s) +static struct damos_filter *damos_nth_core_filter(int n, struct damos *s) { struct damos_filter *filter; int i = 0; - damos_for_each_filter(filter, s) { + damos_for_each_core_filter(filter, s) { if (i++ == n) return filter; } @@ -916,15 +918,15 @@ static int damos_commit_core_filters(struct damos *dst, struct damos *src) struct damos_filter *dst_filter, *next, *src_filter, *new_filter; int i = 0, j = 0; - damos_for_each_filter_safe(dst_filter, next, dst) { - src_filter = damos_nth_filter(i++, src); + damos_for_each_core_filter_safe(dst_filter, next, dst) { + src_filter = damos_nth_core_filter(i++, src); if (src_filter) damos_commit_filter(dst_filter, src_filter); else damos_destroy_filter(dst_filter); } - damos_for_each_filter_safe(src_filter, next, src) { + damos_for_each_core_filter_safe(src_filter, next, src) { if (j++ < i) continue; @@ -988,41 +990,37 @@ static void damos_set_filters_default_reject(struct damos *s) s->core_filters_default_reject = false; else s->core_filters_default_reject = - damos_filters_default_reject(&s->filters); + damos_filters_default_reject(&s->core_filters); s->ops_filters_default_reject = damos_filters_default_reject(&s->ops_filters); } -static int damos_commit_dests(struct damos *dst, struct damos *src) +static int damos_commit_dests(struct damos_migrate_dests *dst, + struct damos_migrate_dests *src) { - struct damos_migrate_dests *dst_dests, *src_dests; - - dst_dests = &dst->migrate_dests; - src_dests = &src->migrate_dests; - - if (dst_dests->nr_dests != src_dests->nr_dests) { - kfree(dst_dests->node_id_arr); - kfree(dst_dests->weight_arr); + if (dst->nr_dests != src->nr_dests) { + kfree(dst->node_id_arr); + kfree(dst->weight_arr); - dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests, - sizeof(*dst_dests->node_id_arr), GFP_KERNEL); - if (!dst_dests->node_id_arr) { - dst_dests->weight_arr = NULL; + dst->node_id_arr = kmalloc_array(src->nr_dests, + sizeof(*dst->node_id_arr), GFP_KERNEL); + if (!dst->node_id_arr) { + dst->weight_arr = NULL; return -ENOMEM; } - dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests, - sizeof(*dst_dests->weight_arr), GFP_KERNEL); - if (!dst_dests->weight_arr) { + dst->weight_arr = kmalloc_array(src->nr_dests, + sizeof(*dst->weight_arr), GFP_KERNEL); + if 
(!dst->weight_arr) { /* ->node_id_arr will be freed by scheme destruction */ return -ENOMEM; } } - dst_dests->nr_dests = src_dests->nr_dests; - for (int i = 0; i < src_dests->nr_dests; i++) { - dst_dests->node_id_arr[i] = src_dests->node_id_arr[i]; - dst_dests->weight_arr[i] = src_dests->weight_arr[i]; + dst->nr_dests = src->nr_dests; + for (int i = 0; i < src->nr_dests; i++) { + dst->node_id_arr[i] = src->node_id_arr[i]; + dst->weight_arr[i] = src->weight_arr[i]; } return 0; @@ -1069,7 +1067,7 @@ static int damos_commit(struct damos *dst, struct damos *src) dst->wmarks = src->wmarks; dst->target_nid = src->target_nid; - err = damos_commit_dests(dst, src); + err = damos_commit_dests(&dst->migrate_dests, &src->migrate_dests); if (err) return err; @@ -1181,7 +1179,11 @@ static int damon_commit_targets( damon_for_each_target_safe(dst_target, next, dst) { src_target = damon_nth_target(i++, src); - if (src_target) { + /* + * If src target is obsolete, do not commit the parameters to + * the dst target, and further remove the dst target. + */ + if (src_target && !src_target->obsolete) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), @@ -1204,6 +1206,9 @@ static int damon_commit_targets( damon_for_each_target_safe(src_target, next, src) { if (j++ < i) continue; + /* target to remove has no matching dst */ + if (src_target->obsolete) + return -EINVAL; new_target = damon_new_target(); if (!new_target) return -ENOMEM; @@ -1434,7 +1439,7 @@ bool damon_is_running(struct damon_ctx *ctx) * Ask DAMON worker thread (kdamond) of @ctx to call a function with an * argument data that respectively passed via &damon_call_control->fn and * &damon_call_control->data of @control. If &damon_call_control->repeat of - * @control is set, further wait until the kdamond finishes handling of the + * @control is unset, further wait until the kdamond finishes handling of the * request. Otherwise, return as soon as the request is made. 
* * The kdamond executes the function with the argument in the main loop, just @@ -1757,7 +1762,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, struct damos_filter *filter; s->core_filters_allowed = false; - damos_for_each_filter(filter, s) { + damos_for_each_core_filter(filter, s) { if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; @@ -2035,12 +2040,50 @@ static __kernel_ulong_t damos_get_node_mem_bp( numerator = i.freeram; return numerator * 10000 / i.totalram; } + +static unsigned long damos_get_node_memcg_used_bp( + struct damos_quota_goal *goal) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + unsigned long used_pages, numerator; + struct sysinfo i; + + rcu_read_lock(); + memcg = mem_cgroup_from_id(goal->memcg_id); + rcu_read_unlock(); + if (!memcg) { + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + return 10000; + } + mem_cgroup_flush_stats(memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); + used_pages = lruvec_page_state(lruvec, NR_ACTIVE_ANON); + used_pages += lruvec_page_state(lruvec, NR_INACTIVE_ANON); + used_pages += lruvec_page_state(lruvec, NR_ACTIVE_FILE); + used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); + + si_meminfo_node(&i, goal->nid); + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + numerator = used_pages; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + numerator = i.totalram - used_pages; + return numerator * 10000 / i.totalram; +} #else static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { return 0; } + +static unsigned long damos_get_node_memcg_used_bp( + struct damos_quota_goal *goal) +{ + return 0; +} #endif @@ -2061,6 +2104,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_NODE_MEM_FREE_BP: goal->current_value = damos_get_node_mem_bp(goal); break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: + goal->current_value = damos_get_node_memcg_used_bp(goal); + break; default: break; } @@ -2770,6 +2817,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @min_sz_region: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the * values of @start and @end are zero, however, this function finds the biggest @@ -2780,7 +2828,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * Return: 0 on success, negative error code otherwise. 
*/ int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end) + unsigned long *start, unsigned long *end, + unsigned long min_sz_region) { struct damon_addr_range addr_range; @@ -2793,7 +2842,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); + return damon_set_regions(t, &addr_range, 1, min_sz_region); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 42b9a656f9de..49b4bc294f4e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 998c5180a603..a218d9922234 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -11,7 +11,7 @@ #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include "../internal.h" #include "ops-common.h" @@ -51,7 +51,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr if (likely(pte_present(pteval))) pfn = pte_pfn(pteval); else - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(softleaf_from_pte(pteval)); folio = damon_get_folio(pfn); if (!folio) @@ -75,12 +75,24 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd))); + pmd_t pmdval = pmdp_get(pmd); + struct folio *folio; + bool young = false; + unsigned long pfn; + if (likely(pmd_present(pmdval))) + pfn = pmd_pfn(pmdval); + else + pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); + + folio = damon_get_folio(pfn); if (!folio) return; - if (pmdp_clear_young_notify(vma, addr, pmd)) + if (likely(pmd_present(pmdval))) + young |= pmdp_clear_young_notify(vma, addr, pmd); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE); + if (young) folio_set_young(folio); folio_set_idle(folio); @@ -162,21 +174,17 @@ void damon_folio_mkold(struct folio *folio) .rmap_one = damon_folio_mkold_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); return; } - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); } @@ -203,7 +211,9 @@ static bool damon_folio_young_one(struct folio *folio, mmu_notifier_test_young(vma->vm_mm, addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - *accessed = pmd_young(pmdp_get(pvmw.pmd)) || + pmd_t pmd = pmdp_get(pvmw.pmd); + + *accessed = (pmd_present(pmd) && pmd_young(pmd)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); #else @@ -228,7 +238,6 @@ bool damon_folio_young(struct folio *folio) .rmap_one = damon_folio_young_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) @@ -237,14 +246,11 
@@ bool damon_folio_young(struct folio *folio) return true; } - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return false; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); return accessed; } diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7ba3d0f9a19a..36a582e09eae 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index d8010968bbed..ed8e3629d31a 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -46,6 +46,8 @@ MODULE_PARM_DESC(aggr_interval_us, static struct damon_ctx *damon_stat_context; +static unsigned long damon_stat_last_refresh_jiffies; + static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) { struct damon_target *t; @@ -130,13 +132,12 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) static int damon_stat_damon_call_fn(void *data) { struct damon_ctx *c = data; - static unsigned long last_refresh_jiffies; /* avoid unnecessarily frequent stat update */ - if (time_before_eq(jiffies, last_refresh_jiffies + + if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies + msecs_to_jiffies(5 * MSEC_PER_SEC))) return 0; - last_refresh_jiffies = jiffies; + damon_stat_last_refresh_jiffies = jiffies; aggr_interval_us = c->attrs.aggr_interval; damon_stat_set_estimated_memory_bandwidth(c); @@ -187,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + if (damon_set_region_biggest_system_ram_default(target, &start, &end, + ctx->min_sz_region)) goto free_out; return ctx; free_out: @@ -210,6 +212,8 @@ static int damon_stat_start(void) err = damon_start(&damon_stat_context, 1, true); if (err) return err; + + damon_stat_last_refresh_jiffies = jiffies; call_control.data = damon_stat_context; return damon_call(damon_stat_context, &call_control); } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6536f16006c9..30d20f5b3192 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -999,6 +999,7 @@ struct damos_sysfs_quota_goal { unsigned long target_value; unsigned long current_value; int nid; + char *path; }; static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) @@ -1029,6 +1030,14 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP, .name = "node_mem_free_bp", }, + { + .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP, + .name = "node_memcg_used_bp", + }, + { + .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + .name = "node_memcg_free_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -1112,7 +1121,6 @@ static ssize_t nid_show(struct kobject *kobj, struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); - /* todo: return error if the goal is not using nid */ return sysfs_emit(buf, "%d\n", goal->nid); } @@ -1128,10 +1136,39 @@ static ssize_t nid_store(struct kobject *kobj, return err ? 
err : count; } +static ssize_t path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + + return sysfs_emit(buf, "%s\n", goal->path ? goal->path : ""); +} + +static ssize_t path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + char *path = kmalloc_array(size_add(count, 1), sizeof(*path), + GFP_KERNEL); + + if (!path) + return -ENOMEM; + + strscpy(path, buf, count + 1); + kfree(goal->path); + goal->path = path; + return count; +} + static void damos_sysfs_quota_goal_release(struct kobject *kobj) { - /* or, notify this release to the feed callback */ - kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj)); + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + + kfree(goal->path); + kfree(goal); } static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr = @@ -1146,11 +1183,15 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = static struct kobj_attribute damos_sysfs_quota_goal_nid_attr = __ATTR_RW_MODE(nid, 0600); +static struct kobj_attribute damos_sysfs_quota_goal_path_attr = + __ATTR_RW_MODE(path, 0600); + static struct attribute *damos_sysfs_quota_goal_attrs[] = { &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, &damos_sysfs_quota_goal_nid_attr.attr, + &damos_sysfs_quota_goal_path_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damos_sysfs_quota_goal); @@ -2492,7 +2533,7 @@ static int damos_sysfs_add_quota_score( struct damos_quota *quota) { struct damos_quota_goal *goal; - int i; + int i, err; for (i = 0; i < sysfs_goals->nr; i++) { struct damos_sysfs_quota_goal *sysfs_goal = @@ -2513,6 +2554,16 @@ static int damos_sysfs_add_quota_score( case DAMOS_QUOTA_NODE_MEM_FREE_BP: goal->nid = sysfs_goal->nid; break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: + err = damon_sysfs_memcg_path_to_id( + sysfs_goal->path, &goal->memcg_id); + if (err) { + damos_destroy_quota_goal(goal); + return err; + } + goal->nid = sysfs_goal->nid; + break; default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cd6815ecc04e..e2bd2d7becdd 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -212,6 +212,7 @@ struct damon_sysfs_target { struct kobject kobj; struct damon_sysfs_regions *regions; int pid; + bool obsolete; }; static struct damon_sysfs_target *damon_sysfs_target_alloc(void) @@ -263,6 +264,29 @@ static ssize_t pid_target_store(struct kobject *kobj, return count; } +static ssize_t obsolete_target_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + + return sysfs_emit(buf, "%c\n", target->obsolete ? 
'Y' : 'N'); +} + +static ssize_t obsolete_target_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + bool obsolete; + int err = kstrtobool(buf, &obsolete); + + if (err) + return err; + target->obsolete = obsolete; + return count; +} + static void damon_sysfs_target_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_target, kobj)); @@ -271,8 +295,12 @@ static void damon_sysfs_target_release(struct kobject *kobj) static struct kobj_attribute damon_sysfs_target_pid_attr = __ATTR_RW_MODE(pid_target, 0600); +static struct kobj_attribute damon_sysfs_target_obsolete_attr = + __ATTR_RW_MODE(obsolete_target, 0600); + static struct attribute *damon_sysfs_target_attrs[] = { &damon_sysfs_target_pid_attr.attr, + &damon_sysfs_target_obsolete_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_target); @@ -1264,7 +1292,7 @@ enum damon_sysfs_cmd { DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, /* * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring - * intevals. + * intervals. */ DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS, /* @@ -1377,6 +1405,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } + t->obsolete = sys_target->obsolete; return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } @@ -1452,6 +1481,26 @@ static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx); /* + * Return a new damon_ctx for testing new parameters to commit. + */ +static struct damon_ctx *damon_sysfs_new_test_ctx( + struct damon_ctx *running_ctx) +{ + struct damon_ctx *test_ctx; + int err; + + test_ctx = damon_new_ctx(); + if (!test_ctx) + return NULL; + err = damon_commit_ctx(test_ctx, running_ctx); + if (err) { + damon_destroy_ctx(test_ctx); + return NULL; + } + return test_ctx; +} + +/* * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. * @kdamond: The kobject wrapper for the associated kdamond. 
* @@ -1472,7 +1521,7 @@ static int damon_sysfs_commit_input(void *data) param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); - test_ctx = damon_new_ctx(); + test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx); if (!test_ctx) return -ENOMEM; err = damon_commit_ctx(test_ctx, param_ctx); @@ -1552,16 +1601,17 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ctx; } +static unsigned long damon_sysfs_next_update_jiffies; + static int damon_sysfs_repeat_call_fn(void *data) { struct damon_sysfs_kdamond *sysfs_kdamond = data; - static unsigned long next_update_jiffies; if (!sysfs_kdamond->refresh_ms) return 0; - if (time_before(jiffies, next_update_jiffies)) + if (time_before(jiffies, damon_sysfs_next_update_jiffies)) return 0; - next_update_jiffies = jiffies + + damon_sysfs_next_update_jiffies = jiffies + msecs_to_jiffies(sysfs_kdamond->refresh_ms); if (!mutex_trylock(&damon_sysfs_lock)) @@ -1607,6 +1657,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) } kdamond->damon_ctx = ctx; + damon_sysfs_next_update_jiffies = + jiffies + msecs_to_jiffies(kdamond->refresh_ms); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; repeat_call_control->data = kdamond; repeat_call_control->repeat = true; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 51369e35298b..a1eff023e928 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -20,11 +20,17 @@ static void damon_test_regions(struct kunit *test) struct damon_target *t; r = damon_new_region(1, 2); + if (!r) + kunit_skip(test, "region alloc fail"); KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); t = damon_new_target(); + if (!t) { + damon_free_region(r); + kunit_skip(test, "target alloc fail"); + } KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); @@ -52,7 +58,14 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; + if (!c) + kunit_skip(test, "ctx alloc fail"); + t = damon_new_target(); + if (!t) { + damon_destroy_ctx(c); + kunit_skip(test, "target alloc fail"); + } KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); @@ -84,8 +97,15 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; + if (!ctx) + kunit_skip(test, "ctx alloc fail"); + for (it = 0; it < 3; it++) { t = damon_new_target(); + if (!t) { + damon_destroy_ctx(ctx); + kunit_skip(test, "target alloc fail"); + } damon_add_target(ctx, t); } @@ -93,6 +113,10 @@ static void damon_test_aggregate(struct kunit *test) damon_for_each_target(t, ctx) { for (ir = 0; ir < 3; ir++) { r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + if (!r) { + damon_destroy_ctx(ctx); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses = accesses[it][ir]; r->nr_accesses_bp = accesses[it][ir] * 10000; damon_add_region(r, t); @@ -120,12 +144,17 @@ static void damon_test_aggregate(struct kunit *test) static void damon_test_split_at(struct kunit *test) { - struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; struct damon_region *r, *r_new; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); r = damon_new_region(0, 100); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses_bp = 420000; r->nr_accesses = 42; r->last_nr_accesses = 15; @@ -143,7 +172,6 @@ static void 
damon_test_split_at(struct kunit *test) KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses); damon_free_target(t); - damon_destroy_ctx(c); } static void damon_test_merge_two(struct kunit *test) @@ -153,11 +181,21 @@ static void damon_test_merge_two(struct kunit *test) int i; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); r = damon_new_region(0, 100); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses = 10; r->nr_accesses_bp = 100000; damon_add_region(r, t); r2 = damon_new_region(100, 300); + if (!r2) { + damon_free_target(t); + kunit_skip(test, "second region alloc fail"); + } r2->nr_accesses = 20; r2->nr_accesses_bp = 200000; damon_add_region(r2, t); @@ -203,8 +241,14 @@ static void damon_test_merge_regions_of(struct kunit *test) int i; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } r->nr_accesses = nrs[i]; r->nr_accesses_bp = nrs[i] * 10000; damon_add_region(r, t); @@ -223,24 +267,34 @@ static void damon_test_merge_regions_of(struct kunit *test) static void damon_test_split_regions_of(struct kunit *test) { - struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; struct damon_region *r; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); r = damon_new_region(0, 22); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); - damon_split_regions_of(t, 2, DAMON_MIN_REGION); + damon_split_regions_of(t, 2, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); + if (!t) + kunit_skip(test, "second target alloc fail"); r = damon_new_region(0, 220); + if (!r) { + damon_free_target(t); + kunit_skip(test, "second region alloc fail"); + } damon_add_region(r, t); - damon_split_regions_of(t, 4, DAMON_MIN_REGION); + damon_split_regions_of(t, 4, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); - damon_destroy_ctx(c); } static void damon_test_ops_registration(struct kunit *test) @@ -249,6 +303,9 @@ static void damon_test_ops_registration(struct kunit *test) struct damon_operations ops = {.id = DAMON_OPS_VADDR}, bak; bool need_cleanup = false; + if (!c) + kunit_skip(test, "ctx alloc fail"); + /* DAMON_OPS_VADDR is registered only if CONFIG_DAMON_VADDR is set */ if (!damon_is_registered_ops(DAMON_OPS_VADDR)) { bak.id = DAMON_OPS_VADDR; @@ -294,16 +351,29 @@ static void damon_test_ops_registration(struct kunit *test) static void damon_test_set_regions(struct kunit *test) { struct damon_target *t = damon_new_target(); - struct damon_region *r1 = damon_new_region(4, 16); - struct damon_region *r2 = damon_new_region(24, 32); + struct damon_region *r1, *r2; struct damon_addr_range range = {.start = 8, .end = 28}; unsigned long expects[] = {8, 16, 16, 24, 24, 28}; int expect_idx = 0; struct damon_region *r; + if (!t) + kunit_skip(test, "target alloc fail"); + r1 = damon_new_region(4, 16); + if (!r1) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } + r2 = damon_new_region(24, 32); + if (!r2) { + damon_free_target(t); + damon_free_region(r1); + kunit_skip(test, "second region alloc fail"); + } + damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1, DAMON_MIN_REGION); + damon_set_regions(t, &range, 1, 1); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); 
damon_for_each_region(r, t) { @@ -342,6 +412,9 @@ static void damon_test_update_monitoring_result(struct kunit *test) struct damon_attrs new_attrs; struct damon_region *r = damon_new_region(3, 7); + if (!r) + kunit_skip(test, "region alloc fail"); + r->nr_accesses = 15; r->nr_accesses_bp = 150000; r->age = 20; @@ -375,6 +448,9 @@ static void damon_test_set_attrs(struct kunit *test) .sample_interval = 5000, .aggr_interval = 100000,}; struct damon_attrs invalid_attrs; + if (!c) + kunit_skip(test, "ctx alloc fail"); + KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0); invalid_attrs = valid_attrs; @@ -412,6 +488,8 @@ static void damos_test_new_filter(struct kunit *test) struct damos_filter *filter; filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); + if (!filter) + kunit_skip(test, "filter alloc fail"); KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); KUNIT_EXPECT_EQ(test, filter->matching, true); KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); @@ -419,20 +497,535 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_commit_quota_goal_for(struct kunit *test, + struct damos_quota_goal *dst, + struct damos_quota_goal *src) +{ + u64 dst_last_psi_total = 0; + + if (dst->metric == DAMOS_QUOTA_SOME_MEM_PSI_US) + dst_last_psi_total = dst->last_psi_total; + damos_commit_quota_goal(dst, src); + + KUNIT_EXPECT_EQ(test, dst->metric, src->metric); + KUNIT_EXPECT_EQ(test, dst->target_value, src->target_value); + if (src->metric == DAMOS_QUOTA_USER_INPUT) + KUNIT_EXPECT_EQ(test, dst->current_value, src->current_value); + if (dst_last_psi_total && src->metric == DAMOS_QUOTA_SOME_MEM_PSI_US) + KUNIT_EXPECT_EQ(test, dst->last_psi_total, dst_last_psi_total); + switch (dst->metric) { + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + KUNIT_EXPECT_EQ(test, dst->nid, src->nid); + break; + case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: + KUNIT_EXPECT_EQ(test, dst->nid, src->nid); + KUNIT_EXPECT_EQ(test, dst->memcg_id, src->memcg_id); + break; + default: + break; + } +} + +static void damos_test_commit_quota_goal(struct kunit *test) +{ + struct damos_quota_goal dst = { + .metric = DAMOS_QUOTA_SOME_MEM_PSI_US, + .target_value = 1000, + .current_value = 123, + .last_psi_total = 456, + }; + + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 789, + .current_value = 12}); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP, + .target_value = 345, + .current_value = 678, + .nid = 9, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEM_USED_BP, + .target_value = 12, + .current_value = 345, + .nid = 6, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP, + .target_value = 456, + .current_value = 567, + .nid = 6, + .memcg_id = 7, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal){ + .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + .target_value = 890, + .current_value = 901, + .nid = 10, + .memcg_id = 1, + }); + damos_test_commit_quota_goal_for(test, &dst, + &(struct damos_quota_goal) { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 789, + .current_value = 12, + }); +} + +static void damos_test_commit_quota_goals_for(struct kunit *test, + struct 
damos_quota_goal *dst_goals, int nr_dst_goals, + struct damos_quota_goal *src_goals, int nr_src_goals) +{ + struct damos_quota dst, src; + struct damos_quota_goal *goal, *next; + bool skip = true; + int i; + + INIT_LIST_HEAD(&dst.goals); + INIT_LIST_HEAD(&src.goals); + + for (i = 0; i < nr_dst_goals; i++) { + /* + * When nr_src_goals is smaller than dst_goals, + * damos_commit_quota_goals() will kfree() the dst goals. + * Make it kfree()-able. + */ + goal = damos_new_quota_goal(dst_goals[i].metric, + dst_goals[i].target_value); + if (!goal) + goto out; + damos_add_quota_goal(&dst, goal); + } + skip = false; + for (i = 0; i < nr_src_goals; i++) + damos_add_quota_goal(&src, &src_goals[i]); + + damos_commit_quota_goals(&dst, &src); + + i = 0; + damos_for_each_quota_goal(goal, (&dst)) { + KUNIT_EXPECT_EQ(test, goal->metric, src_goals[i].metric); + KUNIT_EXPECT_EQ(test, goal->target_value, + src_goals[i++].target_value); + } + KUNIT_EXPECT_EQ(test, i, nr_src_goals); + +out: + damos_for_each_quota_goal_safe(goal, next, (&dst)) + damos_destroy_quota_goal(goal); + if (skip) + kunit_skip(test, "goal alloc fail"); +} + +static void damos_test_commit_quota_goals(struct kunit *test) +{ + damos_test_commit_quota_goals_for(test, + (struct damos_quota_goal[]){}, 0, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 123, + }, + }, 1); + damos_test_commit_quota_goals_for(test, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 234, + }, + + }, 1, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 345, + }, + }, 1); + damos_test_commit_quota_goals_for(test, + (struct damos_quota_goal[]){ + { + .metric = DAMOS_QUOTA_USER_INPUT, + .target_value = 456, + }, + + }, 1, + (struct damos_quota_goal[]){}, 0); +} + +static void damos_test_commit_quota(struct kunit *test) +{ + struct damos_quota dst = { + .reset_interval = 1, + .ms = 2, + .sz = 3, + .weight_sz = 4, + .weight_nr_accesses = 5, + .weight_age = 6, + }; + struct damos_quota src = { + .reset_interval = 7, + .ms = 8, + .sz = 9, + .weight_sz = 10, + .weight_nr_accesses = 11, + .weight_age = 12, + }; + + INIT_LIST_HEAD(&dst.goals); + INIT_LIST_HEAD(&src.goals); + + damos_commit_quota(&dst, &src); + + KUNIT_EXPECT_EQ(test, dst.reset_interval, src.reset_interval); + KUNIT_EXPECT_EQ(test, dst.ms, src.ms); + KUNIT_EXPECT_EQ(test, dst.sz, src.sz); + KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); + KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); + KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); +} + +static int damos_test_help_dests_setup(struct damos_migrate_dests *dests, + unsigned int *node_id_arr, unsigned int *weight_arr, + size_t nr_dests) +{ + size_t i; + + dests->node_id_arr = kmalloc_array(nr_dests, + sizeof(*dests->node_id_arr), GFP_KERNEL); + if (!dests->node_id_arr) + return -ENOMEM; + dests->weight_arr = kmalloc_array(nr_dests, + sizeof(*dests->weight_arr), GFP_KERNEL); + if (!dests->weight_arr) { + kfree(dests->node_id_arr); + dests->node_id_arr = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_dests; i++) { + dests->node_id_arr[i] = node_id_arr[i]; + dests->weight_arr[i] = weight_arr[i]; + } + dests->nr_dests = nr_dests; + return 0; +} + +static void damos_test_help_dests_free(struct damos_migrate_dests *dests) +{ + kfree(dests->node_id_arr); + kfree(dests->weight_arr); +} + +static void damos_test_commit_dests_for(struct kunit *test, + unsigned int *dst_node_id_arr, unsigned int *dst_weight_arr, + 
size_t dst_nr_dests, + unsigned int *src_node_id_arr, unsigned int *src_weight_arr, + size_t src_nr_dests) +{ + struct damos_migrate_dests dst = {}, src = {}; + int i, err; + bool skip = true; + + err = damos_test_help_dests_setup(&dst, dst_node_id_arr, + dst_weight_arr, dst_nr_dests); + if (err) + kunit_skip(test, "dests setup fail"); + err = damos_test_help_dests_setup(&src, src_node_id_arr, + src_weight_arr, src_nr_dests); + if (err) { + damos_test_help_dests_free(&dst); + kunit_skip(test, "src setup fail"); + } + err = damos_commit_dests(&dst, &src); + if (err) + goto out; + skip = false; + + KUNIT_EXPECT_EQ(test, dst.nr_dests, src_nr_dests); + for (i = 0; i < dst.nr_dests; i++) { + KUNIT_EXPECT_EQ(test, dst.node_id_arr[i], src_node_id_arr[i]); + KUNIT_EXPECT_EQ(test, dst.weight_arr[i], src_weight_arr[i]); + } + +out: + damos_test_help_dests_free(&dst); + damos_test_help_dests_free(&src); + if (skip) + kunit_skip(test, "skip"); +} + +static void damos_test_commit_dests(struct kunit *test) +{ + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4}, + 3, + (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7}, + 3); + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2}, (unsigned int[]){2, 3}, + 2, + (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7}, + 3); + damos_test_commit_dests_for(test, + NULL, NULL, 0, + (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7}, + 3); + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4}, + 3, + (unsigned int[]){4, 5}, (unsigned int[]){5, 6}, 2); + damos_test_commit_dests_for(test, + (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4}, + 3, + NULL, NULL, 0); +} + +static void damos_test_commit_filter_for(struct kunit *test, + struct damos_filter *dst, struct damos_filter *src) +{ + damos_commit_filter(dst, src); + KUNIT_EXPECT_EQ(test, dst->type, src->type); + KUNIT_EXPECT_EQ(test, dst->matching, src->matching); + KUNIT_EXPECT_EQ(test, dst->allow, src->allow); + switch (src->type) { + case DAMOS_FILTER_TYPE_MEMCG: + KUNIT_EXPECT_EQ(test, dst->memcg_id, src->memcg_id); + break; + case DAMOS_FILTER_TYPE_ADDR: + KUNIT_EXPECT_EQ(test, dst->addr_range.start, + src->addr_range.start); + KUNIT_EXPECT_EQ(test, dst->addr_range.end, + src->addr_range.end); + break; + case DAMOS_FILTER_TYPE_TARGET: + KUNIT_EXPECT_EQ(test, dst->target_idx, src->target_idx); + break; + case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: + KUNIT_EXPECT_EQ(test, dst->sz_range.min, src->sz_range.min); + KUNIT_EXPECT_EQ(test, dst->sz_range.max, src->sz_range.max); + break; + default: + break; + } +} + static void damos_test_commit_filter(struct kunit *test) { - struct damos_filter *src_filter = damos_new_filter( - DAMOS_FILTER_TYPE_ANON, true, true); - struct damos_filter *dst_filter = damos_new_filter( - DAMOS_FILTER_TYPE_ACTIVE, false, false); + struct damos_filter dst = { + .type = DAMOS_FILTER_TYPE_ACTIVE, + .matching = false, + .allow = false, + }; + + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_ANON, + .matching = true, + .allow = true, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_MEMCG, + .matching = false, + .allow = false, + .memcg_id = 123, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_YOUNG, + .matching = true, + .allow = true, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + 
.matching = false, + .allow = false, + .sz_range = {.min = 234, .max = 345}, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_UNMAPPED, + .matching = true, + .allow = true, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_ADDR, + .matching = false, + .allow = false, + .addr_range = {.start = 456, .end = 567}, + }); + damos_test_commit_filter_for(test, &dst, + &(struct damos_filter){ + .type = DAMOS_FILTER_TYPE_TARGET, + .matching = true, + .allow = true, + .target_idx = 6, + }); +} + +static void damos_test_help_initailize_scheme(struct damos *scheme) +{ + INIT_LIST_HEAD(&scheme->quota.goals); + INIT_LIST_HEAD(&scheme->core_filters); + INIT_LIST_HEAD(&scheme->ops_filters); +} - damos_commit_filter(dst_filter, src_filter); - KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type); - KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching); - KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow); +static void damos_test_commit_for(struct kunit *test, struct damos *dst, + struct damos *src) +{ + int err; + + damos_test_help_initailize_scheme(dst); + damos_test_help_initailize_scheme(src); + + err = damos_commit(dst, src); + if (err) + kunit_skip(test, "damos_commit fail"); + + KUNIT_EXPECT_EQ(test, dst->pattern.min_sz_region, + src->pattern.min_sz_region); + KUNIT_EXPECT_EQ(test, dst->pattern.max_sz_region, + src->pattern.max_sz_region); + KUNIT_EXPECT_EQ(test, dst->pattern.min_nr_accesses, + src->pattern.min_nr_accesses); + KUNIT_EXPECT_EQ(test, dst->pattern.max_nr_accesses, + src->pattern.max_nr_accesses); + KUNIT_EXPECT_EQ(test, dst->pattern.min_age_region, + src->pattern.min_age_region); + KUNIT_EXPECT_EQ(test, dst->pattern.max_age_region, + src->pattern.max_age_region); + + KUNIT_EXPECT_EQ(test, dst->action, src->action); + KUNIT_EXPECT_EQ(test, dst->apply_interval_us, src->apply_interval_us); + + KUNIT_EXPECT_EQ(test, dst->wmarks.metric, src->wmarks.metric); + KUNIT_EXPECT_EQ(test, dst->wmarks.interval, src->wmarks.interval); + KUNIT_EXPECT_EQ(test, dst->wmarks.high, src->wmarks.high); + KUNIT_EXPECT_EQ(test, dst->wmarks.mid, src->wmarks.mid); + KUNIT_EXPECT_EQ(test, dst->wmarks.low, src->wmarks.low); + + switch (src->action) { + case DAMOS_MIGRATE_COLD: + case DAMOS_MIGRATE_HOT: + KUNIT_EXPECT_EQ(test, dst->target_nid, src->target_nid); + break; + default: + break; + } +} - damos_destroy_filter(src_filter); - damos_destroy_filter(dst_filter); +static void damos_test_commit(struct kunit *test) +{ + damos_test_commit_for(test, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 1, 2, 3, 4, 5, 6}, + .action = DAMOS_PAGEOUT, + .apply_interval_us = 1000000, + .wmarks = (struct damos_watermarks){ + DAMOS_WMARK_FREE_MEM_RATE, + 900, 100, 50}, + }, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 2, 3, 4, 5, 6, 7}, + .action = DAMOS_PAGEOUT, + .apply_interval_us = 2000000, + .wmarks = (struct damos_watermarks){ + DAMOS_WMARK_FREE_MEM_RATE, + 800, 50, 30}, + }); + damos_test_commit_for(test, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 1, 2, 3, 4, 5, 6}, + .action = DAMOS_PAGEOUT, + .apply_interval_us = 1000000, + .wmarks = (struct damos_watermarks){ + DAMOS_WMARK_FREE_MEM_RATE, + 900, 100, 50}, + }, + &(struct damos){ + .pattern = (struct damos_access_pattern){ + 2, 3, 4, 5, 6, 7}, + .action = DAMOS_MIGRATE_HOT, + .apply_interval_us = 2000000, + .target_nid = 5, + }); +} + +static struct damon_target 
*damon_test_help_setup_target( + unsigned long region_start_end[][2], int nr_regions) +{ + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(); + if (!t) + return NULL; + for (i = 0; i < nr_regions; i++) { + r = damon_new_region(region_start_end[i][0], + region_start_end[i][1]); + if (!r) { + damon_free_target(t); + return NULL; + } + damon_add_region(r, t); + } + return t; +} + +static void damon_test_commit_target_regions_for(struct kunit *test, + unsigned long dst_start_end[][2], int nr_dst_regions, + unsigned long src_start_end[][2], int nr_src_regions, + unsigned long expect_start_end[][2], int nr_expect_regions) +{ + struct damon_target *dst_target, *src_target; + struct damon_region *r; + int i; + + dst_target = damon_test_help_setup_target(dst_start_end, nr_dst_regions); + if (!dst_target) + kunit_skip(test, "dst target setup fail"); + src_target = damon_test_help_setup_target(src_start_end, nr_src_regions); + if (!src_target) { + damon_free_target(dst_target); + kunit_skip(test, "src target setup fail"); + } + damon_commit_target_regions(dst_target, src_target, 1); + i = 0; + damon_for_each_region(r, dst_target) { + KUNIT_EXPECT_EQ(test, r->ar.start, expect_start_end[i][0]); + KUNIT_EXPECT_EQ(test, r->ar.end, expect_start_end[i][1]); + i++; + } + KUNIT_EXPECT_EQ(test, damon_nr_regions(dst_target), nr_expect_regions); + KUNIT_EXPECT_EQ(test, i, nr_expect_regions); + damon_free_target(dst_target); + damon_free_target(src_target); +} + +static void damon_test_commit_target_regions(struct kunit *test) +{ + damon_test_commit_target_regions_for(test, + (unsigned long[][2]) {{3, 8}, {8, 10}}, 2, + (unsigned long[][2]) {{4, 6}}, 1, + (unsigned long[][2]) {{4, 6}}, 1); + damon_test_commit_target_regions_for(test, + (unsigned long[][2]) {{3, 8}, {8, 10}}, 2, + (unsigned long[][2]) {}, 0, + (unsigned long[][2]) {{3, 8}, {8, 10}}, 2); } static void damos_test_filter_out(struct kunit *test) @@ -442,58 +1035,66 @@ static void damos_test_filter_out(struct kunit *test) struct damos_filter *f; f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); - f->addr_range = (struct damon_addr_range){ - .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; + if (!f) + kunit_skip(test, "filter alloc fail"); + f->addr_range = (struct damon_addr_range){.start = 2, .end = 6}; t = damon_new_target(); - r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5); + if (!t) { + damos_destroy_filter(f); + kunit_skip(test, "target alloc fail"); + } + r = damon_new_region(3, 5); + if (!r) { + damos_destroy_filter(f); + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f, 1)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ - r->ar.start = DAMON_MIN_REGION * 1; - r->ar.end = DAMON_MIN_REGION * 2; + r->ar.start = 1; + r->ar.end = 2; KUNIT_EXPECT_FALSE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + damos_filter_match(NULL, t, r, f, 1)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ - r->ar.start = DAMON_MIN_REGION * 6; - r->ar.end = DAMON_MIN_REGION * 8; + r->ar.start = 6; + r->ar.end = 8; KUNIT_EXPECT_FALSE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + damos_filter_match(NULL, t, r, f, 1)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ - 
r->ar.start = DAMON_MIN_REGION * 1; - r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + r->ar.start = 1; + r->ar.end = 4; + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f, 1)); /* filter should have split the region */ - KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); - KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, r->ar.start, 1); + KUNIT_EXPECT_EQ(test, r->ar.end, 2); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); r2 = damon_next_region(r); - KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2); - KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4); + KUNIT_EXPECT_EQ(test, r2->ar.start, 2); + KUNIT_EXPECT_EQ(test, r2->ar.end, 4); damon_destroy_region(r2, t); /* region started in the range */ - r->ar.start = DAMON_MIN_REGION * 2; - r->ar.end = DAMON_MIN_REGION * 8; + r->ar.start = 2; + r->ar.end = 8; KUNIT_EXPECT_TRUE(test, - damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); + damos_filter_match(NULL, t, r, f, 1)); /* filter should have split the region */ - KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); - KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); + KUNIT_EXPECT_EQ(test, r->ar.start, 2); + KUNIT_EXPECT_EQ(test, r->ar.end, 6); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); r2 = damon_next_region(r); - KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6); - KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8); + KUNIT_EXPECT_EQ(test, r2->ar.start, 6); + KUNIT_EXPECT_EQ(test, r2->ar.end, 8); damon_destroy_region(r2, t); damon_free_target(t); @@ -536,7 +1137,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test) struct damos scheme; struct damos_filter *target_filter, *anon_filter; - INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.core_filters); INIT_LIST_HEAD(&scheme.ops_filters); damos_set_filters_default_reject(&scheme); @@ -548,6 +1149,8 @@ static void damon_test_set_filters_default_reject(struct kunit *test) KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true); + if (!target_filter) + kunit_skip(test, "filter alloc fail"); damos_add_filter(&scheme, target_filter); damos_set_filters_default_reject(&scheme); /* @@ -573,6 +1176,10 @@ static void damon_test_set_filters_default_reject(struct kunit *test) KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + if (!anon_filter) { + damos_free_filter(target_filter); + kunit_skip(test, "anon_filter alloc fail"); + } damos_add_filter(&scheme, anon_filter); damos_set_filters_default_reject(&scheme); @@ -598,6 +1205,9 @@ static void damon_test_set_filters_default_reject(struct kunit *test) */ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); + + damos_free_filter(anon_filter); + damos_free_filter(target_filter); } static struct kunit_case damon_test_cases[] = { @@ -615,7 +1225,13 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_attrs), KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), + KUNIT_CASE(damos_test_commit_quota_goal), + KUNIT_CASE(damos_test_commit_quota_goals), + KUNIT_CASE(damos_test_commit_quota), + KUNIT_CASE(damos_test_commit_dests), KUNIT_CASE(damos_test_commit_filter), + KUNIT_CASE(damos_test_commit), + KUNIT_CASE(damon_test_commit_target_regions), 
KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), diff --git a/mm/damon/tests/sysfs-kunit.h b/mm/damon/tests/sysfs-kunit.h index 7b5c7b307da9..0c665ed255a3 100644 --- a/mm/damon/tests/sysfs-kunit.h +++ b/mm/damon/tests/sysfs-kunit.h @@ -45,16 +45,41 @@ static void damon_sysfs_test_add_targets(struct kunit *test) struct damon_ctx *ctx; sysfs_targets = damon_sysfs_targets_alloc(); + if (!sysfs_targets) + kunit_skip(test, "sysfs_targets alloc fail"); sysfs_targets->nr = 1; sysfs_targets->targets_arr = kmalloc_array(1, sizeof(*sysfs_targets->targets_arr), GFP_KERNEL); + if (!sysfs_targets->targets_arr) { + kfree(sysfs_targets); + kunit_skip(test, "targets_arr alloc fail"); + } sysfs_target = damon_sysfs_target_alloc(); + if (!sysfs_target) { + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kunit_skip(test, "sysfs_target alloc fail"); + } sysfs_target->pid = __damon_sysfs_test_get_any_pid(12, 100); sysfs_target->regions = damon_sysfs_regions_alloc(); + if (!sysfs_target->regions) { + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kfree(sysfs_target); + kunit_skip(test, "sysfs_regions alloc fail"); + } + sysfs_targets->targets_arr[0] = sysfs_target; ctx = damon_new_ctx(); + if (!ctx) { + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kfree(sysfs_target->regions); + kfree(sysfs_target); + kunit_skip(test, "ctx alloc fail"); + } damon_sysfs_add_targets(ctx, sysfs_targets); KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx)); diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index fce38dd53cf8..30dc5459f1d2 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -136,8 +136,14 @@ static void damon_do_test_apply_three_regions(struct kunit *test, int i; t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail"); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + if (!r) { + damon_destroy_target(t, NULL); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); } @@ -250,7 +256,16 @@ static void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { struct damon_target *t = damon_new_target(); - struct damon_region *r = damon_new_region(start, end); + struct damon_region *r; + + if (!t) + kunit_skip(test, "target alloc fail"); + + r = damon_new_region(start, end); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); KUNIT_EXPECT_EQ(test, @@ -269,10 +284,17 @@ static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { struct damon_target *t = damon_new_target(); - struct damon_region *r = damon_new_region(start, end); + struct damon_region *r; unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; + if (!t) + kunit_skip(test, "target alloc fail"); + r = damon_new_region(start, end); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } damon_add_region(r, t); KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, nr_pieces), 0); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 7e834467b2d8..2750c88e7225 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -307,24 +307,16 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; - pmd_t pmde; spinlock_t *ptl; - if 
(pmd_trans_huge(pmdp_get(pmd))) { - ptl = pmd_lock(walk->mm, pmd); - pmde = pmdp_get(pmd); + ptl = pmd_trans_huge_lock(pmd, walk->vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - if (!pmd_present(pmde)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_trans_huge(pmde)) { + if (pmd_present(pmde)) damon_pmdp_mkold(pmd, walk->vma, addr); - spin_unlock(ptl); - return 0; - } spin_unlock(ptl); + return 0; } pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); @@ -446,22 +438,13 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(pmdp_get(pmd))) { - pmd_t pmde; - - ptl = pmd_lock(walk->mm, pmd); - pmde = pmdp_get(pmd); + ptl = pmd_trans_huge_lock(pmd, walk->vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - if (!pmd_present(pmde)) { - spin_unlock(ptl); - return 0; - } - - if (!pmd_trans_huge(pmde)) { - spin_unlock(ptl); - goto regular_page; - } - folio = damon_get_folio(pmd_pfn(pmde)); + if (!pmd_present(pmde)) + goto huge_out; + folio = vm_normal_folio_pmd(walk->vma, addr, pmde); if (!folio) goto huge_out; if (pmd_young(pmde) || !folio_test_idle(folio) || @@ -469,13 +452,10 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, addr)) priv->young = true; *priv->folio_sz = HPAGE_PMD_SIZE; - folio_put(folio); huge_out: spin_unlock(ptl); return 0; } - -regular_page: #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); @@ -484,14 +464,13 @@ regular_page: ptent = ptep_get(pte); if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(ptent)); + folio = vm_normal_folio(walk->vma, addr, ptent); if (!folio) goto out; if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); - folio_put(folio); out: pte_unmap_unlock(pte, ptl); return 0; @@ -718,7 +697,6 @@ isolate: list_add(&folio->lru, &migration_lists[i]); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -728,63 +706,49 @@ static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, struct damos_migrate_dests *dests = &s->migrate_dests; struct folio *folio; spinlock_t *ptl; - pmd_t pmde; - - ptl = pmd_lock(walk->mm, pmd); - pmde = pmdp_get(pmd); - - if (!pmd_present(pmde) || !pmd_trans_huge(pmde)) - goto unlock; - - /* Tell page walk code to not split the PMD */ - walk->action = ACTION_CONTINUE; - - folio = damon_get_folio(pmd_pfn(pmde)); - if (!folio) - goto unlock; - - if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) - goto put_folio; - - damos_va_migrate_dests_add(folio, walk->vma, addr, dests, - migration_lists); - -put_folio: - folio_put(folio); -unlock: - spin_unlock(ptl); - return 0; -} -#else -#define damos_va_migrate_pmd_entry NULL -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + pte_t *start_pte, *pte, ptent; + int nr; -static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct damos_va_migrate_private *priv = walk->private; - struct list_head *migration_lists = priv->migration_lists; - struct damos *s = priv->scheme; - struct damos_migrate_dests *dests = &s->migrate_dests; - struct folio *folio; - pte_t ptent; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptl = pmd_trans_huge_lock(pmd, walk->vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - ptent = ptep_get(pte); - if 
(pte_none(ptent) || !pte_present(ptent)) + if (!pmd_present(pmde)) + goto huge_out; + folio = vm_normal_folio_pmd(walk->vma, addr, pmde); + if (!folio) + goto huge_out; + if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd)) + goto huge_out; + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); +huge_out: + spin_unlock(ptl); return 0; + } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - folio = damon_get_folio(pte_pfn(ptent)); - if (!folio) + start_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) return 0; - if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) - goto put_folio; - - damos_va_migrate_dests_add(folio, walk->vma, addr, dests, - migration_lists); + for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; + ptent = ptep_get(pte); -put_folio: - folio_put(folio); + if (pte_none(ptent) || !pte_present(ptent)) + continue; + folio = vm_normal_folio(walk->vma, addr, ptent); + if (!folio) + continue; + if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) + return 0; + damos_va_migrate_dests_add(folio, walk->vma, addr, dests, + migration_lists); + nr = folio_nr_pages(folio); + } + pte_unmap_unlock(start_pte, ptl); return 0; } @@ -850,7 +814,7 @@ static unsigned long damos_va_migrate(struct damon_target *target, struct damos_migrate_dests *dests = &s->migrate_dests; struct mm_walk_ops walk_ops = { .pmd_entry = damos_va_migrate_pmd_entry, - .pte_entry = damos_va_migrate_pte_entry, + .pte_entry = NULL, .walk_lock = PGWALK_RDLOCK, }; @@ -910,13 +874,10 @@ static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr, int nr; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(*pmd)) { - pmd_t pmde; + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + pmd_t pmde = pmdp_get(pmd); - ptl = pmd_trans_huge_lock(pmd, vma); - if (!ptl) - return 0; - pmde = pmdp_get(pmd); if (!pmd_present(pmde)) goto huge_unlock; diff --git a/mm/debug.c b/mm/debug.c index 64ddb0c4b4be..77fa8fe1d641 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,7 +67,7 @@ static const char *page_type_name(unsigned int page_type) return page_type_names[i]; } -static void __dump_folio(struct folio *folio, struct page *page, +static void __dump_folio(const struct folio *folio, const struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); @@ -327,7 +327,7 @@ static int __init setup_vm_debug(char *str) while (*str) { switch (tolower(*str)) { - case'p': + case 'p': __page_init_poisoning = true; break; default: diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 830107b6dd08..ae9b9310d96f 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -25,14 +25,14 @@ #include <linux/random.h> #include <linux/spinlock.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/start_kernel.h> #include <linux/sched/mm.h> #include <linux/io.h> #include <linux/vmalloc.h> +#include <linux/pgalloc.h> #include <asm/cacheflush.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> /* @@ -74,6 +74,7 @@ struct pgtable_debug_args { unsigned long fixed_pte_pfn; swp_entry_t swp_entry; + swp_entry_t leaf_entry; }; static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) @@ -102,6 +103,12 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte, args->vma)))); WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte)))); 
WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); + + WARN_ON(!pte_dirty(pte_mkwrite_novma(pte_mkdirty(pte)))); + WARN_ON(pte_dirty(pte_mkwrite_novma(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkdirty(pte_mkwrite_novma(pte)))); + WARN_ON(!pte_write(pte_mkwrite_novma(pte_wrprotect(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite_novma(pte)))); } static void __init pte_advanced_tests(struct pgtable_debug_args *args) @@ -195,6 +202,13 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd, args->vma)))); WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd)))); WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd)))); + + WARN_ON(!pmd_dirty(pmd_mkwrite_novma(pmd_mkdirty(pmd)))); + WARN_ON(pmd_dirty(pmd_mkwrite_novma(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkdirty(pmd_mkwrite_novma(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite_novma(pmd_wrprotect(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite_novma(pmd)))); + /* * A huge page does not point to next level page table * entry. Hence this must qualify as pmd_bad(). @@ -690,7 +704,7 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; pr_debug("Validating PTE soft dirty\n"); @@ -701,14 +715,16 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte; + softleaf_t entry; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; pr_debug("Validating PTE swap soft dirty\n"); pte = swp_entry_to_pte(args->swp_entry); - WARN_ON(!is_swap_pte(pte)); + entry = softleaf_from_pte(pte); + WARN_ON(!softleaf_is_swap(entry)); WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte))); WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte))); } @@ -718,7 +734,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; if (!has_transparent_hugepage()) @@ -730,65 +746,73 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } -static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) +static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) || - !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) + if (!pgtable_supports_soft_dirty() || + !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) return; if (!has_transparent_hugepage()) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = swp_entry_to_pmd(args->swp_entry); - WARN_ON(!is_swap_pmd(pmd)); + pmd = swp_entry_to_pmd(args->leaf_entry); + WARN_ON(!pmd_is_huge(pmd)); + WARN_ON(!pmd_is_valid_softleaf(pmd)); WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } -static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { - swp_entry_t entry, 
entry2; + swp_entry_t entry; + softleaf_t softleaf; pte_t pte; pr_debug("Validating PTE swap exclusive\n"); entry = args->swp_entry; pte = swp_entry_to_pte(entry); + softleaf = softleaf_from_pte(pte); + WARN_ON(pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(!softleaf_is_swap(softleaf)); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); pte = pte_swp_mkexclusive(pte); + softleaf = softleaf_from_pte(pte); + WARN_ON(!pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); + WARN_ON(!softleaf_is_swap(softleaf)); WARN_ON(pte_swp_soft_dirty(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); pte = pte_swp_clear_exclusive(pte); + softleaf = softleaf_from_pte(pte); + WARN_ON(pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(!softleaf_is_swap(softleaf)); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); } static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t arch_entry; + softleaf_t entry; pte_t pte1, pte2; pr_debug("Validating PTE swap\n"); pte1 = swp_entry_to_pte(args->swp_entry); - WARN_ON(!is_swap_pte(pte1)); + entry = softleaf_from_pte(pte1); + + WARN_ON(!softleaf_is_swap(entry)); arch_entry = __pte_to_swp_entry(pte1); pte2 = __swp_entry_to_pte(arch_entry); @@ -796,7 +820,7 @@ static void __init pte_swap_tests(struct pgtable_debug_args *args) } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -static void __init pmd_swap_tests(struct pgtable_debug_args *args) +static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { swp_entry_t arch_entry; pmd_t pmd1, pmd2; @@ -805,21 +829,22 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap\n"); - pmd1 = swp_entry_to_pmd(args->swp_entry); - WARN_ON(!is_swap_pmd(pmd1)); + pmd1 = swp_entry_to_pmd(args->leaf_entry); + WARN_ON(!pmd_is_huge(pmd1)); + WARN_ON(!pmd_is_valid_softleaf(pmd1)); arch_entry = __pmd_to_swp_entry(pmd1); pmd2 = __swp_entry_to_pmd(arch_entry); WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1))); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } +static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init swap_migration_tests(struct pgtable_debug_args *args) { struct page *page; - swp_entry_t swp; + softleaf_t entry; if (!IS_ENABLED(CONFIG_MIGRATION)) return; @@ -842,17 +867,17 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) * be locked, otherwise it stumbles upon a BUG_ON(). 
*/ __SetPageLocked(page); - swp = make_writable_migration_entry(page_to_pfn(page)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(!is_writable_migration_entry(swp)); + entry = make_writable_migration_entry(page_to_pfn(page)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(!softleaf_is_migration_write(entry)); - swp = make_readable_migration_entry(swp_offset(swp)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_writable_migration_entry(swp)); + entry = make_readable_migration_entry(swp_offset(entry)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(softleaf_is_migration_write(entry)); - swp = make_readable_migration_entry(page_to_pfn(page)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_writable_migration_entry(swp)); + entry = make_readable_migration_entry(page_to_pfn(page)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(softleaf_is_migration_write(entry)); __ClearPageLocked(page); } @@ -1204,9 +1229,11 @@ static int __init init_args(struct pgtable_debug_args *args) init_fixed_pfns(args); /* See generic_max_swapfile_size(): probe the maximum offset */ - max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); - /* Create a swp entry with all possible bits set */ - args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + max_swap_offset = swp_offset(softleaf_from_pte(softleaf_to_pte(swp_entry(0, ~0UL)))); + /* Create a swp entry with all possible bits set while still being swap. */ + args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); + /* Create a non-present migration entry. */ + args->leaf_entry = make_writable_migration_entry(~0UL); /* * Allocate (huge) pages because some of the tests need to access @@ -1296,12 +1323,12 @@ static int __init debug_vm_pgtable(void) pte_soft_dirty_tests(&args); pmd_soft_dirty_tests(&args); pte_swap_soft_dirty_tests(&args); - pmd_swap_soft_dirty_tests(&args); + pmd_leaf_soft_dirty_tests(&args); pte_swap_exclusive_tests(&args); pte_swap_tests(&args); - pmd_swap_tests(&args); + pmd_softleaf_tests(&args); swap_migration_tests(&args); diff --git a/mm/fadvise.c b/mm/fadvise.c index 588fe76c5a14..67028e30aa91 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + filemap_flush_range(mapping, offset, endbyte); /* * First and last FULL page! 
Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..ebd75684cb0a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,7 +21,7 @@ #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -48,7 +48,8 @@ #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> #include <linux/sysctl.h> -#include <asm/pgalloc.h> +#include <linux/pgalloc.h> + #include <asm/tlbflush.h> #include "internal.h" @@ -181,13 +182,13 @@ static void filemap_unaccount_folio(struct address_space *mapping, nr = folio_nr_pages(folio); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) @@ -256,7 +257,7 @@ void filemap_remove_folio(struct folio *folio) __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); @@ -335,7 +336,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) @@ -366,83 +367,60 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. - */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? 
*nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. + * This is a data integrity operation that waits upon dirty or in writeback + * pages. * * Return: %0 on success, negative error code otherwise. */ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); -} - -static inline int __filemap_fdatawrite(struct address_space *mapping, - int sync_mode) +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { - return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); + return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_fdatawrite(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_ALL); + return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_fdatawrite); -int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end) -{ - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); -} -EXPORT_SYMBOL(filemap_fdatawrite_range); - /** - * filemap_fdatawrite_range_kick - start writeback on a range + * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback @@ -452,12 +430,12 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * * Return: %0 on success, negative error code otherwise. 
*/ -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); + return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } -EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); +EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush @@ -470,10 +448,22 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); */ int filemap_flush(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_NONE); + return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); +/* + * Start writeback on @nr_to_write pages from @mapping. No one but the existing + * btrfs caller should be using this. Talk to linux-mm if you think adding a + * new caller is a good idea. + */ +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) +{ + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); +} +EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); + /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check @@ -691,8 +681,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. @@ -794,8 +783,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); @@ -843,13 +831,13 @@ void replace_page_cache_folio(struct folio *old, struct folio *new) old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. 
*/ if (!folio_test_hugetlb(old)) - __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) - __lruvec_stat_add_folio(new, NR_FILE_PAGES); + lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) - __lruvec_stat_sub_folio(old, NR_SHMEM); + lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) - __lruvec_stat_add_folio(new, NR_SHMEM); + lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); @@ -932,9 +920,9 @@ noinline int __filemap_add_folio(struct address_space *mapping, /* hugetlb pages do not participate in page cache accounting */ if (!huge) { - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } @@ -1002,11 +990,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { int n; struct folio *folio; + if (policy) + return folio_alloc_mpol_noprof(gfp, order, policy, + NO_INTERLEAVE_INDEX, numa_node_id()); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { @@ -1401,7 +1394,7 @@ repeat: * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; @@ -1410,7 +1403,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = pfn_swap_entry_folio(entry); + struct folio *folio = softleaf_to_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { @@ -1923,11 +1916,12 @@ out: } /** - * __filemap_get_folio - Find and get a reference to a folio. + * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. + * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * @@ -1938,8 +1932,8 @@ out: * * Return: The found folio or an ERR_PTR() otherwise. 
*/ -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp) +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; @@ -2009,7 +2003,7 @@ no_page: err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order); + folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; @@ -2056,7 +2050,7 @@ no_page: folio_clear_dropbehind(folio); return folio; } -EXPORT_SYMBOL(__filemap_get_folio); +EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) @@ -2366,6 +2360,64 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); +/** + * filemap_get_folios_dirty - Get a batch of dirty folios + * @mapping: The address_space to search + * @start: The starting folio index + * @end: The final folio index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except + * the returned folios are presumed to be dirty or undergoing writeback. Dirty + * state is presumed because we don't block on folio lock nor want to miss + * folios. Callers that need to can recheck state upon locking the folio. + * + * This may not return all dirty folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. + */ +unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + if (xa_is_value(folio)) + continue; + if (folio_trylock(folio)) { + bool clean = !folio_test_dirty(folio) && + !folio_test_writeback(folio); + folio_unlock(folio); + if (clean) { + folio_put(folio); + continue; + } + } + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no folio beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a folio at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request.
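Illustrative usage sketch (not part of the patch): a minimal, hypothetical caller of the new filemap_get_folios_dirty() helper that rechecks dirty state under the folio lock, as the kernel-doc above advises. The wrapper name example_write_dirty_range() and the elided per-folio writeback step are assumptions, not anything taken from this series.

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

/* Walk [start, end] and hand genuinely dirty folios to some writeback step. */
static void example_write_dirty_range(struct address_space *mapping,
				      pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios_dirty(mapping, &start, end, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			folio_lock(folio);
			/* Dirtiness was only presumed; recheck under the lock. */
			if (folio->mapping == mapping && folio_test_dirty(folio)) {
				/* queue or perform writeback of this folio here */
			}
			folio_unlock(folio);
		}
		/* Drops the lookup references and reinitialises the batch. */
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

The loop terminates because each call advances @start past the folios it returned (or past @end) and yields an empty batch once no dirty or writeback folios remain in the range.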
Imagine the worst scenario: @@ -2551,7 +2603,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) @@ -3253,11 +3305,47 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; + bool force_thp_readahead = false; unsigned short mmap_miss; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ - if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) + force_thp_readahead = true; + + if (!force_thp_readahead) { + /* + * If we don't want any read-ahead, don't bother. + * VM_EXEC case below is already intended for random access. + */ + if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) + return fpin; + + if (!ra->ra_pages) + return fpin; + + if (vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + page_cache_sync_ra(&ractl, ra->ra_pages); + return fpin; + } + } + + if (!(vm_flags & VM_SEQ_READ)) { + /* Avoid banging the cache line if not needed */ + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss < MMAP_LOTSAMISS * 10) + WRITE_ONCE(ra->mmap_miss, ++mmap_miss); + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (mmap_miss > MMAP_LOTSAMISS) + return fpin; + } + + if (force_thp_readahead) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; @@ -3272,34 +3360,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) page_cache_ra_order(&ractl, ra); return fpin; } -#endif - - /* - * If we don't want any read-ahead, don't bother. VM_EXEC case below is - * already intended for random access. - */ - if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) - return fpin; - if (!ra->ra_pages) - return fpin; - - if (vm_flags & VM_SEQ_READ) { - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_ra(&ractl, ra->ra_pages); - return fpin; - } - - /* Avoid banging the cache line if not needed */ - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss < MMAP_LOTSAMISS * 10) - WRITE_ONCE(ra->mmap_miss, ++mmap_miss); - - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (mmap_miss > MMAP_LOTSAMISS) - return fpin; if (vm_flags & VM_EXEC) { /* @@ -3681,8 +3741,10 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned short *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss, + pgoff_t file_end) { + struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3691,12 +3753,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, unsigned long addr0; /* - * Map the large folio fully where possible. 
+ * Map the large folio fully where possible: * - * The folio must not cross VMA or page table boundary. + * - The folio is fully within size of the file or belong + * to shmem/tmpfs; + * - The folio doesn't cross VMA boundary; + * - The folio doesn't cross page table boundary; */ addr0 = addr - start * PAGE_SIZE; - if (folio_within_vma(folio, vmf->vma) && + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + folio_within_vma(folio, vmf->vma) && (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { vmf->pte -= start; page -= start; @@ -3817,7 +3883,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio) goto out; - if (filemap_map_pmd(vmf, folio, start_pgoff)) { + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + end_pgoff = min(end_pgoff, file_end); + + /* + * Do not allow to map with PMD across i_size to preserve + * SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3830,10 +3907,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } - file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; - if (end_pgoff > file_end) - end_pgoff = file_end; - folio_type = mm_counter_file(folio); do { unsigned long end; @@ -3850,7 +3923,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss); + nr_pages, &rss, &mmap_miss, file_end); folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); @@ -3983,8 +4056,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, - mapping_min_folio_order(mapping)); + folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); @@ -4457,16 +4529,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ - if (flush) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - filemap_fdatawrite_wbc(mapping, &wbc); - } + if (flush) + filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. 
*/ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); @@ -4545,7 +4609,7 @@ static void filemap_cachestat(struct address_space *mapping, swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ - if (non_swap_entry(swp)) + if (!softleaf_is_swap(swp)) goto resched; /* @@ -950,7 +950,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; pudp = pud_offset(p4dp, address); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); if (pud_leaf(pud)) { @@ -975,7 +975,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, p4d_t *p4dp, p4d; p4dp = p4d_offset(pgdp, address); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); BUILD_BUG_ON(p4d_leaf(p4d)); if (!p4d_present(p4d) || p4d_bad(p4d)) @@ -2710,7 +2710,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * *) ptes can be read atomically by the architecture. * - * *) valid user addesses are below TASK_MAX_SIZE + * *) valid user addresses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. * @@ -3060,7 +3060,7 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, pudp = pud_offset_lockless(p4dp, p4d, addr); do { - pud_t pud = READ_ONCE(*pudp); + pud_t pud = pudp_get(pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) @@ -3086,7 +3086,7 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { - p4d_t p4d = READ_ONCE(*p4dp); + p4d_t p4d = p4dp_get(p4dp); next = p4d_addr_end(addr, end); if (!p4d_present(p4d)) @@ -3108,7 +3108,7 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, pgdp = pgd_offset(current->mm, addr); do { - pgd_t pgd = READ_ONCE(*pgdp); + pgd_t pgd = pgdp_get(pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) @@ -18,7 +18,7 @@ #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/pagemap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/hugetlb.h> #include <linux/memremap.h> #include <linux/sched/mm.h> @@ -244,7 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t pfn_req_flags = *hmm_pfn; uint64_t new_pfn_flags = 0; - if (pte_none_mostly(pte)) { + /* + * Any other marker than a UFFD WP marker will result in a fault error + * that will be correctly handled, so we need only check for UFFD WP + * here. + */ + if (pte_none(pte) || pte_is_uffd_wp_marker(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) @@ -253,19 +258,19 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); /* * Don't fault in device private pages owned by the caller, * just report the PFN. 
*/ - if (is_device_private_entry(entry) && - page_pgmap(pfn_swap_entry_to_page(entry))->owner == + if (softleaf_is_device_private(entry) && + page_pgmap(softleaf_to_page(entry))->owner == range->dev_private_owner) { cpu_flags = HMM_PFN_VALID; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; - new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + new_pfn_flags = softleaf_to_pfn(entry) | cpu_flags; goto out; } @@ -274,16 +279,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (!required_fault) goto out; - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) goto fault; - if (is_device_private_entry(entry)) + if (softleaf_is_device_private(entry)) goto fault; - if (is_device_exclusive_entry(entry)) + if (softleaf_is_device_exclusive(entry)) goto fault; - if (is_migration_entry(entry)) { + if (softleaf_is_migration(entry)) { pte_unmap(ptep); hmm_vma_walk->last = addr; migration_entry_wait(walk->mm, pmdp, addr); @@ -334,19 +339,19 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long npages = (end - start) >> PAGE_SHIFT; + const softleaf_t entry = softleaf_from_pmd(pmd); unsigned long addr = start; - swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned int required_fault; - if (is_device_private_entry(entry) && - pfn_swap_entry_folio(entry)->pgmap->owner == + if (softleaf_is_device_private(entry) && + softleaf_to_folio(entry)->pgmap->owner == range->dev_private_owner) { unsigned long cpu_flags = HMM_PFN_VALID | hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); - unsigned long pfn = swp_offset_pfn(entry); + unsigned long pfn = softleaf_to_pfn(entry); unsigned long i; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; /* @@ -365,7 +370,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); if (required_fault) { - if (is_device_private_entry(entry)) + if (softleaf_is_device_private(entry)) return hmm_vma_fault(addr, end, required_fault, walk); else return -EFAULT; @@ -407,7 +412,7 @@ again: if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, -1, walk); - if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { + if (thp_migration_supported() && pmd_is_migration_entry(pmd)) { if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); @@ -491,7 +496,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, /* Normally we don't want to split the huge page */ walk->action = ACTION_CONTINUE; - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (!pud_present(pud)) { spin_unlock(ptl); return hmm_vma_walk_hole(start, end, -1, walk); @@ -811,7 +816,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, break; case PCI_P2PDMA_MAP_BUS_ADDR: pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; - return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr); default: return DMA_MAPPING_ERROR; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d1b74950332..f7c565f11a98 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,11 +37,11 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include 
<linux/compat.h> +#include <linux/pgalloc.h> #include <linux/pgalloc_tag.h> #include <linux/pagewalk.h> #include <asm/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" #include "swap.h" @@ -214,7 +214,8 @@ retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; - zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & + ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); @@ -1076,28 +1077,103 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } +static struct deferred_split *split_queue_node(int nid) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + + return &pgdata->deferred_split_queue; +} + #ifdef CONFIG_MEMCG static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct mem_cgroup *memcg = folio_memcg(folio); - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + if (mem_cgroup_disabled()) + return NULL; + if (split_queue_node(folio_nid(folio)) == queue) + return NULL; + return container_of(queue, struct mem_cgroup, deferred_split_queue); +} - if (memcg) - return &memcg->deferred_split_queue; - else - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return memcg ? &memcg->deferred_split_queue : split_queue_node(nid); } #else static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + return NULL; +} - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return split_queue_node(nid); } #endif +static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) +{ + struct deferred_split *queue; + +retry: + queue = memcg_split_queue(nid, memcg); + spin_lock(&queue->split_queue_lock); + /* + * There is a period between setting memcg to dying and reparenting + * deferred split queue, and during this period the THPs in the deferred + * split queue will be hidden from the shrinker side. 
+ */ + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock(&queue->split_queue_lock); + memcg = parent_mem_cgroup(memcg); + goto retry; + } + + return queue; +} + +static struct deferred_split * +split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) +{ + struct deferred_split *queue; + +retry: + queue = memcg_split_queue(nid, memcg); + spin_lock_irqsave(&queue->split_queue_lock, *flags); + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock_irqrestore(&queue->split_queue_lock, *flags); + memcg = parent_mem_cgroup(memcg); + goto retry; + } + + return queue; +} + +static struct deferred_split *folio_split_queue_lock(struct folio *folio) +{ + return split_queue_lock(folio_nid(folio), folio_memcg(folio)); +} + +static struct deferred_split * +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) +{ + return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); +} + +static inline void split_queue_unlock(struct deferred_split *queue) +{ + spin_unlock(&queue->split_queue_lock); +} + +static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, + unsigned long flags) +{ + spin_unlock_irqrestore(&queue->split_queue_lock, flags); +} + static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) @@ -1126,7 +1202,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* @@ -1163,7 +1239,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add if (ret) return ret; - return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } @@ -1217,7 +1293,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, return folio; } -static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr) { pmd_t entry; @@ -1228,6 +1304,13 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, folio_add_lru_vma(folio, vma); set_pmd_at(vma->vm_mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, haddr, pmd); + deferred_split_folio(folio, false); +} + +static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + map_anon_folio_pmd_nopf(folio, pmd, vma, haddr); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); @@ -1270,9 +1353,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); - deferred_split_folio(folio, false); spin_unlock(vmf->ptl); } @@ -1287,6 +1369,44 @@ release: } +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + spinlock_t *ptl; + softleaf_t entry; + struct page *page; + struct folio *folio; + + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + + ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if 
(unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { + spin_unlock(ptl); + return 0; + } + + entry = softleaf_from_pmd(vmf->orig_pmd); + page = softleaf_to_page(entry); + folio = page_folio(page); + vmf->page = page; + vmf->pte = NULL; + if (folio_trylock(folio)) { + folio_get(folio); + spin_unlock(ptl); + ret = page_pgmap(page)->ops->migrate_to_ram(vmf); + folio_unlock(folio); + folio_put(folio); + } else { + spin_unlock(ptl); + } + + return ret; +} + /* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available @@ -1641,17 +1761,86 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +/** + * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) + * @vma: The VMA covering @addr + * @addr: The virtual address + * @pmd: pmd pointer into the page table mapping @addr + * @write: Whether it's a write access + * + * Return: whether the pmd entry is changed + */ +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write) { - pmd_t _pmd; + pmd_t entry; - _pmd = pmd_mkyoung(*pmd); + entry = pmd_mkyoung(*pmd); if (write) - _pmd = pmd_mkdirty(_pmd); + entry = pmd_mkdirty(entry); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, write)) + pmd, entry, write)) { update_mmu_cache_pmd(vma, addr, pmd); + return true; + } + + return false; +} + +static void copy_huge_non_present_pmd( + struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t pmd, pgtable_t pgtable) +{ + softleaf_t entry = softleaf_from_pmd(pmd); + struct folio *src_folio; + + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd)); + + if (softleaf_is_migration_write(entry) || + softleaf_is_migration_read_exclusive(entry)) { + entry = make_readable_migration_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } else if (softleaf_is_device_private(entry)) { + /* + * For device private entries, since there are no + * read exclusive entries, writable = !readable + */ + if (softleaf_is_device_private_write(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + + src_folio = softleaf_to_folio(entry); + VM_WARN_ON(!folio_test_large(src_folio)); + + folio_get(src_folio); + /* + * folio_try_dup_anon_rmap_pmd does not fail for + * device private entries. 
+ */ + folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, + dst_vma, src_vma); + } + + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1699,31 +1888,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - if (unlikely(is_swap_pmd(pmd))) { - swp_entry_t entry = pmd_to_swp_entry(pmd); - - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (!is_readable_migration_entry(entry)) { - entry = make_readable_migration_entry( - swp_offset(entry)); - pmd = swp_entry_to_pmd(entry); - if (pmd_swp_soft_dirty(*src_pmd)) - pmd = pmd_swp_mksoft_dirty(pmd); - if (pmd_swp_uffd_wp(*src_pmd)) - pmd = pmd_swp_mkuffd_wp(pmd); - set_pmd_at(src_mm, addr, src_pmd, pmd); - } - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm_inc_nr_ptes(dst_mm); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - if (!userfaultfd_wp(dst_vma)) - pmd = pmd_swp_clear_uffd_wp(pmd); - set_pmd_at(dst_mm, addr, dst_pmd, pmd); + if (unlikely(thp_migration_supported() && + pmd_is_valid_softleaf(pmd))) { + copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, + dst_vma, src_vma, pmd, pgtable); ret = 0; goto out_unlock; } -#endif if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); @@ -1841,18 +2012,14 @@ unlock: } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void huge_pmd_set_accessed(struct vm_fault *vmf) +bool huge_pmd_set_accessed(struct vm_fault *vmf) { bool write = vmf->flags & FAULT_FLAG_WRITE; - vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) - goto unlock; - - touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); + return false; -unlock: - spin_unlock(vmf->ptl); + return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); } static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) @@ -1877,7 +2044,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) if (ret) goto release; (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); goto unlock; release: folio_put(folio); @@ -2113,7 +2280,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto out; } @@ -2211,15 +2378,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (thp_migration_supported()) { - swp_entry_t entry; + } else if (pmd_is_valid_softleaf(orig_pmd)) { + const softleaf_t entry = softleaf_from_pmd(orig_pmd); - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); - entry = pmd_to_swp_entry(orig_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(entry); flush_needed = 0; - } else - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); @@ -2239,6 +2406,12 @@ int 
zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_mark_accessed(folio); } + if (folio_is_device_private(folio)) { + folio_remove_rmap_pmd(folio, &folio->page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + folio_put(folio); + } + spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); @@ -2263,20 +2436,23 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, static pmd_t move_soft_dirty_pmd(pmd_t pmd) { -#ifdef CONFIG_MEM_SOFT_DIRTY - if (unlikely(is_pmd_migration_entry(pmd))) - pmd = pmd_swp_mksoft_dirty(pmd); - else if (pmd_present(pmd)) - pmd = pmd_mksoft_dirty(pmd); -#endif + if (pgtable_supports_soft_dirty()) { + if (unlikely(pmd_is_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); + } + return pmd; } static pmd_t clear_uffd_wp_pmd(pmd_t pmd) { + if (pmd_none(pmd)) + return pmd; if (pmd_present(pmd)) pmd = pmd_clear_uffd_wp(pmd); - else if (is_swap_pmd(pmd)) + else pmd = pmd_swp_clear_uffd_wp(pmd); return pmd; @@ -2333,6 +2509,42 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, return false; } +static void change_non_present_huge_pmd(struct mm_struct *mm, + unsigned long addr, pmd_t *pmd, bool uffd_wp, + bool uffd_wp_resolve) +{ + softleaf_t entry = softleaf_from_pmd(*pmd); + const struct folio *folio = softleaf_to_folio(entry); + pmd_t newpmd; + + VM_WARN_ON(!pmd_is_valid_softleaf(*pmd)); + if (softleaf_is_migration_write(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); + } else if (softleaf_is_device_private_write(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + } else { + newpmd = *pmd; + } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); +} + /* * Returns * - 0 if PMD could not be locked @@ -2361,42 +2573,14 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - if (is_swap_pmd(*pmd)) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = pfn_swap_entry_folio(entry); - pmd_t newpmd; - - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); - if (is_writable_migration_entry(entry)) { - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry(swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpmd = swp_entry_to_pmd(entry); - if (pmd_swp_soft_dirty(*pmd)) - newpmd = pmd_swp_mksoft_dirty(newpmd); - } else { - newpmd = *pmd; - } - - if (uffd_wp) - newpmd = pmd_swp_mkuffd_wp(newpmd); - else if (uffd_wp_resolve) - newpmd = pmd_swp_clear_uffd_wp(newpmd); - if (!pmd_same(*pmd, newpmd)) - set_pmd_at(mm, addr, pmd, newpmd); + if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) { + change_non_present_huge_pmd(mm, addr, pmd, uffd_wp, + uffd_wp_resolve); goto unlock; } -#endif if (prot_numa) { - struct folio *folio; - bool toptier; + /* * Avoid trapping faults against 
the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -2408,19 +2592,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - folio = pmd_folio(*pmd); - toptier = node_is_toptier(folio_nid(folio)); - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - toptier) + if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma, + vma_is_single_threaded_private(vma))) goto unlock; - - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, - jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical @@ -2533,7 +2707,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm pmd_t _dst_pmd, src_pmdval; struct page *src_page; struct folio *src_folio; - struct anon_vma *src_anon_vma; spinlock_t *src_ptl, *dst_ptl; pgtable_t src_pgtable; struct mmu_notifier_range range; @@ -2555,7 +2728,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm if (!pmd_trans_huge(src_pmdval)) { spin_unlock(src_ptl); - if (is_pmd_migration_entry(src_pmdval)) { + if (pmd_is_migration_entry(src_pmdval)) { pmd_migration_entry_wait(mm, &src_pmdval); return -EAGAIN; } @@ -2582,23 +2755,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - if (src_folio) { + if (src_folio) folio_lock(src_folio); - /* - * split_huge_page walks the anon_vma chain without the page - * lock. Serialize against it with the anon_vma lock, the page - * lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - err = -EAGAIN; - goto unlock_folio; - } - anon_vma_lock_write(src_anon_vma); - } else - src_anon_vma = NULL; - dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); if (unlikely(!pmd_same(*src_pmd, src_pmdval) || @@ -2643,11 +2802,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); - if (src_anon_vma) { - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); - } -unlock_folio: /* unblock rmap walks */ if (src_folio) folio_unlock(src_folio); @@ -2667,8 +2821,9 @@ unlock_folio: spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) + if (likely(pmd_is_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -2834,7 +2989,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; + bool soft_dirty, uffd_wp = false, young = false, write = false; bool anon_exclusive = false, dirty = false; unsigned long addr; pte_t *pte; @@ -2843,7 +2998,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); + + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2857,11 +3013,10 @@ static void 
__split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (!vma_is_dax(vma) && vma_is_special_huge(vma)) return; - if (unlikely(is_pmd_migration_entry(old_pmd))) { - swp_entry_t entry; + if (unlikely(pmd_is_migration_entry(old_pmd))) { + const softleaf_t old_entry = softleaf_from_pmd(old_pmd); - entry = pmd_to_swp_entry(old_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(old_entry); } else if (is_huge_zero_pmd(old_pmd)) { return; } else { @@ -2891,20 +3046,54 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - pmd_migration = is_pmd_migration_entry(*pmd); - if (unlikely(pmd_migration)) { - swp_entry_t entry; + if (pmd_is_migration_entry(*pmd)) { + softleaf_t entry; old_pmd = *pmd; - entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); - write = is_writable_migration_entry(entry); + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); + folio = page_folio(page); + + soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = softleaf_is_migration_write(entry); if (PageAnon(page)) - anon_exclusive = is_readable_exclusive_migration_entry(entry); - young = is_migration_entry_young(entry); - dirty = is_migration_entry_dirty(entry); + anon_exclusive = softleaf_is_migration_read_exclusive(entry); + young = softleaf_is_migration_young(entry); + dirty = softleaf_is_migration_dirty(entry); + } else if (pmd_is_device_private_entry(*pmd)) { + softleaf_t entry; + + old_pmd = *pmd; + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); + folio = page_folio(page); + soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = softleaf_is_device_private_write(entry); + anon_exclusive = PageAnonExclusive(page); + + /* + * Device private THP should be treated the same as regular + * folios w.r.t anon exclusive handling. See the comments for + * folio handling and anon_exclusive below. + */ + if (freeze && anon_exclusive && + folio_try_share_anon_rmap_pmd(folio, page)) + freeze = false; + if (!freeze) { + rmap_t rmap_flags = RMAP_NONE; + + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (anon_exclusive) + rmap_flags |= RMAP_EXCLUSIVE; + + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, + vma, haddr, rmap_flags); + } } else { /* * Up to this point the pmd is present and huge and userland has @@ -2988,11 +3177,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs. 
*/ - if (freeze || pmd_migration) { - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - swp_entry_t swp_entry; + if (freeze || pmd_is_migration_entry(old_pmd)) { + pte_t entry; + swp_entry_t swp_entry; + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -3011,7 +3200,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); + } + } else if (pmd_is_device_private_entry(old_pmd)) { + pte_t entry; + swp_entry_t swp_entry; + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + /* + * anon_exclusive was already propagated to the relevant + * pages corresponding to the pte entries when freeze + * is false. + */ + if (write) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page + i)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page + i)); + /* + * Young and dirty bits are not propagated via swp_entry + */ + entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } @@ -3038,7 +3253,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap(pte); - if (!pmd_migration) + if (!pmd_is_migration_entry(*pmd)) folio_remove_rmap_pmd(folio, page, vma); if (freeze) put_page(page); @@ -3051,7 +3266,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) + if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } @@ -3230,6 +3445,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); lockdep_assert_held(&lruvec->lru_lock); + if (folio_is_device_private(folio)) + return; + if (list) { /* page reclaim is reclaiming a huge page */ VM_WARN_ON(folio_test_lru(folio)); @@ -3263,6 +3481,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) caller_pins; } +static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) +{ + for (; nr_pages; page++, nr_pages--) + if (PageHWPoison(page)) + return true; + return false; +} + /* * It splits @folio into @new_order folios and copies the @folio metadata to * all the resulting folios. @@ -3270,17 +3496,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) static void __split_folio_to_order(struct folio *folio, int old_order, int new_order) { + /* Scan poisoned pages when splitting a poisoned folio into large folios */ + const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; long new_nr_pages = 1 << new_order; long nr_pages = 1 << old_order; long i; + folio_clear_has_hwpoisoned(folio); + + /* Check first new_nr_pages since the loop below skips them */ + if (handle_hwpoison && + page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) + folio_set_has_hwpoisoned(folio); /* * Skip the first new_nr_pages, since the new folio from them have all * the flags from the original folio. 
*/ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { struct page *new_head = &folio->page + i; - /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). @@ -3322,18 +3555,13 @@ static void __split_folio_to_order(struct folio *folio, int old_order, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); + if (handle_hwpoison && + page_range_has_hwpoisoned(new_head, new_nr_pages)) + folio_set_has_hwpoisoned(new_folio); + new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; - /* - * page->private should not be set in tail pages. Fix up and warn once - * if private is unexpectedly set. - */ - if (unlikely(new_folio->private)) { - VM_WARN_ON_ONCE_PAGE(true, new_head); - new_folio->private = NULL; - } - if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + i; @@ -3369,8 +3597,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order, ClearPageCompound(&folio->page); } -/* - * It splits an unmapped @folio to lower order smaller folios in two ways. +/** + * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in + * two ways: uniform split or non-uniform split. * @folio: the to-be-split folio * @new_order: the smallest order of the after split folios (since buddy * allocator like split generates folios with orders from @folio's @@ -3379,66 +3608,56 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * will be split until its order becomes @new_order. * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping - * @uniform_split: if the split is uniform or not (buddy allocator like split) + * @split_type: if the split is uniform or not (buddy allocator like split) * * * 1. uniform split: the given @folio into multiple @new_order small folios, * where all small folios have the same order. This is done when - * uniform_split is true. + * split_type is SPLIT_TYPE_UNIFORM. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half - * until the given @page's order becomes @new_order. This is done when - * uniform_split is false. + * until the given @folio's order becomes @new_order. This is done when + * split_type is SPLIT_TYPE_NON_UNIFORM. * * The high level flow for these two methods are: - * 1. uniform split: a single __split_folio_to_order() is called to split the - * @folio into @new_order, then we traverse all the resulting folios one by - * one in PFN ascending order and perform stats, unfreeze, adding to list, - * and file mapping index operations. - * 2. non-uniform split: in general, folio_order - @new_order calls to - * __split_folio_to_order() are made in a for loop to split the @folio - * to one lower order at a time. The resulting small folios are processed - * like what is done during the traversal in 1, except the one containing - * @page, which is split in next for loop. + * + * 1. uniform split: @xas is split with no expectation of failure and a single + * __split_folio_to_order() is called to split the @folio into @new_order + * along with stats update. + * 2. non-uniform split: folio_order - @new_order calls to + * __split_folio_to_order() are expected to be made in a for loop to split + * the @folio to one lower order at a time. The folio containing @split_at + * is split in each iteration. @xas is split into half in each iteration and + * can fail. 
A failed @xas split leaves split folios as is without merging + * them back. * * After splitting, the caller's folio reference will be transferred to the - * folio containing @page. The caller needs to unlock and/or free after-split - * folios if necessary. + * folio containing @split_at. The caller needs to unlock and/or free + * after-split folios if necessary. * - * For !uniform_split, when -ENOMEM is returned, the original folio might be - * split. The caller needs to check the input folio. + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, - struct address_space *mapping, bool uniform_split) + struct address_space *mapping, enum split_type split_type) { - int order = folio_order(folio); - int start_order = uniform_split ? new_order : order - 1; - bool stop_split = false; - struct folio *next; + const bool is_anon = folio_test_anon(folio); + int old_order = folio_order(folio); + int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1; int split_order; - int ret = 0; - - if (folio_test_anon(folio)) - mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - - folio_clear_has_hwpoisoned(folio); /* * split to new_order one order at a time. For uniform split, * folio is split to new_order directly. */ for (split_order = start_order; - split_order >= new_order && !stop_split; + split_order >= new_order; split_order--) { - struct folio *end_folio = folio_next(folio); - int old_order = folio_order(folio); - struct folio *new_folio; + int nr_new_folios = 1UL << (old_order - split_order); /* order-1 anonymous folio is not supported */ - if (folio_test_anon(folio) && split_order == 1) - continue; - if (uniform_split && split_order != new_order) + if (is_anon && split_order == 1) continue; if (mapping) { @@ -3447,79 +3666,81 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, * irq is disabled to allocate enough memory, whereas * non-uniform split can handle ENOMEM. */ - if (uniform_split) + if (split_type == SPLIT_TYPE_UNIFORM) xas_split(xas, folio, old_order); else { xas_set_order(xas, folio->index, split_order); xas_try_split(xas, folio, old_order); - if (xas_error(xas)) { - ret = xas_error(xas); - stop_split = true; - } + if (xas_error(xas)) + return xas_error(xas); } } - if (!stop_split) { - folio_split_memcg_refs(folio, old_order, split_order); - split_page_owner(&folio->page, old_order, split_order); - pgalloc_tag_split(folio, old_order, split_order); + folio_split_memcg_refs(folio, old_order, split_order); + split_page_owner(&folio->page, old_order, split_order); + pgalloc_tag_split(folio, old_order, split_order); + __split_folio_to_order(folio, old_order, split_order); - __split_folio_to_order(folio, old_order, split_order); + if (is_anon) { + mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1); + mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios); } - /* - * Iterate through after-split folios and update folio stats. - * But in buddy allocator like split, the folio - * containing the specified page is skipped until its order - * is new_order, since the folio will be worked on in next - * iteration. + * If uniform split, the process is complete. + * If non-uniform, continue splitting the folio at @split_at + * as long as the next @split_order is >= @new_order. 
*/ - for (new_folio = folio; new_folio != end_folio; new_folio = next) { - next = folio_next(new_folio); - /* - * for buddy allocator like split, new_folio containing - * @split_at page could be split again, thus do not - * change stats yet. Wait until new_folio's order is - * @new_order or stop_split is set to true by the above - * xas_split() failure. - */ - if (new_folio == page_folio(split_at)) { - folio = new_folio; - if (split_order != new_order && !stop_split) - continue; - } - if (folio_test_anon(new_folio)) - mod_mthp_stat(folio_order(new_folio), - MTHP_STAT_NR_ANON, 1); - } + folio = page_folio(split_at); + old_order = split_order; } - return ret; + return 0; } -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns) { if (folio_test_anon(folio)) { /* order-1 is not supported for anonymous THP. */ VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); - return new_order != 1; - } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - /* - * No split if the file system does not support large folio. - * Note that we might still have THPs in such mappings due to - * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping - * does not actually support large folios properly. - */ - VM_WARN_ONCE(warns, - "Cannot split file folio to non-0 order"); - return false; + if (new_order == 1) + return false; + } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) { + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { + /* + * We can always split a folio down to a single page + * (new_order == 0) uniformly. + * + * For any other scenario + * a) uniform split targeting a large folio + * (new_order > 0) + * b) any non-uniform split + * we must confirm that the file system supports large + * folios. + * + * Note that we might still have THPs in such + * mappings, which are created by khugepaged when + * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that + * case, the mapping does not actually support large + * folios properly. + */ + VM_WARN_ONCE(warns, + "Cannot split file folio to non-0 order"); + return false; + } } - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) { + /* + * A swapcache folio can only be split to order 0. + * + * non-uniform split creates after-split folios with orders from + * folio_order(folio) - 1 to new_order, making it not suitable for any + * swapcache folio split. Only uniform split to order-0 can be used + * here. 
+ */ + if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) { VM_WARN_ONCE(warns, "Cannot split swapcache folio to non-0 order"); return false; @@ -3528,40 +3749,160 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, return true; } -/* See comments in non_uniform_split_supported() */ -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) +static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, + struct page *split_at, struct xa_state *xas, + struct address_space *mapping, bool do_lru, + struct list_head *list, enum split_type split_type, + pgoff_t end, int *nr_shmem_dropped, int extra_pins) { - if (folio_test_anon(folio)) { - VM_WARN_ONCE(warns && new_order == 1, - "Cannot split to order-1 folio"); - return new_order != 1; - } else if (new_order) { - if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - VM_WARN_ONCE(warns, - "Cannot split file folio to non-0 order"); - return false; + struct folio *end_folio = folio_next(folio); + struct folio *new_folio, *next; + int old_order = folio_order(folio); + int ret = 0; + struct deferred_split *ds_queue; + + VM_WARN_ON_ONCE(!mapping && end); + /* Prevent deferred_split_scan() touching ->_refcount */ + ds_queue = folio_split_queue_lock(folio); + if (folio_ref_freeze(folio, 1 + extra_pins)) { + struct swap_cluster_info *ci = NULL; + struct lruvec *lruvec; + int expected_refs; + + if (old_order > 1) { + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); + } + if (folio_test_partially_mapped(folio)) { + folio_clear_partially_mapped(folio); + mod_mthp_stat(old_order, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } } - } + split_queue_unlock(ds_queue); + if (mapping) { + int nr = folio_nr_pages(folio); - if (new_order && folio_test_swapcache(folio)) { - VM_WARN_ONCE(warns, - "Cannot split swapcache folio to non-0 order"); - return false; + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { + if (folio_test_swapbacked(folio)) { + lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } + } + } + + if (folio_test_swapcache(folio)) { + if (mapping) { + VM_WARN_ON_ONCE_FOLIO(mapping, folio); + return -EINVAL; + } + + ci = swap_cluster_get_and_lock(folio); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + if (do_lru) + lruvec = folio_lruvec_lock(folio); + + ret = __split_unmapped_folio(folio, new_order, split_at, xas, + mapping, split_type); + + /* + * Unfreeze after-split folios and put them back to the right + * list. @folio should be kept frozen until page cache + * entries are updated with all the other after-split folios + * to prevent others seeing stale page cache entries. + * As a result, new_folio starts from the next folio of + * @folio. 
+ */ + for (new_folio = folio_next(folio); new_folio != end_folio; + new_folio = next) { + unsigned long nr_pages = folio_nr_pages(new_folio); + + next = folio_next(new_folio); + + zone_device_private_split_cb(folio, new_folio); + + expected_refs = folio_expected_ref_count(new_folio) + 1; + folio_ref_unfreeze(new_folio, expected_refs); + + if (do_lru) + lru_add_split_folio(folio, new_folio, lruvec, list); + + /* + * Anonymous folio with swap cache. + * NOTE: shmem in swap cache is not supported yet. + */ + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); + continue; + } + + /* Anonymous folio without swap cache */ + if (!mapping) + continue; + + /* Add the new folio to the page cache. */ + if (new_folio->index < end) { + __xa_store(&mapping->i_pages, new_folio->index, + new_folio, 0); + continue; + } + + VM_WARN_ON_ONCE(!nr_shmem_dropped); + /* Drop folio beyond EOF: ->index >= end */ + if (shmem_mapping(mapping) && nr_shmem_dropped) + *nr_shmem_dropped += nr_pages; + else if (folio_test_clear_dirty(new_folio)) + folio_account_cleaned( + new_folio, inode_to_wb(mapping->host)); + __filemap_remove_folio(new_folio, NULL); + folio_put_refs(new_folio, nr_pages); + } + + zone_device_private_split_cb(folio, NULL); + /* + * Unfreeze @folio only after all page cache entries, which + * used to point to it, have been updated with new folios. + * Otherwise, a parallel folio_try_get() can grab @folio + * and its caller can see stale page cache entries. + */ + expected_refs = folio_expected_ref_count(folio) + 1; + folio_ref_unfreeze(folio, expected_refs); + + if (do_lru) + unlock_page_lruvec(lruvec); + + if (ci) + swap_cluster_unlock(ci); + } else { + split_queue_unlock(ds_queue); + return -EAGAIN; } - return true; + return ret; } -/* - * __folio_split: split a folio at @split_at to a @new_order folio +/** + * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL - * @uniform_split: perform uniform split or not (non-uniform split) + * @split_type: perform uniform split or not (non-uniform split) * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and @@ -3572,25 +3913,24 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * 1. for uniform split, @lock_at points to one of @folio's subpages; * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. 
* - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, bool uniform_split) + struct list_head *list, enum split_type split_type) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; - int order = folio_order(folio); + int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; int remap_flags = 0; int extra_pins, ret; - pgoff_t end; + pgoff_t end = 0; bool is_hzp; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); @@ -3599,14 +3939,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL; - if (new_order >= folio_order(folio)) - return -EINVAL; + /* + * Folios that just got truncated cannot get split. Signal to the + * caller that there was a race. + * + * TODO: this will also currently refuse shmem folios that are in the + * swapcache. + */ + if (!is_anon && !folio->mapping) + return -EBUSY; - if (uniform_split && !uniform_split_supported(folio, new_order, true)) + if (new_order >= old_order) return -EINVAL; - if (!uniform_split && - !non_uniform_split_supported(folio, new_order, true)) + if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true)) return -EINVAL; is_hzp = is_huge_zero_folio(folio); @@ -3632,29 +3978,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order, ret = -EBUSY; goto out; } - mapping = NULL; anon_vma_lock_write(anon_vma); + mapping = NULL; } else { unsigned int min_order; gfp_t gfp; mapping = folio->mapping; - - /* Truncated ? */ - /* - * TODO: add support for large shmem folio in swap cache. - * When shmem is in swap cache, mapping is NULL and - * folio_test_swapcache() is true. 
- */ - if (!mapping) { - ret = -EBUSY; - goto out; - } - min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { - VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", - min_order); ret = -EINVAL; goto out; } @@ -3667,9 +3999,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out; } - if (uniform_split) { + if (split_type == SPLIT_TYPE_UNIFORM) { xas_set_order(&xas, folio->index, new_order); - xas_split_alloc(&xas, folio, folio_order(folio), gfp); + xas_split_alloc(&xas, folio, old_order, gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -3717,127 +4049,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } } - /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&ds_queue->split_queue_lock); - if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct swap_cluster_info *ci = NULL; - struct lruvec *lruvec; - int expected_refs; - - if (folio_order(folio) > 1 && - !list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. - */ - list_del_init(&folio->_deferred_list); - } - spin_unlock(&ds_queue->split_queue_lock); - if (mapping) { - int nr = folio_nr_pages(folio); - - if (folio_test_pmd_mappable(folio) && - new_order < HPAGE_PMD_ORDER) { - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, - NR_SHMEM_THPS, -nr); - } else { - __lruvec_stat_mod_folio(folio, - NR_FILE_THPS, -nr); - filemap_nr_thps_dec(mapping); - } - } - } - - if (folio_test_swapcache(folio)) { - if (mapping) { - VM_WARN_ON_ONCE_FOLIO(mapping, folio); - ret = -EINVAL; - goto fail; - } - - ci = swap_cluster_get_and_lock(folio); - } - - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = folio_lruvec_lock(folio); - - ret = __split_unmapped_folio(folio, new_order, split_at, &xas, - mapping, uniform_split); - - /* - * Unfreeze after-split folios and put them back to the right - * list. @folio should be kept frozon until page cache - * entries are updated with all the other after-split folios - * to prevent others seeing stale page cache entries. - * As a result, new_folio starts from the next folio of - * @folio. - */ - for (new_folio = folio_next(folio); new_folio != end_folio; - new_folio = next) { - unsigned long nr_pages = folio_nr_pages(new_folio); - - next = folio_next(new_folio); - - expected_refs = folio_expected_ref_count(new_folio) + 1; - folio_ref_unfreeze(new_folio, expected_refs); - - lru_add_split_folio(folio, new_folio, lruvec, list); - - /* - * Anonymous folio with swap cache. - * NOTE: shmem in swap cache is not supported yet. - */ - if (ci) { - __swap_cache_replace_folio(ci, folio, new_folio); - continue; - } - - /* Anonymous folio without swap cache */ - if (!mapping) - continue; - - /* Add the new folio to the page cache. 
*/ - if (new_folio->index < end) { - __xa_store(&mapping->i_pages, new_folio->index, - new_folio, 0); - continue; - } - - /* Drop folio beyond EOF: ->index >= end */ - if (shmem_mapping(mapping)) - nr_shmem_dropped += nr_pages; - else if (folio_test_clear_dirty(new_folio)) - folio_account_cleaned( - new_folio, inode_to_wb(mapping->host)); - __filemap_remove_folio(new_folio, NULL); - folio_put_refs(new_folio, nr_pages); - } - /* - * Unfreeze @folio only after all page cache entries, which - * used to point to it, have been updated with new folios. - * Otherwise, a parallel folio_try_get() can grab @folio - * and its caller can see stale page cache entries. - */ - expected_refs = folio_expected_ref_count(folio) + 1; - folio_ref_unfreeze(folio, expected_refs); - - unlock_page_lruvec(lruvec); - - if (ci) - swap_cluster_unlock(ci); - } else { - spin_unlock(&ds_queue->split_queue_lock); - ret = -EAGAIN; - } + ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, + true, list, split_type, end, &nr_shmem_dropped, + extra_pins); fail: if (mapping) xas_unlock(&xas); @@ -3847,9 +4061,10 @@ fail: if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); - if (!ret && is_anon) + if (!ret && is_anon && !folio_is_device_private(folio)) remap_flags = RMP_USE_SHARED_ZEROPAGE; - remap_page(folio, 1 << order, remap_flags); + + remap_page(folio, 1 << old_order, remap_flags); /* * Unlock all after-split folios except the one containing @@ -3880,9 +4095,51 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (order == HPAGE_PMD_ORDER) + if (old_order == HPAGE_PMD_ORDER) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); - count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); + count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); + return ret; +} + +/** + * folio_split_unmapped() - split a large anon folio that is already unmapped + * @folio: folio to split + * @new_order: the order of folios after split + * + * This function is a helper for splitting folios that have already been + * unmapped. The use case is that the device or the CPU can refuse to migrate + * THP pages in the middle of migration, due to allocation issues on either + * side. + * + * anon_vma_lock is not required to be held, mmap_read_lock() or + * mmap_write_lock() should be held. @folio is expected to be locked by the + * caller. device-private and non device-private folios are supported along + * with folios that are in the swapcache. @folio should also be unmapped and + * isolated from LRU (if applicable) + * + * Upon return, the folio is not remapped, split folios are not added to LRU, + * free_folio_and_swap_cache() is not called, and new folios remain locked. + * + * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to + * insufficient reference count or extra pins). 
+ */ +int folio_split_unmapped(struct folio *folio, unsigned int new_order) +{ + int extra_pins, ret = 0; + + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); + + if (!can_split_folio(folio, 1, &extra_pins)) + return -EAGAIN; + + local_irq_disable(); + ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, + NULL, false, NULL, SPLIT_TYPE_UNIFORM, + 0, NULL, extra_pins); + local_irq_enable(); return ret; } @@ -3933,22 +4190,22 @@ out: * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. */ -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { struct folio *folio = page_folio(page); - return __folio_split(folio, new_order, &folio->page, page, list, true); + return __folio_split(folio, new_order, &folio->page, page, list, + SPLIT_TYPE_UNIFORM); } -/* - * folio_split: split a folio at @split_at to a @new_order folio +/** + * folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio - * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be - * split but not to @new_order, the caller needs to check) + * @list: after-split folios are added to @list if not null, otherwise to LRU + * list * * It has the same prerequisites and returns as * split_huge_page_to_list_to_order(). @@ -3962,12 +4219,15 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. * * After split, folio is left locked for caller. 
+ * + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - false); + SPLIT_TYPE_NON_UNIFORM); } int min_order_for_split(struct folio *folio) @@ -3986,12 +4246,7 @@ int min_order_for_split(struct folio *folio) int split_folio_to_list(struct folio *folio, struct list_head *list) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - return split_huge_page_to_list_to_order(&folio->page, list, ret); + return split_huge_page_to_list_to_order(&folio->page, list, 0); } /* @@ -4014,10 +4269,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio) bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); - WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); - ds_queue = get_deferred_split_queue(folio); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { @@ -4028,7 +4282,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) list_del_init(&folio->_deferred_list); unqueued = true; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); return unqueued; /* useful for debug warnings */ } @@ -4036,10 +4290,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = folio_memcg(folio); -#endif + struct deferred_split *ds_queue; unsigned long flags; /* @@ -4062,7 +4313,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) if (folio_test_swapcache(folio)) return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); @@ -4077,15 +4328,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } if (list_empty(&folio->_deferred_list)) { + struct mem_cgroup *memcg; + + memcg = folio_split_queue_memcg(folio, ds_queue); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; -#ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker->id); -#endif + shrinker_id(deferred_split_shrinker)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, @@ -4131,43 +4383,42 @@ static bool thp_underused(struct folio *folio) static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct pglist_data *pgdata = NODE_DATA(sc->nid); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue; unsigned long flags; - LIST_HEAD(list); - struct folio *folio, *next, *prev = NULL; - int split = 0, removed = 0; + struct folio 
*folio, *next; + int split = 0, i; + struct folio_batch fbatch; -#ifdef CONFIG_MEMCG - if (sc->memcg) - ds_queue = &sc->memcg->deferred_split_queue; -#endif + folio_batch_init(&fbatch); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); +retry: + ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { if (folio_try_get(folio)) { - list_move(&folio->_deferred_list, &list); - } else { + folio_batch_add(&fbatch, folio); + } else if (folio_test_partially_mapped(folio)) { /* We lost race with folio_put() */ - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; + folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } + list_del_init(&folio->_deferred_list); + ds_queue->split_queue_len--; if (!--sc->nr_to_scan) break; + if (!folio_batch_space(&fbatch)) + break; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); - list_for_each_entry_safe(folio, next, &list, _deferred_list) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { bool did_split = false; bool underused = false; + struct deferred_split *fqueue; + folio = fbatch.folios[i]; if (!folio_test_partially_mapped(folio)) { /* * See try_to_map_unused_to_zeropage(): we cannot @@ -4190,38 +4441,27 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } folio_unlock(folio); next: + if (did_split || !folio_test_partially_mapped(folio)) + continue; /* - * split_folio() removes folio from list on success. * Only add back to the queue if folio is partially mapped. * If thp_underused returns false, or if split_folio fails * in the case it was underused, then consider it used and * don't add it back to split_queue. */ - if (did_split) { - ; /* folio already removed from list */ - } else if (!folio_test_partially_mapped(folio)) { - list_del_init(&folio->_deferred_list); - removed++; - } else { - /* - * That unlocked list_del_init() above would be unsafe, - * unless its folio is separated from any earlier folios - * left on the list (which may be concurrently unqueued) - * by one safe folio with refcount still raised. - */ - swap(folio, prev); + fqueue = folio_split_queue_lock_irqsave(folio, &flags); + if (list_empty(&folio->_deferred_list)) { + list_add_tail(&folio->_deferred_list, &fqueue->split_queue); + fqueue->split_queue_len++; } - if (folio) - folio_put(folio); + split_queue_unlock_irqrestore(fqueue, flags); } + folios_put(&fbatch); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - list_splice_tail(&list, &ds_queue->split_queue); - ds_queue->split_queue_len -= removed; - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - - if (prev) - folio_put(prev); + if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { + cond_resched(); + goto retry; + } /* * Stop shrinker if we didn't split any page, but the queue is empty. 
@@ -4232,6 +4472,33 @@ next: return split; } +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct deferred_split *ds_queue = &memcg->deferred_split_queue; + struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; + int nid; + + spin_lock_irq(&ds_queue->split_queue_lock); + spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); + + if (!ds_queue->split_queue_len) + goto unlock; + + list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); + parent_ds_queue->split_queue_len += ds_queue->split_queue_len; + ds_queue->split_queue_len = 0; + + for_each_node(nid) + set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); + +unlock: + spin_unlock(&parent_ds_queue->split_queue_lock); + spin_unlock_irq(&ds_queue->split_queue_lock); +} +#endif + #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { @@ -4593,7 +4860,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + if (unlikely(!pmd_present(*pvmw->pmd))) + pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd); + else + pmdval = pmdp_invalidate(vma, address, pvmw->pmd); /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); @@ -4635,30 +4905,48 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; - swp_entry_t entry; + softleaf_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; - entry = pmd_to_swp_entry(*pvmw->pmd); + entry = softleaf_from_pmd(*pvmw->pmd); folio_get(folio); pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); + if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); - if (!is_migration_entry_young(entry)) + if (!softleaf_is_migration_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pmde = pmd_mkdirty(pmde); + if (folio_is_device_private(folio)) { + swp_entry_t entry; + + if (pmd_write(pmde)) + entry = make_writable_device_private_entry( + page_to_pfn(new)); + else + entry = make_readable_device_private_entry( + page_to_pfn(new)); + pmde = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_swp_mksoft_dirty(pmde); + if (pmd_swp_uffd_wp(*pvmw->pmd)) + pmde = pmd_swp_mkuffd_wp(pmde); + } + if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; - if (!is_readable_migration_entry(entry)) + if (!softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0455119716ec..9e7815b4f058 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,7 +7,6 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/seq_file.h> -#include <linux/sysctl.h> #include <linux/highmem.h> #include <linux/mmu_notifier.h> #include <linux/nodemask.h> @@ -19,7 +18,6 @@ #include <linux/mutex.h> #include <linux/memblock.h> #include 
<linux/minmax.h> -#include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/mmdebug.h> @@ -28,7 +26,7 @@ #include <linux/string_choices.h> #include <linux/string_helpers.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/jhash.h> #include <linux/numa.h> #include <linux/llist.h> @@ -39,20 +37,19 @@ #include <linux/memory.h> #include <linux/mm_inline.h> #include <linux/padata.h> +#include <linux/pgalloc.h> #include <asm/page.h> -#include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/setup.h> #include <linux/io.h> -#include <linux/hugetlb.h> -#include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" #include "hugetlb_cma.h" +#include "hugetlb_internal.h" #include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; @@ -119,7 +116,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -427,17 +423,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +/* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) - return; + return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return; + return -EINVAL; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { @@ -452,13 +452,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. 
*/ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return; + return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; + + return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1190,20 +1192,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); - set_vma_private_data(vma, (unsigned long)map); + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); - set_vma_private_data(vma, get_vma_private_data(vma) | flags); + desc->private_data = map; +} + +static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + + desc->private_data = (void *)((unsigned long)desc->private_data | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) @@ -1213,6 +1223,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } +static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + + return ((unsigned long)desc->private_data) & flag; +} + bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -1400,77 +1417,6 @@ err: return NULL; } -/* - * common helper functions for hstate_next_node_to_{alloc|free}. - * We may have allocated or freed a huge page based on a different - * nodes_allowed previously, so h->next_node_to_{alloc|free} might - * be outside of *nodes_allowed. Ensure that we use an allowed - * node for alloc or free. - */ -static int next_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - nid = next_node_in(nid, *nodes_allowed); - VM_BUG_ON(nid >= MAX_NUMNODES); - - return nid; -} - -static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - if (!node_isset(nid, *nodes_allowed)) - nid = next_node_allowed(nid, nodes_allowed); - return nid; -} - -/* - * returns the previously saved node ["this node"] from which to - * allocate a persistent huge page for the pool and advance the - * next node from which to allocate, handling wrap at end of node - * mask. - */ -static int hstate_next_node_to_alloc(int *next_node, - nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(*next_node, nodes_allowed); - *next_node = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -/* - * helper for remove_pool_hugetlb_folio() - return the previously saved - * node ["this node"] from which to free a huge page. 
Advance the - * next node id whether or not we find a free huge page to free so - * that the next attempt to free addresses the next node. - */ -static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); - h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ - nr_nodes--) - -#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_free(hs, mask)) || 1); \ - nr_nodes--) - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, @@ -1526,8 +1472,8 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, * * Must be called with hugetlb lock held. */ -static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) +void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1535,7 +1481,7 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) return; list_del(&folio->lru); @@ -1562,8 +1508,8 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, h->nr_huge_pages_node[nid]--; } -static void add_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) +void add_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1597,7 +1543,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, { bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) return; /* @@ -1894,7 +1840,7 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) h->nr_huge_pages_node[folio_nid(folio)]++; } -static void init_new_hugetlb_folio(struct folio *folio) +void init_new_hugetlb_folio(struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); @@ -2006,8 +1952,8 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, return folio; } -static void prep_and_add_allocated_folios(struct hstate *h, - struct list_head *folio_list) +void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; @@ -2212,7 +2158,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, { struct folio *folio = NULL; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic_no_runtime(h)) return NULL; spin_lock_irq(&hugetlb_lock); @@ -2491,7 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) goto out; /* @@ -2934,7 +2880,7 @@ typedef enum { * NOTE: This is mostly identical to MAP_CHG_NEEDED, except * that currently vma_needs_reservation() has an 
unwanted side * effect to either use end() or commit() to complete the - * transaction. Hence it needs to differenciate from NEEDED. + * transaction. Hence it needs to differentiate from NEEDED. */ MAP_CHG_ENFORCED = 2, } map_chg_state; @@ -3215,7 +3161,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, unsigned long start_page_number, unsigned long end_page_number) { - enum zone_type zone = zone_idx(folio_zone(folio)); + enum zone_type zone = folio_zonenum(folio); int nid = folio_nid(folio); struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); @@ -3248,7 +3194,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); - prep_compound_head((struct page *)folio, huge_page_order(h)); + prep_compound_head(&folio->page, huge_page_order(h)); } static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) @@ -3705,7 +3651,7 @@ static void __init hugetlb_init_hstates(void) * - If CMA allocation is possible, we can not demote * HUGETLB_PAGE_ORDER or smaller size pages. */ - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) continue; if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; @@ -4062,8 +4008,8 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, return rc; } -static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, - unsigned long nr_to_demote) +long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, + unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; @@ -4131,58 +4077,14 @@ static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, return -EBUSY; } -#define HSTATE_ATTR_RO(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_RO(_name) - -#define HSTATE_ATTR_WO(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_WO(_name) - -#define HSTATE_ATTR(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_RW(_name) - -static struct kobject *hugepages_kobj; -static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; - -static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); - -static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) -{ - int i; - - for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (hstate_kobjs[i] == kobj) { - if (nidp) - *nidp = NUMA_NO_NODE; - return &hstates[i]; - } - - return kobj_to_node_hstate(kobj, nidp); -} - -static ssize_t nr_hugepages_show_common(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h; - unsigned long nr_huge_pages; - int nid; - - h = kobj_to_hstate(kobj, &nid); - if (nid == NUMA_NO_NODE) - nr_huge_pages = h->nr_huge_pages; - else - nr_huge_pages = h->nr_huge_pages_node[nid]; - - return sysfs_emit(buf, "%lu\n", nr_huge_pages); -} - -static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, +ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { int err; nodemask_t nodes_allowed, *n_mask; - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + if (hstate_is_gigantic_no_runtime(h)) return -EINVAL; if (nid == NUMA_NO_NODE) { @@ -4208,458 +4110,6 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, return err ? 
err : len; } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, const char *buf, - size_t len) -{ - struct hstate *h; - unsigned long count; - int nid; - int err; - - err = kstrtoul(buf, 10, &count); - if (err) - return err; - - h = kobj_to_hstate(kobj, &nid); - return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); -} - -static ssize_t nr_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return nr_hugepages_show_common(kobj, attr, buf); -} - -static ssize_t nr_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - return nr_hugepages_store_common(false, kobj, buf, len); -} -HSTATE_ATTR(nr_hugepages); - -#ifdef CONFIG_NUMA - -/* - * hstate attribute for optionally mempolicy-based constraint on persistent - * huge page alloc/free. - */ -static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return nr_hugepages_show_common(kobj, attr, buf); -} - -static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - return nr_hugepages_store_common(true, kobj, buf, len); -} -HSTATE_ATTR(nr_hugepages_mempolicy); -#endif - - -static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h = kobj_to_hstate(kobj, NULL); - return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); -} - -static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - int err; - unsigned long input; - struct hstate *h = kobj_to_hstate(kobj, NULL); - - if (hstate_is_gigantic(h)) - return -EINVAL; - - err = kstrtoul(buf, 10, &input); - if (err) - return err; - - spin_lock_irq(&hugetlb_lock); - h->nr_overcommit_huge_pages = input; - spin_unlock_irq(&hugetlb_lock); - - return count; -} -HSTATE_ATTR(nr_overcommit_hugepages); - -static ssize_t free_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h; - unsigned long free_huge_pages; - int nid; - - h = kobj_to_hstate(kobj, &nid); - if (nid == NUMA_NO_NODE) - free_huge_pages = h->free_huge_pages; - else - free_huge_pages = h->free_huge_pages_node[nid]; - - return sysfs_emit(buf, "%lu\n", free_huge_pages); -} -HSTATE_ATTR_RO(free_hugepages); - -static ssize_t resv_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h = kobj_to_hstate(kobj, NULL); - return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); -} -HSTATE_ATTR_RO(resv_hugepages); - -static ssize_t surplus_hugepages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h; - unsigned long surplus_huge_pages; - int nid; - - h = kobj_to_hstate(kobj, &nid); - if (nid == NUMA_NO_NODE) - surplus_huge_pages = h->surplus_huge_pages; - else - surplus_huge_pages = h->surplus_huge_pages_node[nid]; - - return sysfs_emit(buf, "%lu\n", surplus_huge_pages); -} -HSTATE_ATTR_RO(surplus_hugepages); - -static ssize_t demote_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t len) -{ - unsigned long nr_demote; - unsigned long nr_available; - nodemask_t nodes_allowed, *n_mask; - struct hstate *h; - int err; - int nid; - - err = kstrtoul(buf, 10, &nr_demote); - if (err) - return err; - h = kobj_to_hstate(kobj, &nid); - - if (nid != NUMA_NO_NODE) { - init_nodemask_of_node(&nodes_allowed, nid); - 
n_mask = &nodes_allowed; - } else { - n_mask = &node_states[N_MEMORY]; - } - - /* Synchronize with other sysfs operations modifying huge pages */ - mutex_lock(&h->resize_lock); - spin_lock_irq(&hugetlb_lock); - - while (nr_demote) { - long rc; - - /* - * Check for available pages to demote each time thorough the - * loop as demote_pool_huge_page will drop hugetlb_lock. - */ - if (nid != NUMA_NO_NODE) - nr_available = h->free_huge_pages_node[nid]; - else - nr_available = h->free_huge_pages; - nr_available -= h->resv_huge_pages; - if (!nr_available) - break; - - rc = demote_pool_huge_page(h, n_mask, nr_demote); - if (rc < 0) { - err = rc; - break; - } - - nr_demote -= rc; - } - - spin_unlock_irq(&hugetlb_lock); - mutex_unlock(&h->resize_lock); - - if (err) - return err; - return len; -} -HSTATE_ATTR_WO(demote); - -static ssize_t demote_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct hstate *h = kobj_to_hstate(kobj, NULL); - unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; - - return sysfs_emit(buf, "%lukB\n", demote_size); -} - -static ssize_t demote_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct hstate *h, *demote_hstate; - unsigned long demote_size; - unsigned int demote_order; - - demote_size = (unsigned long)memparse(buf, NULL); - - demote_hstate = size_to_hstate(demote_size); - if (!demote_hstate) - return -EINVAL; - demote_order = demote_hstate->order; - if (demote_order < HUGETLB_PAGE_ORDER) - return -EINVAL; - - /* demote order must be smaller than hstate order */ - h = kobj_to_hstate(kobj, NULL); - if (demote_order >= h->order) - return -EINVAL; - - /* resize_lock synchronizes access to demote size and writes */ - mutex_lock(&h->resize_lock); - h->demote_order = demote_order; - mutex_unlock(&h->resize_lock); - - return count; -} -HSTATE_ATTR(demote_size); - -static struct attribute *hstate_attrs[] = { - &nr_hugepages_attr.attr, - &nr_overcommit_hugepages_attr.attr, - &free_hugepages_attr.attr, - &resv_hugepages_attr.attr, - &surplus_hugepages_attr.attr, -#ifdef CONFIG_NUMA - &nr_hugepages_mempolicy_attr.attr, -#endif - NULL, -}; - -static const struct attribute_group hstate_attr_group = { - .attrs = hstate_attrs, -}; - -static struct attribute *hstate_demote_attrs[] = { - &demote_size_attr.attr, - &demote_attr.attr, - NULL, -}; - -static const struct attribute_group hstate_demote_attr_group = { - .attrs = hstate_demote_attrs, -}; - -static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, - struct kobject **hstate_kobjs, - const struct attribute_group *hstate_attr_group) -{ - int retval; - int hi = hstate_index(h); - - hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); - if (!hstate_kobjs[hi]) - return -ENOMEM; - - retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); - if (retval) { - kobject_put(hstate_kobjs[hi]); - hstate_kobjs[hi] = NULL; - return retval; - } - - if (h->demote_order) { - retval = sysfs_create_group(hstate_kobjs[hi], - &hstate_demote_attr_group); - if (retval) { - pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); - sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); - kobject_put(hstate_kobjs[hi]); - hstate_kobjs[hi] = NULL; - return retval; - } - } - - return 0; -} - -#ifdef CONFIG_NUMA -static bool hugetlb_sysfs_initialized __ro_after_init; - -/* - * node_hstate/s - associate per node hstate attributes, via their kobjects, - * with node devices in node_devices[] using a 
parallel array. The array - * index of a node device or _hstate == node id. - * This is here to avoid any static dependency of the node device driver, in - * the base kernel, on the hugetlb module. - */ -struct node_hstate { - struct kobject *hugepages_kobj; - struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; -}; -static struct node_hstate node_hstates[MAX_NUMNODES]; - -/* - * A subset of global hstate attributes for node devices - */ -static struct attribute *per_node_hstate_attrs[] = { - &nr_hugepages_attr.attr, - &free_hugepages_attr.attr, - &surplus_hugepages_attr.attr, - NULL, -}; - -static const struct attribute_group per_node_hstate_attr_group = { - .attrs = per_node_hstate_attrs, -}; - -/* - * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. - * Returns node id via non-NULL nidp. - */ -static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) -{ - int nid; - - for (nid = 0; nid < nr_node_ids; nid++) { - struct node_hstate *nhs = &node_hstates[nid]; - int i; - for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (nhs->hstate_kobjs[i] == kobj) { - if (nidp) - *nidp = nid; - return &hstates[i]; - } - } - - BUG(); - return NULL; -} - -/* - * Unregister hstate attributes from a single node device. - * No-op if no hstate attributes attached. - */ -void hugetlb_unregister_node(struct node *node) -{ - struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->dev.id]; - - if (!nhs->hugepages_kobj) - return; /* no hstate attributes */ - - for_each_hstate(h) { - int idx = hstate_index(h); - struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; - - if (!hstate_kobj) - continue; - if (h->demote_order) - sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); - sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); - kobject_put(hstate_kobj); - nhs->hstate_kobjs[idx] = NULL; - } - - kobject_put(nhs->hugepages_kobj); - nhs->hugepages_kobj = NULL; -} - - -/* - * Register hstate attributes for a single node device. - * No-op if attributes already registered. - */ -void hugetlb_register_node(struct node *node) -{ - struct hstate *h; - struct node_hstate *nhs = &node_hstates[node->dev.id]; - int err; - - if (!hugetlb_sysfs_initialized) - return; - - if (nhs->hugepages_kobj) - return; /* already allocated */ - - nhs->hugepages_kobj = kobject_create_and_add("hugepages", - &node->dev.kobj); - if (!nhs->hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, - nhs->hstate_kobjs, - &per_node_hstate_attr_group); - if (err) { - pr_err("HugeTLB: Unable to add hstate %s for node %d\n", - h->name, node->dev.id); - hugetlb_unregister_node(node); - break; - } - } -} - -/* - * hugetlb init time: register hstate attributes for all registered node - * devices of nodes that have memory. All on-line nodes should have - * registered their associated device by this time. 
- */ -static void __init hugetlb_register_all_nodes(void) -{ - int nid; - - for_each_online_node(nid) - hugetlb_register_node(node_devices[nid]); -} -#else /* !CONFIG_NUMA */ - -static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) -{ - BUG(); - if (nidp) - *nidp = -1; - return NULL; -} - -static void hugetlb_register_all_nodes(void) { } - -#endif - -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s\n", h->name); - } - -#ifdef CONFIG_NUMA - hugetlb_sysfs_initialized = true; -#endif - hugetlb_register_all_nodes(); -} - -#ifdef CONFIG_SYSCTL -static void hugetlb_sysctl_init(void); -#else -static inline void hugetlb_sysctl_init(void) { } -#endif - static int __init hugetlb_init(void) { int i; @@ -5092,131 +4542,6 @@ static unsigned int allowed_mems_nr(struct hstate *h) return nr; } -#ifdef CONFIG_SYSCTL -static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos, unsigned long *out) -{ - struct ctl_table dup_table; - - /* - * In order to avoid races with __do_proc_doulongvec_minmax(), we - * can duplicate the @table and alter the duplicate of it. - */ - dup_table = *table; - dup_table.data = out; - - return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); -} - -static int hugetlb_sysctl_handler_common(bool obey_mempolicy, - const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct hstate *h = &default_hstate; - unsigned long tmp = h->max_huge_pages; - int ret; - - if (!hugepages_supported()) - return -EOPNOTSUPP; - - ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, - &tmp); - if (ret) - goto out; - - if (write) - ret = __nr_hugepages_store_common(obey_mempolicy, h, - NUMA_NO_NODE, tmp, *length); -out: - return ret; -} - -static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - - return hugetlb_sysctl_handler_common(false, table, write, - buffer, length, ppos); -} - -#ifdef CONFIG_NUMA -static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - return hugetlb_sysctl_handler_common(true, table, write, - buffer, length, ppos); -} -#endif /* CONFIG_NUMA */ - -static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct hstate *h = &default_hstate; - unsigned long tmp; - int ret; - - if (!hugepages_supported()) - return -EOPNOTSUPP; - - tmp = h->nr_overcommit_huge_pages; - - if (write && hstate_is_gigantic(h)) - return -EINVAL; - - ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, - &tmp); - if (ret) - goto out; - - if (write) { - spin_lock_irq(&hugetlb_lock); - h->nr_overcommit_huge_pages = tmp; - spin_unlock_irq(&hugetlb_lock); - } -out: - return ret; -} - -static const struct ctl_table hugetlb_table[] = { - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - }, -#ifdef CONFIG_NUMA - { - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - 
.proc_handler = &hugetlb_mempolicy_sysctl_handler, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - }, -}; - -static void __init hugetlb_sysctl_init(void) -{ - register_sysctl_init("vm", hugetlb_table); -} -#endif /* CONFIG_SYSCTL */ - void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; @@ -5521,32 +4846,6 @@ static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, set_huge_ptep_writable(vma, address, ptep); } -bool is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return false; - swp = pte_to_swp_entry(pte); - if (is_migration_entry(swp)) - return true; - else - return false; -} - -bool is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return false; - swp = pte_to_swp_entry(pte); - if (is_hwpoison_entry(swp)) - return true; - else - return false; -} - static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) @@ -5575,6 +4874,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; + softleaf_t softleaf; int ret = 0; if (cow) { @@ -5622,26 +4922,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { - /* - * Skip if src entry none. - */ - ; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { + /* Skip if src entry none. */ + goto next; + } + + softleaf = softleaf_from_pte(entry); + if (unlikely(softleaf_is_hwpoison(softleaf))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); - } else if (unlikely(is_hugetlb_entry_migration(entry))) { - swp_entry_t swp_entry = pte_to_swp_entry(entry); + } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); - if (!is_readable_migration_entry(swp_entry) && cow) { + if (!softleaf_is_migration_read(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. 
*/ - swp_entry = make_readable_migration_entry( - swp_offset(swp_entry)); - entry = swp_entry_to_pte(swp_entry); + softleaf = make_readable_migration_entry( + swp_offset(softleaf)); + entry = swp_entry_to_pte(softleaf); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); @@ -5649,9 +4949,8 @@ again: if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); - } else if (unlikely(is_pte_marker(entry))) { - pte_marker marker = copy_pte_marker( - pte_to_swp_entry(entry), dst_vma); + } else if (unlikely(pte_is_marker(entry))) { + const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, @@ -5708,9 +5007,7 @@ again: } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - continue; + goto next; } if (cow) { @@ -5731,6 +5028,8 @@ again: set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } + +next: spin_unlock(src_ptl); spin_unlock(dst_ptl); } @@ -5767,13 +5066,13 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); - if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) { huge_pte_clear(mm, new_addr, dst_pte, sz); - else { + } else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); @@ -6007,7 +5306,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We * could defer the flush until now, since by holding i_mmap_rwsem we - * guaranteed that the last refernece would not be dropped. But we must + * guaranteed that the last reference would not be dropped. But we must * do the flushing before we return, as otherwise i_mmap_rwsem will be * dropped and the last reference to the shared PMDs page might be * dropped as well. @@ -6586,7 +5885,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) + if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); @@ -6712,36 +6011,37 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); - if (huge_pte_none_mostly(vmf.orig_pte)) { - if (is_pte_marker(vmf.orig_pte)) { - pte_marker marker = - pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); - - if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(hstate_index(h)); - goto out_mutex; - } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { - /* This isn't supported in hugetlb. */ - ret = VM_FAULT_SIGSEGV; - goto out_mutex; - } - } - + if (huge_pte_none(vmf.orig_pte)) /* - * Other PTE markers should be handled the same way as none PTE. - * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. 
*/ return hugetlb_no_page(mapping, &vmf); + + if (pte_is_marker(vmf.orig_pte)) { + const pte_marker marker = + softleaf_to_marker(softleaf_from_pte(vmf.orig_pte)); + + if (marker & PTE_MARKER_POISONED) { + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + goto out_mutex; + } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { + /* This isn't supported in hugetlb. */ + ret = VM_FAULT_SIGSEGV; + goto out_mutex; + } + + return hugetlb_no_page(mapping, &vmf); } ret = 0; /* Not present, either a migration or a hwpoisoned entry */ - if (!pte_present(vmf.orig_pte)) { - if (is_hugetlb_entry_migration(vmf.orig_pte)) { + if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) { + const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte); + + if (softleaf_is_migration(softleaf)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6752,9 +6052,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte)) + } + if (softleaf_is_hwpoison(softleaf)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } + goto out_mutex; } @@ -6903,6 +6206,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; + pte_t dst_ptep; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); @@ -7042,13 +6346,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (folio_test_hwpoison(folio)) goto out_release_unlock; + ret = -EEXIST; + + dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); /* - * We allow to overwrite a pte marker: consider when both MISSING|WP - * registered, we firstly wr-protect a none pte which has no page cache - * page backing it, then access the page. + * See comment about UFFD marker overwriting in + * mfill_atomic_install_pte(). */ - ret = -EEXIST; - if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) + if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) @@ -7134,7 +6439,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { + softleaf_t entry; spinlock_t *ptl; + ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { @@ -7166,14 +6473,23 @@ long hugetlb_change_protection(struct vm_area_struct *vma, continue; } pte = huge_ptep_get(mm, address, ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + if (huge_pte_none(pte)) { + if (unlikely(uffd_wp)) + /* Safe to modify directly (none->non-present). */ + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), + psize); + goto next; + } + + entry = softleaf_from_pte(pte); + if (unlikely(softleaf_is_hwpoison(entry))) { /* Nothing to do. 
*/ - } else if (unlikely(is_hugetlb_entry_migration(pte))) { - swp_entry_t entry = pte_to_swp_entry(pte); - struct folio *folio = pfn_swap_entry_folio(entry); + } else if (unlikely(softleaf_is_migration(entry))) { + struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; - if (is_writable_migration_entry(entry)) { + if (softleaf_is_migration_write(entry)) { if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); @@ -7190,17 +6506,17 @@ long hugetlb_change_protection(struct vm_area_struct *vma, newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); - } else if (unlikely(is_pte_marker(pte))) { + } else if (unlikely(pte_is_marker(pte))) { /* * Do nothing on a poison marker; page is - * corrupted, permissons do not apply. Here + * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ - if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) + if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); - } else if (!huge_pte_none(pte)) { + } else { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); @@ -7213,16 +6529,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; - } else { - /* None pte */ - if (unlikely(uffd_wp)) - /* Safe to modify directly (none->non-present). */ - set_huge_pte_at(mm, address, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP), - psize); } - spin_unlock(ptl); +next: + spin_unlock(ptl); cond_resched(); } /* @@ -7259,9 +6569,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, */ long hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) + long from, long to, + struct vm_area_desc *desc, + vm_flags_t vm_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -7277,12 +6587,6 @@ long hugetlb_reserve_pages(struct inode *inode, } /* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ - hugetlb_vma_lock_alloc(vma); - - /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves @@ -7294,9 +6598,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. Assume !vma is a shm mapping + * called to make the mapping read-write. 
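(Not part of the patch, but the reservation rule described in the comment above is observable from userspace: a private MAP_HUGETLB mapping charges its reservation at mmap() time, before any fault. A minimal sketch, assuming the default 2 MiB huge page size and a pool with at least four free pages:)

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

static long hugepages_rsvd(void)
{
        char line[128];
        long val = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "HugePages_Rsvd: %ld", &val) == 1)
                        break;
        fclose(f);
        return val;
}

int main(void)
{
        const size_t len = 4 * (2048UL << 10);          /* four 2 MiB pages */
        long before = hugepages_rsvd();
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }
        /* The reservation is charged at mmap() time, before any fault. */
        printf("HugePages_Rsvd: %ld -> %ld\n", before, hugepages_rsvd());
        munmap(p, len);
        return 0;
}

Running it should show HugePages_Rsvd growing by four even though no page has been touched yet; the continuation of the comment follows.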
Assume !desc is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -7313,8 +6617,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + set_vma_desc_resv_map(desc, resv_map); + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) @@ -7324,7 +6628,7 @@ long hugetlb_reserve_pages(struct inode *inode, chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -7358,7 +6662,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -7412,16 +6716,15 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - hugetlb_vma_lock_free(vma); - if (!vma || vma->vm_flags & VM_MAYSHARE) + if (!desc || desc->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_resv_map(vma, NULL); + set_vma_desc_resv_map(desc, NULL); } return chg < 0 ? chg : add < 0 ? add : -EINVAL; } diff --git a/mm/hugetlb_internal.h b/mm/hugetlb_internal.h new file mode 100644 index 000000000000..1d2f870deccf --- /dev/null +++ b/mm/hugetlb_internal.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Internal HugeTLB definitions. + * (C) Nadia Yvette Chambers, April 2004 + */ + +#ifndef _LINUX_HUGETLB_INTERNAL_H +#define _LINUX_HUGETLB_INTERNAL_H + +#include <linux/hugetlb.h> +#include <linux/hugetlb_cgroup.h> + +/* + * Check if the hstate represents gigantic pages but gigantic page + * runtime support is not available. This is a common condition used to + * skip operations that cannot be performed on gigantic pages when runtime + * support is disabled. + */ +static inline bool hstate_is_gigantic_no_runtime(struct hstate *h) +{ + return hstate_is_gigantic(h) && !gigantic_page_runtime_supported(); +} + +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. 
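(Aside, illustrative only: the next_node_allowed()/hstate_next_node_to_alloc() helpers that follow keep a round-robin cursor over the allowed-node mask and tolerate a cursor left pointing at a node that is no longer allowed. A standalone userspace model of that wrap-around logic, using a plain bitmask in place of nodemask_t; the model_ names are mine, not kernel symbols:)

#include <stdio.h>

#define MAX_NODES 8

/* Model of next_node_in(): next set bit strictly after nid, wrapping. */
static int model_next_node_in(int nid, unsigned int allowed)
{
        for (int i = 1; i <= MAX_NODES; i++) {
                int n = (nid + i) % MAX_NODES;

                if (allowed & (1u << n))
                        return n;
        }
        return -1;                              /* empty mask */
}

/* Model of hstate_next_node_to_alloc(): return this node, advance cursor. */
static int model_next_node_to_alloc(int *cursor, unsigned int allowed)
{
        int nid = *cursor;

        /* The saved cursor may lie outside the current mask; fix it up. */
        if (!(allowed & (1u << nid)))
                nid = model_next_node_in(nid, allowed);
        *cursor = model_next_node_in(nid, allowed);
        return nid;
}

int main(void)
{
        unsigned int allowed = 0x0d;            /* nodes 0, 2 and 3 */
        int cursor = 1;                         /* stale cursor, not allowed */

        for (int i = 0; i < 6; i++)
                printf("allocate on node %d\n",
                       model_next_node_to_alloc(&cursor, allowed));
        return 0;                               /* prints 2, 3, 0, 2, 3, 0 */
}

The for_each_node_mask_to_alloc() macro below simply runs this selection nodes_weight(mask) times, so one full pass visits every allowed node exactly once.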
+ */ +static inline int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node_in(nid, *nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static inline int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static inline int hstate_next_node_to_alloc(int *next_node, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for remove_pool_hugetlb_folio() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. + */ +static inline int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +extern void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus); +extern void add_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus); +extern void init_new_hugetlb_folio(struct folio *folio); +extern void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list); +extern long demote_pool_huge_page(struct hstate *src, + nodemask_t *nodes_allowed, + unsigned long nr_to_demote); +extern ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len); + +extern void hugetlb_sysfs_init(void) __init; + +#ifdef CONFIG_SYSCTL +extern void hugetlb_sysctl_init(void); +#else +static inline void hugetlb_sysctl_init(void) { } +#endif + +#endif /* _LINUX_HUGETLB_INTERNAL_H */ diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c new file mode 100644 index 000000000000..bd3077150542 --- /dev/null +++ b/mm/hugetlb_sysctl.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HugeTLB sysfs interfaces. + * (C) Nadia Yvette Chambers, April 2004 + */ + +#include <linux/sysctl.h> + +#include "hugetlb_internal.h" + +#ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. 
+ */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp = h->max_huge_pages; + int ret; + + if (!hugepages_supported()) + return -EOPNOTSUPP; + + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); + if (ret) + goto out; + + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); +out: + return ret; +} + +static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + +static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp; + int ret; + + if (!hugepages_supported()) + return -EOPNOTSUPP; + + tmp = h->nr_overcommit_huge_pages; + + if (write && hstate_is_gigantic_no_runtime(h)) + return -EINVAL; + + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); + if (ret) + goto out; + + if (write) { + spin_lock_irq(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock_irq(&hugetlb_lock); + } +out: + return ret; +} + +static const struct ctl_table hugetlb_table[] = { + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, +}; + +void __init hugetlb_sysctl_init(void) +{ + register_sysctl_init("vm", hugetlb_table); +} +#endif /* CONFIG_SYSCTL */ diff --git a/mm/hugetlb_sysfs.c b/mm/hugetlb_sysfs.c new file mode 100644 index 000000000000..79ece91406bf --- /dev/null +++ b/mm/hugetlb_sysfs.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HugeTLB sysfs interfaces. 
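(Aside: the handlers and ctl_table above only change files in this patch; the knobs remain reachable under /proc/sys/vm. A minimal userspace reader, with the pool-resizing write left as a comment because it needs CAP_SYS_ADMIN:)

#include <stdio.h>

int main(void)
{
        unsigned long nr;
        FILE *f = fopen("/proc/sys/vm/nr_hugepages", "r");

        if (!f) {
                perror("/proc/sys/vm/nr_hugepages");
                return 1;
        }
        if (fscanf(f, "%lu", &nr) == 1)
                printf("default hstate pool size: %lu huge pages\n", nr);
        fclose(f);

        /*
         * Writing a new count back to the same file (or to
         * /proc/sys/vm/nr_hugepages_mempolicy on NUMA kernels) resizes the
         * pool through the handlers shown above.
         */
        return 0;
}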
+ * (C) Nadia Yvette Chambers, April 2004 + */ + +#include <linux/swap.h> +#include <linux/page_owner.h> +#include <linux/page-isolation.h> + +#include "hugetlb_vmemmap.h" +#include "hugetlb_internal.h" + +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR_WO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_WO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RW(_name) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) +{ + int i; + + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; + return &hstates[i]; + } + + return kobj_to_node_hstate(kobj, nidp); +} + +static ssize_t nr_hugepages_show_common(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", nr_huge_pages); +} + +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages); + +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. 
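(Aside, a hedged sketch rather than anything from the tree: as the comment above says, nr_hugepages_mempolicy applies the writing task's memory policy when resizing the pool. The sketch assumes node 0 exists, the default 2 MiB huge page size and CAP_SYS_ADMIN; it uses the raw set_mempolicy(2) syscall to avoid a libnuma dependency, and the count of 64 pages is arbitrary:)

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mempolicy.h>

int main(void)
{
        unsigned long nodemask = 1UL << 0;      /* node 0 only */
        const char *knob = "/sys/kernel/mm/hugepages/hugepages-2048kB/"
                           "nr_hugepages_mempolicy";
        FILE *f;

        /* Bind this task's allocations to node 0. */
        if (syscall(SYS_set_mempolicy, MPOL_BIND, &nodemask,
                    8 * sizeof(nodemask) + 1)) {
                perror("set_mempolicy");
                return 1;
        }

        /* Pool growth requested through this file honours the policy above. */
        f = fopen(knob, "w");
        if (!f) {
                perror(knob);
                return 1;
        }
        fprintf(f, "64\n");
        return fclose(f) ? 1 : 0;
}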
+ */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); +} + +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj, NULL); + + if (hstate_is_gigantic_no_runtime(h)) + return -EINVAL; + + err = kstrtoul(buf, 10, &input); + if (err) + return err; + + spin_lock_irq(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock_irq(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static ssize_t demote_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_demote; + unsigned long nr_available; + nodemask_t nodes_allowed, *n_mask; + struct hstate *h; + int err; + int nid; + + err = kstrtoul(buf, 10, &nr_demote); + if (err) + return err; + h = kobj_to_hstate(kobj, &nid); + + if (nid != NUMA_NO_NODE) { + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } else { + n_mask = &node_states[N_MEMORY]; + } + + /* Synchronize with other sysfs operations modifying huge pages */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); + + while (nr_demote) { + long rc; + + /* + * Check for available pages to demote each time thorough the + * loop as demote_pool_huge_page will drop hugetlb_lock. 
+ */ + if (nid != NUMA_NO_NODE) + nr_available = h->free_huge_pages_node[nid]; + else + nr_available = h->free_huge_pages; + nr_available -= h->resv_huge_pages; + if (!nr_available) + break; + + rc = demote_pool_huge_page(h, n_mask, nr_demote); + if (rc < 0) { + err = rc; + break; + } + + nr_demote -= rc; + } + + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + + if (err) + return err; + return len; +} +HSTATE_ATTR_WO(demote); + +static ssize_t demote_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; + + return sysfs_emit(buf, "%lukB\n", demote_size); +} + +static ssize_t demote_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hstate *h, *demote_hstate; + unsigned long demote_size; + unsigned int demote_order; + + demote_size = (unsigned long)memparse(buf, NULL); + + demote_hstate = size_to_hstate(demote_size); + if (!demote_hstate) + return -EINVAL; + demote_order = demote_hstate->order; + if (demote_order < HUGETLB_PAGE_ORDER) + return -EINVAL; + + /* demote order must be smaller than hstate order */ + h = kobj_to_hstate(kobj, NULL); + if (demote_order >= h->order) + return -EINVAL; + + /* resize_lock synchronizes access to demote size and writes */ + mutex_lock(&h->resize_lock); + h->demote_order = demote_order; + mutex_unlock(&h->resize_lock); + + return count; +} +HSTATE_ATTR(demote_size); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif + NULL, +}; + +static const struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static struct attribute *hstate_demote_attrs[] = { + &demote_size_attr.attr, + &demote_attr.attr, + NULL, +}; + +static const struct attribute_group hstate_demote_attr_group = { + .attrs = hstate_demote_attrs, +}; + +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + const struct attribute_group *hstate_attr_group) +{ + int retval; + int hi = hstate_index(h); + + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); + if (retval) { + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } + + if (h->demote_order) { + retval = sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group); + if (retval) { + pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } + } + + return 0; +} + +#ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in + * the base kernel, on the hugetlb module. 
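(Aside: the demote_size/demote attributes defined above let an administrator split free gigantic pages at runtime. A hedged sketch, assuming an x86-64 style setup with 1 GiB and 2 MiB hstates, at least one free 1 GiB page and CAP_SYS_ADMIN; setting demote_size explicitly is usually redundant, since it defaults to the next smaller supported size:)

#include <stdio.h>

#define HSTATE_1G "/sys/kernel/mm/hugepages/hugepages-1048576kB/"

static int write_knob(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);               /* sysfs write errors surface here */
}

int main(void)
{
        /* Optional: pick the 2 MiB hstate as the demotion target. */
        if (write_knob(HSTATE_1G "demote_size", "2048kB\n"))
                perror(HSTATE_1G "demote_size");

        /* Split one free 1 GiB page into 512 free 2 MiB pages. */
        if (write_knob(HSTATE_1G "demote", "1\n"))
                perror(HSTATE_1G "demote");

        return 0;
}

The demote_store() loop above re-reads the free page counts on every iteration precisely because demote_pool_huge_page() drops hugetlb_lock while it works.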
+ */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +static struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node devices + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static const struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. + * Returns node id via non-NULL nidp. + */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node device. + * No-op if no hstate attributes attached. + */ +void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) { + int idx = hstate_index(h); + struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; + + if (!hstate_kobj) + continue; + if (h->demote_order) + sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); + sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); + kobject_put(hstate_kobj); + nhs->hstate_kobjs[idx] = NULL; + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + + +/* + * Register hstate attributes for a single node device. + * No-op if attributes already registered. + */ +void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + int err; + + if (!hugetlb_sysfs_initialized) + return; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->dev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", + h->name, node->dev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. 
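(Aside: the node_hstate parallel array described above is what places the per-node subset of these attributes under each node device. A small reader for that per-node view, assuming the 2 MiB hstate; the probed node count of 4 is arbitrary and absent nodes are simply skipped:)

#include <stdio.h>

int main(void)
{
        for (int nid = 0; nid < 4; nid++) {
                char path[192];
                unsigned long nr = 0, nr_free = 0;
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/devices/system/node/node%d/hugepages/"
                         "hugepages-2048kB/nr_hugepages", nid);
                f = fopen(path, "r");
                if (!f)
                        continue;       /* node not present */
                if (fscanf(f, "%lu", &nr) != 1)
                        nr = 0;
                fclose(f);

                snprintf(path, sizeof(path),
                         "/sys/devices/system/node/node%d/hugepages/"
                         "hugepages-2048kB/free_hugepages", nid);
                f = fopen(path, "r");
                if (f) {
                        if (fscanf(f, "%lu", &nr_free) != 1)
                                nr_free = 0;
                        fclose(f);
                }
                printf("node %d: %lu huge pages, %lu free\n",
                       nid, nr, nr_free);
        }
        return 0;
}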
+ */ +static void __init hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_online_node(nid) + hugetlb_register_node(node_devices[nid]); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_register_all_nodes(void) { } + +#endif + +void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s\n", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba0fb1b6a5a8..9d01f883fd71 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -15,7 +15,8 @@ #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> -#include <asm/pgalloc.h> +#include <linux/pgalloc.h> + #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" @@ -75,7 +76,7 @@ static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to - * be treated as indepdenent small pages (as they can be freed + * be treated as independent small pages (as they can be freed * individually). */ if (!PageReserved(head)) @@ -684,7 +685,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, ret = hugetlb_vmemmap_split_folio(h, folio); /* - * Spliting the PMD requires allocating a page, thus lets fail + * Splitting the PMD requires allocating a page, thus let's fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. @@ -715,7 +716,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. - * This can occur in the case that both spliting fails + * This can occur in the case that both splitting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..89790def1bae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -15,7 +15,7 @@ #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> @@ -325,8 +325,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta - * @pte: The initial pte state; is_swap_pte(pte) must be true and - * non_swap_entry() must be false. 
+ * @pte: The initial pte state; must be a swap entry * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * @@ -335,7 +334,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); @@ -352,8 +351,7 @@ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. - * @pte: The initial pte state; is_swap_pte(pte) must be true and - * non_swap_entry() must be false. + * @pte: The initial pte state; must be a swap entry. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. @@ -382,21 +380,23 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); - VM_WARN_ON(!is_swap_pte(pte)); - VM_WARN_ON(non_swap_entry(entry)); + VM_WARN_ON(!softleaf_is_swap(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { + softleaf_t entry; + pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) + entry = softleaf_from_pte(pte); + if (lookup_swap_cgroup_id(entry) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; @@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift); + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) @@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void) static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return -EINVAL; } @@ -1378,6 +1378,26 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); +static inline bool vma_is_single_threaded_private(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return false; + + return atomic_read(&vma->vm_mm->mm_users) == 1; +} + +#ifdef CONFIG_NUMA_BALANCING +bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, + bool is_private_single_threaded); + +#else +static inline bool folio_can_map_prot_numa(struct folio *folio, + struct vm_area_struct *vma, bool is_private_single_threaded) +{ + return false; +} +#endif + int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); @@ -1402,7 +1422,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, 
pud_t *pud, bool write); -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* @@ -1534,7 +1554,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return false; /* @@ -1629,7 +1649,10 @@ static inline void accept_page(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ -int walk_page_range_mm(struct mm_struct *mm, unsigned long start, +int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); +int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); int walk_page_range_debug(struct mm_struct *mm, unsigned long start, @@ -1657,4 +1680,26 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + +static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc, + unsigned long orig_pfn, unsigned long size) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + + return remap_pfn_range_prepare(desc, pfn); +} + +static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, unsigned long size, + pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range_complete(vma, addr, pfn, size, prot); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..1d27f1bd260b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_enabled()) - return false; - if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; @@ -520,24 +517,20 @@ void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) { - struct folio *folio = virt_to_folio(ptr); + struct page *page = virt_to_page(ptr); struct slab *slab; - /* - * This function can be called for large kmalloc allocation that get - * their memory from page_alloc. Thus, the folio might not be a slab. 
- */ - if (unlikely(!folio_test_slab(folio))) { + if (unlikely(PageLargeKmalloc(page))) { if (check_page_allocation(ptr, ip)) return false; - kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); + kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false); return true; } if (is_kfence_address(ptr)) return true; - slab = folio_slab(folio); + slab = page_slab(page); if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b413c46b3e04..2b8e73f5f6a7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_enabled()) - return; - /* Check if free meta is valid. */ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; @@ -573,7 +570,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_save_track(&alloc_meta->alloc_track, flags); } -void __kasan_save_free_info(struct kmem_cache *cache, void *object) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 07fa7375a848..fc9169a54766 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -399,12 +399,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack); void kasan_save_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void __kasan_save_free_info(struct kmem_cache *cache, void *object); -static inline void kasan_save_free_info(struct kmem_cache *cache, void *object) -{ - if (kasan_enabled()) - __kasan_save_free_info(cache, object); -} +void kasan_save_free_info(struct kmem_cache *cache, void *object); #ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 5d2a876035d6..29a751a8a08d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) +static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; @@ -377,18 +377,10 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); ___free_pages_bulk(data.pages, nr_pages); if (ret) @@ -403,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t 
gfp_mask) { unsigned long shadow_start, shadow_end; int ret; - if (!kasan_enabled()) - return 0; - if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -432,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); + ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask); if (ret) return ret; @@ -574,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * pages entirely covered by the free region, we will not run in to any * trouble - any simultaneous allocations will be for disjoint regions. */ -void kasan_release_vmalloc(unsigned long start, unsigned long end, +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) @@ -583,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_enabled()) - return; - region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -634,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ - if (!kasan_enabled()) - return (void *)start; - if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -659,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_enabled()) - return; - if (!is_vmalloc_or_module_addr(start)) return; diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index b9f31293622b..d65d48b85f90 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) save_stack_info(cache, object, flags, false); } -void __kasan_save_free_info(struct kmem_cache *cache, void *object) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { save_stack_info(cache, object, 0, true); } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 727c20c94ac5..577a1699c553 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -26,6 +26,7 @@ #include <linux/panic_notifier.h> #include <linux/random.h> #include <linux/rcupdate.h> +#include <linux/reboot.h> #include <linux/sched/clock.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -612,14 +613,15 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. 
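The kasan/shadow.c hunk above swaps the open-coded GFP scope handling for the new memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers. For readers unfamiliar with the pattern, here is a minimal sketch of the long-standing scope API from <linux/sched/mm.h> that the removed lines used directly; the wrapper function and its name are illustrative only, not part of the patch.

#include <linux/gfp.h>
#include <linux/sched/mm.h>

/*
 * Make nested allocations (such as page-table allocations that ignore the
 * caller's gfp mask) respect GFP_NOFS/GFP_NOIO for the duration of a call.
 * This mirrors the logic removed from __kasan_populate_vmalloc_do().
 */
static int run_with_gfp_scope(gfp_t gfp_mask, int (*fn)(void *), void *arg)
{
	bool nofs = (gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO;
	bool noio = (gfp_mask & (__GFP_FS | __GFP_IO)) == 0;
	unsigned int flags = 0;
	int ret;

	if (nofs)
		flags = memalloc_nofs_save();
	else if (noio)
		flags = memalloc_noio_save();

	ret = fn(arg);

	if (nofs)
		memalloc_nofs_restore(flags);
	else if (noio)
		memalloc_noio_restore(flags);

	return ret;
}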
*/ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); - __folio_set_slab(slab_folio(slab)); + page = pfn_to_page(start_pfn + i); + __SetPageSlab(page); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif @@ -665,16 +667,17 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab; + struct page *page; if (!i || (i % 2)) continue; - slab = page_slab(pfn_to_page(start_pfn + i)); + page = pfn_to_page(start_pfn + i); #ifdef CONFIG_MEMCG + struct slab *slab = page_slab(page); slab->obj_exts = 0; #endif - __folio_clear_slab(slab_folio(slab)); + __ClearPageSlab(page); } return addr; @@ -820,6 +823,25 @@ static struct notifier_block kfence_check_canary_notifier = { static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS +static int kfence_reboot_callback(struct notifier_block *nb, + unsigned long action, void *data) +{ + /* + * Disable kfence to avoid static keys IPI synchronization during + * late shutdown/kexec + */ + WRITE_ONCE(kfence_enabled, false); + /* Cancel any pending timer work */ + cancel_delayed_work_sync(&kfence_timer); + + return NOTIFY_OK; +} + +static struct notifier_block kfence_reboot_notifier = { + .notifier_call = kfence_reboot_callback, + .priority = INT_MAX, /* Run early to stop timers ASAP */ +}; + /* Wait queue to wake up allocation-gate timer task. */ static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); @@ -901,6 +923,10 @@ static void kfence_init_enable(void) if (kfence_check_on_panic) atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); +#ifdef CONFIG_KFENCE_STATIC_KEYS + register_reboot_notifier(&kfence_reboot_notifier); +#endif + WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index abe54f0043c7..97d1b2824386 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -17,21 +17,20 @@ #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/dax.h> #include <linux/ksm.h> +#include <linux/pgalloc.h> #include <asm/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, - SCAN_PMD_NULL, - SCAN_PMD_NONE, + SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, @@ -67,7 +66,7 @@ enum scan_result { static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); -/* default scan 8*512 pte (or vmas) every 30 second */ +/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; @@ -129,9 +128,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } -static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t __sleep_millisecs_store(const char *buf, size_t count, + unsigned int *millisecs) { unsigned int msecs; int err; @@ -140,12 +138,19 @@ static ssize_t 
scan_sleep_millisecs_store(struct kobject *kobj, if (err) return -EINVAL; - khugepaged_scan_sleep_millisecs = msecs; + *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); +} static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); @@ -160,18 +165,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned int msecs; - int err; - - err = kstrtouint(buf, 10, &msecs); - if (err) - return -EINVAL; - - khugepaged_alloc_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; + return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); @@ -337,6 +331,13 @@ struct attribute_group khugepaged_attr_group = { }; #endif /* CONFIG_SYSFS */ +static bool pte_none_or_zero(pte_t pte) +{ + if (pte_none(pte)) + return true; + return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); +} + int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { @@ -518,6 +519,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, if (pte_none(pteval)) continue; + VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; @@ -548,8 +550,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (pte_none(pteval) || (pte_present(pteval) && - is_zero_pfn(pte_pfn(pteval)))) { + if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || @@ -690,17 +691,17 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, address += nr_ptes * PAGE_SIZE) { nr_ptes = 1; pteval = ptep_get(_pte); - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { - /* - * ptl mostly unnecessary. - */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); - ksm_might_unmap_zero_page(vma->vm_mm, pteval); - } + if (pte_none(pteval)) + continue; + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + ptep_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); @@ -794,7 +795,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } @@ -932,21 +933,21 @@ static inline int check_pmd_state(pmd_t *pmd) pmd_t pmde = pmdp_get_lockless(pmd); if (pmd_none(pmde)) - return SCAN_PMD_NONE; + return SCAN_NO_PTE_TABLE; /* * The folio may be under migration when khugepaged is trying to * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. 
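The sysfs refactor above merges the scan_sleep_millisecs and alloc_sleep_millisecs store handlers into a shared __sleep_millisecs_store(); the user-visible knobs are unchanged. A small user-space sketch for tuning the scan interval (standard path from Documentation/admin-guide/mm/transhuge.rst, root required):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Set how long khugepaged sleeps between scan passes, in milliseconds. */
static int set_khugepaged_scan_sleep(unsigned int msecs)
{
	const char *path =
		"/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
	char buf[32];
	int fd, len;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return -1;
	}

	len = snprintf(buf, sizeof(buf), "%u\n", msecs);
	if (write(fd, buf, len) != len) {
		perror("write");
		close(fd);
		return -1;
	}

	close(fd);
	return 0;
}

int main(void)
{
	return set_khugepaged_scan_sleep(10000) ? 1 : 0;	/* 10 seconds */
}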
*/ - if (is_pmd_migration_entry(pmde)) + if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } @@ -956,7 +957,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, { *pmd = mm_find_pmd(mm, address); if (!*pmd) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; return check_pmd_state(*pmd); } @@ -1011,13 +1012,14 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); - if (!is_swap_pte(vmf.orig_pte)) + if (pte_none(vmf.orig_pte) || + pte_present(vmf.orig_pte)) continue; vmf.pte = pte; @@ -1184,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, &compound_pagelist); spin_unlock(pte_ptl); } else { - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { @@ -1224,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = folio_mk_pmd(folio, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - deferred_split_folio(folio, false); + map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; @@ -1274,14 +1269,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (is_swap_pte(pteval)) { + if (pte_none_or_zero(pteval)) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + goto out_unmap; + } + } + if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { @@ -1301,18 +1308,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - ++none_or_zero; - if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - goto out_unmap; - } - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small @@ -1548,8 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, switch (result) { case SCAN_SUCCEED: break; - case SCAN_PMD_NULL: - case SCAN_PMD_NONE: + case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. @@ -1715,6 +1709,43 @@ drop_folio: return result; } +/* Can we retract page tables for this file-backed VMA? 
*/ +static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) +{ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. + */ + if (READ_ONCE(vma->anon_vma)) + return false; + + /* + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. + */ + if (userfaultfd_wp(vma)) + return false; + + /* + * If the VMA contains guard regions then we can't collapse it. + * + * This is set atomically on guard marker installation under mmap/VMA + * read lock, and here we may not hold any VMA or mmap lock at all. + * + * This is therefore serialised on the PTE page table lock, which is + * obtained on guard region installation after the flag is set, so this + * check being performed under this lock excludes races. + */ + if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) + return false; + + return true; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1729,14 +1760,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl; bool success = false; - /* - * Check vma->anon_vma to exclude MAP_PRIVATE mappings that - * got written to. These VMAs are likely not worth removing - * page tables from, as PMD-mapping is likely to be split later. - */ - if (READ_ONCE(vma->anon_vma)) - continue; - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) @@ -1748,14 +1771,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (hpage_collapse_test_exit(mm)) continue; - /* - * When a vma is registered with uffd-wp, we cannot recycle - * the page table because there may be pte markers installed. - * Other vmas can still have the same file mapped hugely, but - * skip this one: it will always be mapped in small page size - * for uffd-wp registered ranges. - */ - if (userfaultfd_wp(vma)) + + if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ @@ -1782,15 +1799,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* - * Huge page lock is still held, so normally the page table - * must remain empty; and we have already skipped anon_vma - * and userfaultfd_wp() vmas. But since the mmap_lock is not - * held, it is still possible for a racing userfaultfd_ioctl() - * to have inserted ptes or markers. Now that we hold ptlock, - * repeating the anon_vma check protects from one category, - * and repeating the userfaultfd_wp() check from another. + * Huge page lock is still held, so normally the page table must + * remain empty; and we have already skipped anon_vma and + * userfaultfd_wp() vmas. But since the mmap_lock is not held, + * it is still possible for a racing userfaultfd_ioctl() or + * madvise() to have inserted ptes or markers. Now that we hold + * ptlock, repeating the retractable checks protects us from + * races against the prior checks. 
*/ - if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { + if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; @@ -2178,14 +2195,14 @@ immap_locked: } if (is_shmem) - __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else - __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { - __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); + lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); + lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* @@ -2784,8 +2801,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); - memset(cc->node_load, 0, sizeof(cc->node_load)); - nodes_clear(cc->alloc_nmask); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); @@ -2815,7 +2830,7 @@ handle_result: mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ - case SCAN_PMD_NULL: + case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 8bca7fece47f..90f427b95a21 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -33,7 +33,7 @@ bool kmsan_enabled __read_mostly; /* * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is - * unavaliable. + * unavailable. */ DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); @@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); - /* Don't sleep. 
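madvise_collapse() above is the kernel-side backend of MADV_COLLAPSE (available since Linux 6.1). A user-space sketch requesting a synchronous collapse of one PMD-sized anonymous region; the fallback #define is only for libc headers that predate the flag, and 2 MiB is assumed as the PMD size (x86-64).

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* uapi value, for older headers */
#endif

int main(void)
{
	size_t pmd_size = 2UL << 20;
	char *map, *aligned;

	/* Over-allocate so one fully aligned PMD-sized block fits inside. */
	map = mmap(NULL, 2 * pmd_size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		return 1;

	aligned = (char *)(((uintptr_t)map + pmd_size - 1) & ~(pmd_size - 1));
	memset(aligned, 0x5a, pmd_size);	/* populate with real pages */

	/* Best-effort: fails e.g. when THP is disabled or memory is fragmented. */
	if (madvise(aligned, pmd_size, MADV_COLLAPSE))
		perror("MADV_COLLAPSE");

	munmap(map, 2 * pmd_size);
	return 0;
}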
*/ - flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM); - handle = stack_depot_save(entries, nr_entries, flags); return stack_depot_set_extra_bits(handle, extra); } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 2cee59d89c80..8f22d1f22981 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object) if (s->ctor) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + kmsan_internal_poison_memory(object, s->object_size, + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -114,7 +115,8 @@ void kmsan_kfree_large(const void *ptr) kmsan_enter_runtime(); page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); - kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, + kmsan_internal_poison_memory((void *)ptr, page_size(page), + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 54f3c3c962f0..e7f554a31bb4 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -208,14 +208,14 @@ void kmsan_free_page(struct page *page, unsigned int order) return; kmsan_enter_runtime(); kmsan_internal_poison_memory(page_address(page), page_size(page), - GFP_KERNEL, + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift) + unsigned int page_shift, gfp_t gfp_mask) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; @@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, return 0; nr = (end - start) / PAGE_SIZE; - s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); - o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask); + o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask); if (!s_pages || !o_pages) { err = -ENOMEM; goto ret; @@ -389,7 +389,7 @@ static unsigned long ewma(unsigned long prev, unsigned long curr) * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * - * new_pages_to_scan *= change facor + * new_pages_to_scan *= change factor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new @@ -607,7 +607,76 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + unsigned long *found_addr = (unsigned long *) walk->private; + struct mm_struct *mm = walk->mm; + pte_t *start_ptep, *ptep; + spinlock_t *ptl; + int found = 0; + + if (ksm_test_exit(walk->mm)) + return 0; + if (signal_pending(current)) + return -ERESTARTSYS; + + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!start_ptep) + return 0; + + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { + pte_t pte = ptep_get(ptep); + struct folio *folio = NULL; + + if (pte_present(pte)) { + folio = vm_normal_folio(walk->vma, addr, pte); + } else if (!pte_none(pte)) { + const softleaf_t entry = softleaf_from_pte(pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. 
+ */ + if (softleaf_is_migration(entry)) + folio = softleaf_to_folio(entry); + } + /* return 1 if the page is an normal ksm page or KSM-placed zero page */ + found = (folio && folio_test_ksm(folio)) || + (pte_present(pte) && is_ksm_zero_pte(pte)); + if (found) { + *found_addr = addr; + goto out_unlock; + } + } +out_unlock: + pte_unmap_unlock(ptep, ptl); + return found; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, + .walk_lock = PGWALK_RDLOCK, +}; + +static const struct mm_walk_ops break_ksm_lock_vma_ops = { + .pmd_entry = break_ksm_pmd_entry, + .walk_lock = PGWALK_WRLOCK, +}; + /* + * Though it's very tempting to unmerge rmap_items from stable tree rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + * * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * @@ -620,31 +689,20 @@ static inline bool ksm_test_exit(struct mm_struct *mm) * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, bool lock_vma) { vm_fault_t ret = 0; - - if (lock_vma) - vma_start_write(vma); + const struct mm_walk_ops *ops = lock_vma ? + &break_ksm_lock_vma_ops : &break_ksm_ops; do { - bool ksm_page = false; - struct folio_walk fw; - struct folio *folio; + int ksm_page; cond_resched(); - folio = folio_walk_start(&fw, vma, addr, - FW_MIGRATION | FW_ZEROPAGE); - if (folio) { - /* Small folio implies FW_LEVEL_PTE. */ - if (!folio_test_large(folio) && - (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) - ksm_page = true; - folio_walk_end(&fw, vma); - } - - if (!ksm_page) - return 0; + ksm_page = walk_page_range_vma(vma, addr, end, ops, &addr); + if (ksm_page <= 0) + return ksm_page; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); @@ -730,7 +788,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr, false); + break_ksm(vma, addr, addr + PAGE_SIZE, false); mmap_read_unlock(mm); } @@ -1025,36 +1083,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) } } -/* - * Though it's very tempting to unmerge rmap_items from stable tree rather - * than check every pte of a given vma, the locking doesn't quite work for - * that - an rmap_item is assigned to the stable tree after inserting ksm - * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing - * rmap_items from parent to child at fork time (so as not to waste time - * if exit comes before the next scan reaches it). 
- * - * Similarly, although we'd like to remove rmap_items (so updating counts - * and freeing memory) when unmerging an area, it's easier to leave that - * to the next pass of ksmd - consider, for example, how ksmd might be - * in cmp_and_merge_page on one of the rmap_items we would be removing. - */ -static int unmerge_ksm_pages(struct vm_area_struct *vma, - unsigned long start, unsigned long end, bool lock_vma) -{ - unsigned long addr; - int err = 0; - - for (addr = start; addr < end && !err; addr += PAGE_SIZE) { - if (ksm_test_exit(vma->vm_mm)) - break; - if (signal_pending(current)) - err = -ERESTARTSYS; - else - err = break_ksm(vma, addr, lock_vma); - } - return err; -} - static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { @@ -1192,8 +1220,7 @@ static int unmerge_and_remove_all_rmap_items(void) for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; - err = unmerge_ksm_pages(vma, - vma->vm_start, vma->vm_end, false); + err = break_ksm(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } @@ -2455,6 +2482,95 @@ static bool should_skip_rmap_item(struct folio *folio, return true; } +struct ksm_next_page_arg { + struct folio *folio; + struct page *page; + unsigned long addr; +}; + +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct ksm_next_page_arg *private = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t *start_ptep = NULL, *ptep, pte; + struct mm_struct *mm = walk->mm; + struct folio *folio; + struct page *page; + spinlock_t *ptl; + pmd_t pmd; + + if (ksm_test_exit(mm)) + return 0; + + cond_resched(); + + pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + return 0; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { + ptl = pmd_lock(mm, pmdp); + pmd = pmdp_get(pmdp); + + if (!pmd_present(pmd)) { + goto not_found_unlock; + } else if (pmd_leaf(pmd)) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + goto not_found_unlock; + folio = page_folio(page); + + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) + goto not_found_unlock; + + page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); + goto found_unlock; + } + spin_unlock(ptl); + } + + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!start_ptep) + return 0; + + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { + pte = ptep_get(ptep); + + if (!pte_present(pte)) + continue; + + page = vm_normal_page(vma, addr, pte); + if (!page) + continue; + folio = page_folio(page); + + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) + continue; + goto found_unlock; + } + +not_found_unlock: + spin_unlock(ptl); + if (start_ptep) + pte_unmap(start_ptep); + return 0; +found_unlock: + folio_get(folio); + spin_unlock(ptl); + if (start_ptep) + pte_unmap(start_ptep); + private->page = page; + private->folio = folio; + private->addr = addr; + return 1; +} + +static struct mm_walk_ops ksm_next_page_ops = { + .pmd_entry = ksm_next_page_pmd_entry, + .walk_lock = PGWALK_RDLOCK, +}; + static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; @@ -2542,21 +2658,27 @@ next_mm: ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { + struct ksm_next_page_arg ksm_next_page_arg; struct page *tmp_page = NULL; - struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); - if (folio) { - if 
(!folio_is_zone_device(folio) && - folio_test_anon(folio)) { - folio_get(folio); - tmp_page = fw.page; - } - folio_walk_end(&fw, vma); + int found; + + found = walk_page_range_vma(vma, ksm_scan.address, + vma->vm_end, + &ksm_next_page_ops, + &ksm_next_page_arg); + + if (found > 0) { + folio = ksm_next_page_arg.folio; + tmp_page = ksm_next_page_arg.page; + ksm_scan.address = ksm_next_page_arg.addr; + } else { + VM_WARN_ON_ONCE(found < 0); + ksm_scan.address = vma->vm_end - PAGE_SIZE; } if (tmp_page) { @@ -2617,8 +2739,14 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); + /* + * Only clear MMF_VM_MERGEABLE. We must not clear + * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process, + * perhaps their mm_struct has just been added to ksm_mm_slot + * list, and its process has not yet officially started running + * or has not yet performed mmap/brk to allocate anonymous VMAS. + */ mm_flags_clear(MMF_VM_MERGEABLE, mm); - mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2719,7 +2847,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) return 0; if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); + err = break_ksm(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } @@ -2736,12 +2864,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * Returns: @vm_flags possibly updated to mark mergeable. */ -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) + __ksm_should_add_vma(file, vm_flags)) { vm_flags |= VM_MERGEABLE; + /* + * Generally, the flags here always include MMF_VM_MERGEABLE. + * However, in rare cases, this flag may be cleared by ksmd who + * scans a cycle without finding any mergeable vma. + */ + if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm))) + __ksm_enter(mm); + } return vm_flags; } @@ -2863,7 +2999,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; /* just ignore the advice */ if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end, true); + err = break_ksm(vma, start, end, true); if (err) return err; } @@ -3245,7 +3381,7 @@ static int ksm_memory_callback(struct notifier_block *self, * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. - * But unmerge_ksm_pages(), rmap lookups and other entry points + * But break_ksm(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. 
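ksm_vma_flags() above services the process-wide MMF_VM_MERGE_ANY mode that prctl(PR_SET_MEMORY_MERGE) enables, alongside the older per-range MADV_MERGEABLE opt-in handled by ksm_madvise(). A user-space sketch of both; the prctl needs Linux 6.4+ and the fallback #define only covers older headers.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67	/* uapi value, for older headers */
#endif

int main(void)
{
	size_t len = 16UL << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, len);	/* identical pages are merge candidates */

	/* Per-range opt-in ... */
	if (madvise(p, len, MADV_MERGEABLE))
		perror("MADV_MERGEABLE");

	/* ... or process-wide opt-in for all current and future eligible VMAs. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");

	pause();	/* let ksmd scan; progress is visible under /sys/kernel/mm/ksm/ */
	return 0;
}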
*/ mutex_lock(&ksm_thread_mutex); diff --git a/mm/madvise.c b/mm/madvise.c index fb1c86e630b6..b617b1be0f53 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -29,7 +29,7 @@ #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> @@ -167,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags, range->start, range->end, anon_name); else vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, - range->start, range->end, new_flags); + range->start, range->end, &new_flags); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -195,7 +195,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; - swp_entry_t entry; + softleaf_t entry; struct folio *folio; if (!ptep++) { @@ -205,10 +205,8 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, } pte = ptep_get(ptep); - if (!is_swap_pte(pte)) - continue; - entry = pte_to_swp_entry(pte); - if (unlikely(non_swap_entry(entry))) + entry = softleaf_from_pte(pte); + if (unlikely(!softleaf_is_swap(entry))) continue; pte_unmap_unlock(ptep, ptl); @@ -251,7 +249,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma, continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(entry)) + if (!softleaf_is_swap(entry)) continue; addr = vma->vm_start + @@ -392,7 +390,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto huge_unlock; } @@ -690,17 +688,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * (page allocation + zeroing). 
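madvise_free_pte_range() above implements MADV_FREE: clean anonymous pages in the range stay mapped but may be reclaimed lazily until they are written to again. A short user-space illustration:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8UL << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0x42, len);	/* fault in the anonymous pages */

	/* Mark the range lazily freeable; contents may be dropped under pressure. */
	if (madvise(p, len, MADV_FREE))
		perror("MADV_FREE");

	p[0] = 0x43;	/* a write cancels the lazy free for that page only */
	printf("first byte: %#x\n", p[0]);

	munmap(p, len);
	return 0;
}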
*/ if (!pte_present(ptent)) { - swp_entry_t entry; + softleaf_t entry = softleaf_from_pte(ptent); - entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) { + if (softleaf_is_swap(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; @@ -1071,8 +1068,9 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) static bool is_guard_pte_marker(pte_t ptent) { - return is_swap_pte(ptent) && - is_guard_swp_entry(pte_to_swp_entry(ptent)); + const softleaf_t entry = softleaf_from_pte(ptent); + + return softleaf_is_guard_marker(entry); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, @@ -1122,18 +1120,17 @@ static int guard_install_set_pte(unsigned long addr, unsigned long next, return 0; } -static const struct mm_walk_ops guard_install_walk_ops = { - .pud_entry = guard_install_pud_entry, - .pmd_entry = guard_install_pmd_entry, - .pte_entry = guard_install_pte_entry, - .install_pte = guard_install_set_pte, - .walk_lock = PGWALK_RDLOCK, -}; - static long madvise_guard_install(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; + struct mm_walk_ops walk_ops = { + .pud_entry = guard_install_pud_entry, + .pmd_entry = guard_install_pmd_entry, + .pte_entry = guard_install_pte_entry, + .install_pte = guard_install_set_pte, + .walk_lock = get_walk_lock(madv_behavior->lock_mode), + }; long err; int i; @@ -1141,24 +1138,38 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) return -EINVAL; /* - * If we install guard markers, then the range is no longer - * empty from a page table perspective and therefore it's - * appropriate to have an anon_vma. + * Set atomically under read lock. All pertinent readers will need to + * acquire an mmap/VMA write lock to read it. All remaining readers may + * or may not see the flag set, but we don't care. + */ + vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT); + + /* + * If anonymous and we are establishing page tables the VMA ought to + * have an anon_vma associated with it. * - * This ensures that on fork, we copy page tables correctly. + * We will hold an mmap read lock if this is necessary, this is checked + * as part of the VMA lock logic. */ - err = anon_vma_prepare(vma); - if (err) - return err; + if (vma_is_anonymous(vma)) { + VM_WARN_ON_ONCE(!vma->anon_vma && + madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK); + + err = anon_vma_prepare(vma); + if (err) + return err; + } /* * Optimistically try to install the guard marker pages first. If any - * non-guard pages are encountered, give up and zap the range before - * trying again. + * non-guard pages or THP huge pages are encountered, give up and zap + * the range before trying again. * * We try a few times before giving up and releasing back to userland to - * loop around, releasing locks in the process to avoid contention. This - * would only happen if there was a great many racing page faults. + * loop around, releasing locks in the process to avoid contention. + * + * This would only happen due to races with e.g. page faults or + * khugepaged. 
* * In most cases we should simply install the guard markers immediately * with no zap or looping. @@ -1167,8 +1178,13 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ - err = walk_page_range_mm(vma->vm_mm, range->start, range->end, - &guard_install_walk_ops, &nr_pages); + if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) + err = walk_page_range_vma_unsafe(madv_behavior->vma, + range->start, range->end, &walk_ops, + &nr_pages); + else + err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, + range->end, &walk_ops, &nr_pages); if (err < 0) return err; @@ -1189,8 +1205,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) } /* - * We were unable to install the guard pages due to being raced by page - * faults. This should not happen ordinarily. We return to userspace and + * We were unable to install the guard pages, return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); @@ -1234,17 +1249,16 @@ static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, return 0; } -static const struct mm_walk_ops guard_remove_walk_ops = { - .pud_entry = guard_remove_pud_entry, - .pmd_entry = guard_remove_pmd_entry, - .pte_entry = guard_remove_pte_entry, - .walk_lock = PGWALK_RDLOCK, -}; - static long madvise_guard_remove(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; + struct mm_walk_ops wallk_ops = { + .pud_entry = guard_remove_pud_entry, + .pmd_entry = guard_remove_pmd_entry, + .pte_entry = guard_remove_pte_entry, + .walk_lock = get_walk_lock(madv_behavior->lock_mode), + }; /* * We're ok with removing guards in mlock()'d ranges, as this is a @@ -1254,7 +1268,7 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior) return -EINVAL; return walk_page_range_vma(vma, range->start, range->end, - &guard_remove_walk_ops, NULL); + &wallk_ops, NULL); } #ifdef CONFIG_64BIT @@ -1567,6 +1581,47 @@ static bool process_madvise_remote_valid(int behavior) } } +/* Does this operation invoke anon_vma_prepare()? */ +static bool prepares_anon_vma(int behavior) +{ + switch (behavior) { + case MADV_GUARD_INSTALL: + return true; + default: + return false; + } +} + +/* + * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA + * read lock only now we have a VMA to examine? + */ +static bool is_vma_lock_sufficient(struct vm_area_struct *vma, + struct madvise_behavior *madv_behavior) +{ + /* Must span only a single VMA.*/ + if (madv_behavior->range.end > vma->vm_end) + return false; + /* Remote processes unsupported. */ + if (current->mm != vma->vm_mm) + return false; + /* Userfaultfd unsupported. */ + if (userfaultfd_armed(vma)) + return false; + /* + * anon_vma_prepare() explicitly requires an mmap lock for + * serialisation, so we cannot use a VMA lock in this case. + * + * Note we might race with anon_vma being set, however this makes this + * check overly paranoid which is safe. + */ + if (vma_is_anonymous(vma) && + prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma) + return false; + + return true; +} + /* * Try to acquire a VMA read lock if possible. 
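madvise_guard_install() and madvise_guard_remove() above back MADV_GUARD_INSTALL and MADV_GUARD_REMOVE (merged in Linux 6.13). A user-space sketch that plants a lightweight guard page inside an anonymous mapping; the fallback #defines are only for libc headers that predate the flags.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* uapi values, for older headers */
#define MADV_GUARD_REMOVE  103
#endif

int main(void)
{
	size_t page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Turn the second page into a guard region: touching it raises SIGSEGV. */
	if (madvise(buf + page, page, MADV_GUARD_INSTALL))
		perror("MADV_GUARD_INSTALL");

	strcpy(buf, "first page still usable");

	/* Drop the marker again; the page then reads back as zeroes. */
	if (madvise(buf + page, page, MADV_GUARD_REMOVE))
		perror("MADV_GUARD_REMOVE");

	printf("%s, guard page byte after removal: %d\n", buf, buf[page]);
	munmap(buf, 4 * page);
	return 0;
}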
* @@ -1588,15 +1643,12 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) vma = lock_vma_under_rcu(mm, madv_behavior->range.start); if (!vma) goto take_mmap_read_lock; - /* - * Must span only a single VMA; uffd and remote processes are - * unsupported. - */ - if (madv_behavior->range.end > vma->vm_end || current->mm != mm || - userfaultfd_armed(vma)) { + + if (!is_vma_lock_sufficient(vma, madv_behavior)) { vma_end_read(vma); goto take_mmap_read_lock; } + madv_behavior->vma = vma; return true; @@ -1709,9 +1761,9 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: + return MADVISE_MMAP_READ_LOCK; case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: - return MADVISE_MMAP_READ_LOCK; case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index c193de6cb23a..737c407f4081 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -149,7 +149,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - pud_t pudval = READ_ONCE(*pud); + pud_t pudval = pudp_get(pud); /* Do not split a huge pud */ if (pud_trans_huge(pudval)) { diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..f0f2dc66e9a2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1826,7 +1826,8 @@ phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int n */ unsigned long __init memblock_estimated_nr_free_pages(void) { - return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); + return PHYS_PFN(memblock_phys_mem_size() - + memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, NUMA_NO_NODE)); } /* lowest address */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f4..be810c1fbfc3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +EXPORT_SYMBOL(root_mem_cgroup); /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -756,7 +757,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, } /** - * __mod_lruvec_state - update lruvec memory statistics + * mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec * @idx: the stat item * @val: delta to add to the counter, can be negative @@ -765,18 +766,18 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, * function updates the all three counters that are affected by a * change of state at this level: per-node, per-cgroup, per-lruvec. */ -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { /* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) mod_memcg_lruvec_state(lruvec, idx, val); } -void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { struct mem_cgroup *memcg; @@ -788,17 +789,17 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, /* Untracked pages have no memcg, no lruvec. 
Update only the node */ if (!memcg) { rcu_read_unlock(); - __mod_node_page_state(pgdat, idx, val); + mod_node_page_state(pgdat, idx, val); return; } lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_lruvec_state(lruvec, idx, val); + mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } -EXPORT_SYMBOL(__lruvec_stat_mod_folio); +EXPORT_SYMBOL(lruvec_stat_mod_folio); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; @@ -814,10 +815,10 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) * vmstats to keep it correct for the root memcg. */ if (!memcg) { - __mod_node_page_state(pgdat, idx, val); + mod_node_page_state(pgdat, idx, val); } else { lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_lruvec_state(lruvec, idx, val); + mod_lruvec_state(lruvec, idx, val); } rcu_read_unlock(); } @@ -1625,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg) return page_counter_read(&memcg->memory); } +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning) +{ + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + + /* For now only MEMCG_MAX can happen with !allow_spinning context. */ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + + atomic_long_inc(&memcg->memory_events_local[event]); + if (!swap_event && allow_spinning) + cgroup_file_notify(&memcg->events_local_file); + + do { + atomic_long_inc(&memcg->memory_events[event]); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + break; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); +} +EXPORT_SYMBOL_GPL(__memcg_memory_event); + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { @@ -2557,38 +2589,25 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, } static __always_inline -struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) +struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) { /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in * slab->obj_exts. */ - if (folio_test_slab(folio)) { - struct slabobj_ext *obj_exts; - struct slab *slab; - unsigned int off; - - slab = folio_slab(folio); - obj_exts = slab_obj_exts(slab); - if (!obj_exts) - return NULL; - - off = obj_to_index(slab->slab_cache, slab, p); - if (obj_exts[off].objcg) - return obj_cgroup_memcg(obj_exts[off].objcg); + struct slabobj_ext *obj_exts; + unsigned int off; + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return NULL; - } - /* - * folio_memcg_check() is used here, because in theory we can encounter - * a folio where the slab flag has been cleared already, but - * slab->obj_exts has not been freed yet - * folio_memcg_check() will guarantee that a proper memory - * cgroup pointer or NULL will be returned. 
- */ - return folio_memcg_check(folio); + off = obj_to_index(slab->slab_cache, slab, p); + if (obj_exts[off].objcg) + return obj_cgroup_memcg(obj_exts[off].objcg); + + return NULL; } /* @@ -2602,10 +2621,15 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) */ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { + struct slab *slab; + if (mem_cgroup_disabled()) return NULL; - return mem_cgroup_from_obj_folio(virt_to_folio(p), p); + slab = virt_to_slab(p); + if (slab) + return mem_cgroup_from_obj_slab(slab, p); + return folio_memcg_check(virt_to_folio(p)); } static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) @@ -3888,6 +3912,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); memcg_offline_kmem(memcg); + reparent_deferred_split_queue(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); @@ -4463,6 +4488,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) atomic_long_read(&events[MEMCG_OOM_KILL])); seq_printf(m, "oom_group_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); + seq_printf(m, "sock_throttled %lu\n", + atomic_long_read(&events[MEMCG_SOCK_THROTTLED])); } static int memory_events_show(struct seq_file *m, void *v) @@ -5443,7 +5470,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) * @size: size of compressed object * * This forces the charge after obj_cgroup_may_zswap() allowed - * compression and storage in zwap for this cgroup to go ahead. + * compression and storage in zswap for this cgroup to go ahead. */ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { @@ -5601,3 +5628,16 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; } + +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + if (!memcg) + memcg = root_mem_cgroup; + + pr_warn("Memory cgroup min protection %lukB -- low protection %lukB", + K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE), + K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE)); +} diff --git a/mm/memfd.c b/mm/memfd.c index 1d109c1acf21..ab5312aff14b 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -96,9 +96,36 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) NULL, gfp_mask); if (folio) { + u32 hash; + + /* + * Zero the folio to prevent information leaks to userspace. + * Use folio_zero_user() which is optimized for huge/gigantic + * pages. Pass 0 as addr_hint since this is not a faulting path + * and we don't have a user virtual address yet. + */ + folio_zero_user(folio, 0); + + /* + * Mark the folio uptodate before adding to page cache, + * as required by filemap.c and other hugetlb paths. + */ + __folio_mark_uptodate(folio); + + /* + * Serialize hugepage allocation and instantiation to prevent + * races with concurrent allocations, as required by all other + * callers of hugetlb_add_to_page_cache(). 
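The memfd_alloc_folio() hunk above deals with hugetlb folios being added to a memfd's page cache; from user space such hugetlb-backed memfds come from memfd_create(MFD_HUGETLB). A minimal sketch using the default hugepage size (a reserved hugepage must be available, e.g. via /proc/sys/vm/nr_hugepages):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("frame-pool", MFD_CLOEXEC | MFD_HUGETLB);
	size_t len = 2UL << 20;	/* assumes a 2 MiB default hugepage size */
	char *p;

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, len)) {
		perror("ftruncate");
		return 1;
	}

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* typically ENOMEM without reserved hugepages */
		return 1;
	}

	strcpy(p, "hugetlb-backed memfd");
	printf("%s\n", p);
	return 0;
}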
+ */ + hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + if (err) { folio_put(folio); goto err_unresv; @@ -433,6 +460,8 @@ static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; + struct inode *inode; + int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, @@ -444,12 +473,20 @@ static struct file *alloc_file(const char *name, unsigned int flags) } if (IS_ERR(file)) return file; + + inode = file_inode(file); + err = security_inode_init_security_anon(inode, + &QSTR(MEMFD_ANON_NAME), NULL); + if (err) { + fput(file); + file = ERR_PTR(err); + return file; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { - struct inode *inode = file_inode(file); - inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { @@ -470,9 +507,9 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct file *file; - int fd, error; - char *name; + char *name __free(kfree) = NULL; + unsigned int fd_flags; + int error; error = sanitize_flags(&flags); if (error < 0) @@ -482,25 +519,6 @@ SYSCALL_DEFINE2(memfd_create, if (IS_ERR(name)) return PTR_ERR(name); - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_free_name; - } - - file = alloc_file(name, flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_free_fd; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_free_fd: - put_unused_fd(fd); -err_free_name: - kfree(name); - return error; + fd_flags = (flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0; + return FD_ADD(fd_flags, alloc_file(name, flags)); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3edebb0cda30..fbc5a01260c8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/memory-failure.h> #include <linux/page-flags.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> @@ -50,7 +51,7 @@ #include <linux/backing-dev.h> #include <linux/migrate.h> #include <linux/slab.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/hugetlb.h> #include <linux/memory_hotplug.h> #include <linux/mm_inline.h> @@ -60,9 +61,12 @@ #include <linux/pagewalk.h> #include <linux/shmem_fs.h> #include <linux/sysctl.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/memory-failure.h> + #include "swap.h" #include "internal.h" -#include "ras/ras_event.h" static int sysctl_memory_failure_early_kill __read_mostly; @@ -154,6 +158,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -688,10 +696,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, if (pte_present(pte)) { pfn = pte_pfn(pte); } else { - swp_entry_t swp = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - if (is_hwpoison_entry(swp)) - pfn = swp_offset_pfn(swp); + if (softleaf_is_hwpoison(entry)) + pfn = softleaf_to_pfn(entry); } if (!pfn || pfn != poisoned_pfn) @@ -885,6 +893,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1277,7 +1286,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); update_per_node_mf_stats(pfn, result); } @@ -1653,12 +1662,13 @@ static int identify_page_state(unsigned long pfn, struct page *p, * there is still more to do, hence the page refcount we took earlier * is still needed. 
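The memory-failure changes above are easiest to exercise through the existing injection interface: with CONFIG_MEMORY_FAILURE enabled and CAP_SYS_ADMIN, madvise(MADV_HWPOISON) hands a page to memory_failure() as if hardware had reported an error. A sketch (exactly when SIGBUS is delivered depends on the early-kill policy):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, page);

	/* Privileged test hook: the page is handled like a real memory error. */
	if (madvise(p, page, MADV_HWPOISON)) {
		perror("MADV_HWPOISON");	/* EPERM without CAP_SYS_ADMIN */
		return 1;
	}

	/* With the default (late kill) policy, the next access raises SIGBUS. */
	p[0] = 1;
	return 0;
}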
*/ -static int try_to_split_thp_page(struct page *page, bool release) +static int try_to_split_thp_page(struct page *page, unsigned int new_order, + bool release) { int ret; lock_page(page); - ret = split_huge_page(page); + ret = split_huge_page_to_order(page, new_order); unlock_page(page); if (ret && release) @@ -2140,8 +2150,140 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, { LIST_HEAD(tokill); + folio_lock(folio); collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); + folio_unlock(folio); + + kill_procs(&tokill, true, pfn, flags); +} + +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + return -EBUSY; + + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static void add_to_kill_pfn(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + unsigned long pfn) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pfn, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pfn, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct address_space *mapping, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { + if (vma->vm_mm == t->mm) + add_to_kill_pfn(t, vma, to_kill, pfn); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. + */ +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + LIST_HEAD(tokill); + + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + + /* + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. 
+ */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + + collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + + mf_handled = true; + } + + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); + } + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); } /** @@ -2193,6 +2335,14 @@ int memory_failure(unsigned long pfn, int flags) if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. + */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); @@ -2274,6 +2424,9 @@ try_again: folio_unlock(folio); if (folio_test_large(folio)) { + const int new_order = min_order_for_split(folio); + int err; + /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2288,7 +2441,16 @@ try_again: * page is a valid handlable page. */ folio_set_has_hwpoisoned(folio); - if (try_to_split_thp_page(p, false) < 0) { + err = try_to_split_thp_page(p, new_order, /* release= */ false); + /* + * If splitting a folio to order-0 fails, kill the process. + * Split the folio regardless to minimize unusable pages. + * Because the memory failure code cannot handle large + * folios, this split is always treated as if it failed. + */ + if (err || new_order) { + /* get folio again in case the original one is split */ + folio = page_folio(p); res = -EHWPOISON; kill_procs_now(p, pfn, flags, folio); put_page(p); @@ -2615,7 +2777,17 @@ static int soft_offline_in_use_page(struct page *page) }; if (!huge && folio_test_large(folio)) { - if (try_to_split_thp_page(page, true)) { + const int new_order = min_order_for_split(folio); + + /* + * If new_order (target split order) is not 0, do not split the + * folio at all to retain the still accessible large folio. + * NOTE: if minimizing the number of soft offline pages is + * preferred, split it to non-zero new_order like it is done in + * memory_failure(). + */ + if (new_order || try_to_split_thp_page(page, /* new_order= */ 0, + /* release= */ true)) { pr_info("%#lx: thp split failed\n", pfn); return -EBUSY; } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0ea5c13f10a2..864811fff409 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -519,7 +519,7 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem * for each device getting added in the same NUMA node * with this specific memtype, bump the map count. We * Only take memtype device reference once, so that - * changing a node memtype can be done by droping the + * changing a node memtype can be done by dropping the * only reference count taken here. 
*/ diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323..2a55edc48a65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,11 +60,12 @@ #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/shmem_fs.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> @@ -75,13 +76,13 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/pgalloc.h> +#include <linux/uaccess.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> -#include <asm/pgalloc.h> -#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> @@ -108,7 +109,7 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; - return pte_marker_uffd_wp(vmf->orig_pte); + return pte_is_uffd_wp_marker(vmf->orig_pte); } /* @@ -901,7 +902,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { - struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); + const softleaf_t entry = softleaf_from_pte(orig_pte); + struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { @@ -926,12 +928,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, { vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); + softleaf_t entry = softleaf_from_pte(orig_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); - if (likely(!non_swap_entry(entry))) { + if (likely(softleaf_is_swap(entry))) { if (swap_duplicate(entry) < 0) return -EIO; @@ -949,12 +951,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + folio = softleaf_to_folio(entry); rss[mm_counter(folio)]++; - if (!is_readable_migration_entry(entry) && + if (!softleaf_is_migration_read(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child @@ -963,15 +965,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ entry = make_readable_migration_entry( swp_offset(entry)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_private_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_private(entry)) { + page = softleaf_to_page(entry); folio = page_folio(page); /* @@ -995,7 +997,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). 
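/*
 * Rough illustration of the classification copy_nonpresent_pte() now does
 * through the softleaf API.  The helpers are the ones used in the hunks
 * above; the enum and wrapper are invented for exposition only.
 */
enum example_leaf_kind {
        EXAMPLE_LEAF_SWAP,
        EXAMPLE_LEAF_MIGRATION,
        EXAMPLE_LEAF_DEVICE_PRIVATE,
        EXAMPLE_LEAF_DEVICE_EXCLUSIVE,
        EXAMPLE_LEAF_MARKER,
        EXAMPLE_LEAF_OTHER,
};

static enum example_leaf_kind example_classify_leaf(pte_t pte)
{
        const softleaf_t entry = softleaf_from_pte(pte);

        if (softleaf_is_swap(entry))
                return EXAMPLE_LEAF_SWAP;
        if (softleaf_is_migration(entry))
                return EXAMPLE_LEAF_MIGRATION;
        if (softleaf_is_device_private(entry))
                return EXAMPLE_LEAF_DEVICE_PRIVATE;
        if (softleaf_is_device_exclusive(entry))
                return EXAMPLE_LEAF_DEVICE_EXCLUSIVE;
        if (softleaf_is_marker(entry))
                return EXAMPLE_LEAF_MARKER;
        return EXAMPLE_LEAF_OTHER;
}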
*/ - if (is_writable_device_private_entry(entry) && + if (softleaf_is_device_private_write(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); @@ -1004,7 +1006,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_exclusive_entry(entry)) { + } else if (softleaf_is_device_exclusive(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device @@ -1015,7 +1017,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) @@ -1216,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; - swp_entry_t entry = (swp_entry_t){0}; + softleaf_t entry = softleaf_mk_none(); struct folio *prealloc = NULL; int nr; @@ -1280,7 +1282,7 @@ again: dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(ptep_get(src_pte)); + entry = softleaf_from_pte(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1373,8 +1375,9 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { + if (pmd_is_huge(*src_pmd)) { int err; + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); @@ -1462,18 +1465,12 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { + if (src_vma->vm_flags & VM_COPY_ON_FORK) + return true; /* - * Always copy pgtables when dst_vma has uffd-wp enabled even if it's - * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable - * contains uffd-wp protection information, that's something we can't - * retrieve from page cache, and skip copying will lose those info. + * The presence of an anon_vma indicates an anonymous VMA has page + * tables which naturally cannot be reconstituted on page fault. 
*/ - if (userfaultfd_wp(dst_vma)) - return true; - - if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return true; - if (src_vma->anon_vma) return true; @@ -1593,7 +1590,9 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, { bool was_installed = false; -#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (!uffd_supports_wp_marker()) + return false; + /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; @@ -1610,7 +1609,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte++; addr += PAGE_SIZE; } -#endif + return was_installed; } @@ -1716,14 +1715,14 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { - swp_entry_t entry; + softleaf_t entry; int nr = 1; *any_skipped = true; - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pte(ptent); + if (softleaf_is_device_private(entry) || + softleaf_is_device_exclusive(entry)) { + struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) @@ -1738,7 +1737,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); - } else if (!non_swap_entry(entry)) { + } else if (softleaf_is_swap(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; @@ -1746,20 +1745,20 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (softleaf_is_uffd_wp_marker(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; - } else if (is_guard_swp_entry(entry)) { + } else if (softleaf_is_guard_marker(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. 
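/*
 * Userspace view of the guard markers the zap path above is careful to
 * preserve: a sketch using madvise(MADV_GUARD_INSTALL) (an existing
 * interface, not introduced here) to place a guard region that faults with
 * SIGSEGV and, as this hunk relies on, is not torn down by ordinary zapping
 * such as MADV_DONTNEED.  Requires a libc/uapi header exposing the flag.
 */
#include <sys/mman.h>

static int example_install_guard(void *guard_area, size_t len)
{
        /* Installs PTE markers only; no physical memory is consumed. */
        return madvise(guard_area, len, MADV_GUARD_INSTALL);
}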
Only do so if we should remove PTE markers @@ -1767,7 +1766,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, */ if (!zap_drop_markers(details)) return 1; - } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { if (!should_zap_cows(details)) return 1; } else { @@ -1920,7 +1920,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) { + if (pmd_is_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { @@ -2022,8 +2022,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, - struct zap_details *details, bool mm_wr_locked) + unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -2069,7 +2068,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check - * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -2084,8 +2082,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { struct mmu_notifier_range range; struct zap_details details = { @@ -2101,8 +2098,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details, - mm_wr_locked); + unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); @@ -2138,7 +2134,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); if (is_vm_hugetlb_page(vma)) { /* @@ -2899,6 +2895,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. 
+ */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -2911,31 +2926,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. - */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2956,7 +2947,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); @@ -3001,23 +2992,9 @@ void pfnmap_track_ctx_release(struct kref *ref) pfnmap_untrack(ctx->pfn, ctx->size); kfree(ctx); } -#endif /* __HAVE_PFNMAP_TRACKING */ -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. 
- */ -#ifdef __HAVE_PFNMAP_TRACKING -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct pfnmap_track_ctx *ctx = NULL; int err; @@ -3053,15 +3030,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, return err; } +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} #else -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_notrack(vma, addr, pfn, size, prot); } #endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. + */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); + if (err) + return err; + + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to @@ -4327,7 +4367,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache - * reference only in case it's likely that we'll be the exlusive user. + * reference only in case it's likely that we'll be the exclusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (1 + folio_nr_pages(folio)); @@ -4345,7 +4385,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. 
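/*
 * Sketch of the intended two-stage driver usage of the remap API shown
 * above: flag/pgoff setup at prepare time, PTE population later.  The
 * f_op->mmap_prepare hook and the way a driver reaches the completion
 * stage are assumptions outside this hunk; only remap_pfn_range_prepare()
 * and remap_pfn_range_complete() come from it.
 */
static unsigned long example_device_base_pfn;   /* hypothetical */

static int example_mmap_prepare(struct vm_area_desc *desc)
{
        /* Sets VM_REMAP_FLAGS and the COW-special pgoff; no PTEs yet. */
        remap_pfn_range_prepare(desc, example_device_base_pfn);
        return 0;
}

static int example_mmap_populate(struct vm_area_struct *vma)
{
        /* Later stage: actually install the PFN mappings. */
        return remap_pfn_range_complete(vma, vma->vm_start,
                                        example_device_base_pfn,
                                        vma->vm_end - vma->vm_start,
                                        vma->vm_page_prot);
}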
- * So is_pte_marker() check is not enough to safely drop the pte. + * So pte_is_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); @@ -4379,8 +4419,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { - swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); - unsigned long marker = pte_marker_get(entry); + const softleaf_t entry = softleaf_from_pte(vmf->orig_pte); + const pte_marker marker = softleaf_to_marker(entry); /* * PTE markers should never be empty. If anything weird happened, @@ -4397,7 +4437,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_is_uffd_wp_marker(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ @@ -4408,13 +4448,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; - swp_entry_t entry; + softleaf_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); @@ -4432,7 +4472,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - swp_entry_t entry; + softleaf_t entry; int idx; pte_t pte; @@ -4442,7 +4482,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; @@ -4488,7 +4528,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) unsigned long orders; struct folio *folio; unsigned long addr; - swp_entry_t entry; + softleaf_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; @@ -4509,7 +4549,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!zswap_never_enabled()) goto fallback; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. 
@@ -4588,7 +4628,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; - swp_entry_t entry; + softleaf_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; @@ -4600,15 +4640,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!pte_unmap_same(vmf)) goto out; - entry = pte_to_swp_entry(vmf->orig_pte); - if (unlikely(non_swap_entry(entry))) { - if (is_migration_entry(entry)) { + entry = softleaf_from_pte(vmf->orig_pte); + if (unlikely(!softleaf_is_swap(entry))) { + if (softleaf_is_migration(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); - } else if (is_device_exclusive_entry(entry)) { - vmf->page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_exclusive(entry)) { + vmf->page = softleaf_to_page(entry); ret = remove_device_exclusive_entry(vmf); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate @@ -4619,7 +4659,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; } - vmf->page = pfn_swap_entry_to_page(entry); + vmf->page = softleaf_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || @@ -4643,9 +4683,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } else { pte_unmap_unlock(vmf->pte, vmf->ptl); } - } else if (is_hwpoison_entry(entry)) { + } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); @@ -5404,7 +5444,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa /** * set_pte_range - Set a range of PTEs to point to pages in a folio. - * @vmf: Fault decription. + * @vmf: Fault description. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. @@ -5501,8 +5541,25 @@ fallback: return ret; } + if (!needs_fallback && vma->vm_file) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t file_end; + + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + + /* + * Do not allow to map with PTEs beyond i_size and with PMD + * across i_size to preserve SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + needs_fallback = !shmem_mapping(mapping) && + file_end < folio_next_index(folio); + } + if (pmd_none(*vmf->pmd)) { - if (folio_test_pmd_mappable(folio)) { + if (!needs_fallback && folio_test_pmd_mappable(folio)) { ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; @@ -6116,6 +6173,45 @@ split: } /* + * The page faults may be spurious because of the racy access to the + * page table. For example, a non-populated virtual page is accessed + * on 2 CPUs simultaneously, thus the page faults are triggered on + * both CPUs. However, it's possible that one CPU (say CPU A) cannot + * find the reason for the page fault if the other CPU (say CPU B) has + * changed the page table before the PTE is checked on CPU A. Most of + * the time, the spurious page faults can be ignored safely. 
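/*
 * Worked example for the i_size check added to the file fault path above
 * (illustrative numbers, 4 KiB pages): a 5 MiB file gives
 * file_end = DIV_ROUND_UP(5 MiB, 4 KiB) = 1280.  A PMD-sized, 512-page
 * folio at index 1024 has folio_next_index() == 1536, so
 * file_end < folio_next_index() and the fault falls back to PTEs -- only
 * pages up to EOF are mapped and accesses past EOF still raise SIGBUS.
 * The same folio at index 512 (next index 1024 <= 1280) may still be
 * PMD-mapped, and shmem/tmpfs keeps its historical PMD-across-i_size
 * behaviour.
 */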
However, + * if the page fault is for the write access, it's possible that a + * stale read-only TLB entry exists in the local CPU and needs to be + * flushed on some architectures. This is called the spurious page + * fault fixing. + * + * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() + * by default and used as such on most architectures, while + * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and + * used as such on most architectures. + */ +static void fix_spurious_fault(struct vm_fault *vmf, + enum pgtable_level ptlevel) +{ + /* Skip spurious TLB flush for retried page fault */ + if (vmf->flags & FAULT_FLAG_TRIED) + return; + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (vmf->flags & FAULT_FLAG_WRITE) { + if (ptlevel == PGTABLE_LEVEL_PTE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, + vmf->pte); + else + flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, + vmf->pmd); + } +} +/* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. @@ -6196,23 +6292,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, - vmf->flags & FAULT_FLAG_WRITE)) { + vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); - } else { - /* Skip spurious TLB flush for retried page fault */ - if (vmf->flags & FAULT_FLAG_TRIED) - goto unlock; - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. 
- */ - if (vmf->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, - vmf->pte); - } + else + fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -6287,34 +6371,43 @@ retry_pud: if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) + if (ret & VM_FAULT_FALLBACK) + goto fallback; + else return ret; - } else { - vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + } - if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(vmf.orig_pmd)); - if (is_pmd_migration_entry(vmf.orig_pmd)) - pmd_migration_entry_wait(mm, vmf.pmd); - return 0; - } - if (pmd_trans_huge(vmf.orig_pmd)) { - if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&vmf); + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + if (pmd_none(vmf.orig_pmd)) + goto fallback; - if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !pmd_write(vmf.orig_pmd)) { - ret = wp_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; - } else { - huge_pmd_set_accessed(&vmf); - return 0; - } + if (unlikely(!pmd_present(vmf.orig_pmd))) { + if (pmd_is_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); + + if (pmd_is_migration_entry(vmf.orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } + if (pmd_trans_huge(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); + + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !pmd_write(vmf.orig_pmd)) { + ret = wp_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + vmf.ptl = pmd_lock(mm, vmf.pmd); + if (!huge_pmd_set_accessed(&vmf)) + fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); + spin_unlock(vmf.ptl); + return 0; } } +fallback: return handle_pte_fault(&vmf); } @@ -6672,12 +6765,12 @@ retry: goto out; p4dp = p4d_offset(pgdp, address); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..a63ec679d861 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible) + struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; - /* - * Memory block is accessible at this stage and hence poison the struct - * pages now. If the memory block is accessible during memory hotplug - * addition phase, then page poisining is already performed in - * sparse_add_section(). 
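/*
 * The READ_ONCE() -> p4dp_get()/pudp_get() change above follows the
 * accessor style used elsewhere; a minimal upper-level walk in that style
 * (sketch only, no locking shown) looks like:
 */
static pud_t *example_walk_to_pud(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgdp = pgd_offset(mm, addr);
        p4d_t *p4dp;
        p4d_t p4d;

        if (pgd_none(pgdp_get(pgdp)))
                return NULL;

        p4dp = p4d_offset(pgdp, addr);
        p4d = p4dp_get(p4dp);
        if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
                return NULL;

        return pud_offset(p4dp, addr);
}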
- */ - if (mhp_off_inaccessible) - page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, false); @@ -1311,7 +1302,7 @@ static int __try_online_node(int nid, bool set_node_online) if (set_node_online) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); BUG_ON(ret); } out: @@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size, mhp_t mhp_flags) + u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); - if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) - mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1542,7 +1531,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error_memblock_remove; if (ret) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); if (WARN_ON(ret)) { node_set_offline(nid); goto error_memblock_remove; @@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory()) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); + ret = create_altmaps_and_memory_blocks(nid, group, start, size); if (ret) goto error; } else { @@ -1596,7 +1585,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) error: if (new_node) { node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) @@ -2201,7 +2190,7 @@ void try_offline_node(int nid) * node now. */ node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } EXPORT_SYMBOL(try_offline_node); @@ -2327,7 +2316,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg) * by offlining code ... so we don't care about that. 
*/ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); - if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + if (page && page_zonenum(page) == ZONE_MOVABLE) online_type = MMOP_ONLINE_MOVABLE; rc = device_offline(&mem->dev); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb83cff7db8c..68a98ba57882 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -85,6 +85,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> +#include <linux/sched/sysctl.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> @@ -99,6 +100,7 @@ #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/memory-tiers.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> @@ -108,7 +110,7 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/gcd.h> #include <asm/tlbflush.h> @@ -354,6 +356,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) return &default_policy; } +EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); @@ -487,6 +490,7 @@ void __mpol_put(struct mempolicy *pol) return; kmem_cache_free(policy_cache, pol); } +EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { @@ -645,7 +649,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) struct folio *folio; struct queue_pages *qp = walk->private; - if (unlikely(is_pmd_migration_entry(*pmd))) { + if (unlikely(pmd_is_migration_entry(*pmd))) { qp->nr_failed++; return; } @@ -703,7 +707,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (pte_none(ptent)) continue; if (!pte_present(ptent)) { - if (is_migration_entry(pte_to_swp_entry(ptent))) + const softleaf_t entry = softleaf_from_pte(ptent); + + if (softleaf_is_migration(entry)) qp->nr_failed++; continue; } @@ -766,16 +772,21 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; - pte_t entry; + pte_t ptep; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); - entry = huge_ptep_get(walk->mm, addr, pte); - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) - qp->nr_failed++; + ptep = huge_ptep_get(walk->mm, addr, pte); + if (!pte_present(ptep)) { + if (!huge_pte_none(ptep)) { + const softleaf_t entry = softleaf_from_pte(ptep); + + if (unlikely(softleaf_is_migration(entry))) + qp->nr_failed++; + } + goto unlock; } - folio = pfn_folio(pte_pfn(entry)); + folio = pfn_folio(pte_pfn(ptep)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || @@ -803,6 +814,65 @@ unlock: } #ifdef CONFIG_NUMA_BALANCING +/** + * folio_can_map_prot_numa() - check whether the folio can map prot numa + * @folio: The folio whose mapping considered for being made NUMA hintable + * @vma: The VMA that the folio belongs to. + * @is_private_single_threaded: Is this a single-threaded private VMA or not + * + * This function checks to see if the folio actually indicates that + * we need to make the mapping one which causes a NUMA hinting fault, + * as there are cases where it's simply unnecessary, and the folio's + * access time is adjusted for memory tiering if prot numa needed. 
+ * + * Return: True if the mapping of the folio needs to be changed, false otherwise. + */ +bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, + bool is_private_single_threaded) +{ + int nid; + + if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio)) + return false; + + /* Also skip shared copy-on-write folios */ + if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) + return false; + + /* Folios are pinned and can't be migrated */ + if (folio_maybe_dma_pinned(folio)) + return false; + + /* + * While migration can move some dirty folios, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) + return false; + + /* + * Don't mess with PTEs if folio is already on the node + * a single-threaded process is running on. + */ + nid = folio_nid(folio); + if (is_private_single_threaded && (nid == numa_node_id())) + return false; + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(nid)) + return false; + + if (folio_use_access_time(folio)) + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); + + return true; +} + /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these @@ -2885,6 +2955,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, read_unlock(&sp->lock); return pol; } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); static void sp_free(struct sp_node *n) { @@ -3170,6 +3241,7 @@ put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) @@ -3188,6 +3260,7 @@ int mpol_set_shared_policy(struct shared_policy *sp, sp_free(new); return err; } +EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) @@ -3206,6 +3279,7 @@ void mpol_free_shared_policy(struct shared_policy *sp) } write_unlock(&sp->lock); } +EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; diff --git a/mm/mempool.c b/mm/mempool.c index 1c38e873e546..c290e5261b47 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/mempool.c - * * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. 
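/*
 * Sketch of how the NUMA-hinting PTE scan is expected to consult the new
 * helper before turning a mapping into a hinting fault.  Only
 * folio_can_map_prot_numa() comes from this hunk; the caller shape is
 * illustrative.
 */
static bool example_should_make_prot_numa(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t ptent,
                                          bool private_single_threaded)
{
        struct folio *folio;

        if (pte_protnone(ptent))
                return false;   /* already a NUMA-hinting PTE */

        folio = vm_normal_folio(vma, addr, ptent);
        return folio_can_map_prot_numa(folio, vma, private_single_threaded);
}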
@@ -9,7 +7,7 @@ * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ - +#include <linux/fault-inject.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/highmem.h> @@ -20,8 +18,27 @@ #include <linux/writeback.h> #include "slab.h" +static DECLARE_FAULT_ATTR(fail_mempool_alloc); +static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); + +static int __init mempool_faul_inject_init(void) +{ + int error; + + error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", + NULL, &fail_mempool_alloc)); + if (error) + return error; + + /* booting will fail on error return here, don't bother to cleanup */ + return PTR_ERR_OR_ZERO( + fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, + &fail_mempool_alloc_bulk)); +} +late_initcall(mempool_faul_inject_init); + #ifdef CONFIG_SLUB_DEBUG_ON -static void poison_error(mempool_t *pool, void *element, size_t size, +static void poison_error(struct mempool *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; @@ -38,7 +55,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size, dump_stack(); } -static void __check_element(mempool_t *pool, void *element, size_t size) +static void __check_element(struct mempool *pool, void *element, size_t size) { u8 *obj = element; size_t i; @@ -54,7 +71,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size) memset(obj, POISON_INUSE, size); } -static void check_element(mempool_t *pool, void *element) +static void check_element(struct mempool *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) @@ -68,10 +85,20 @@ static void check_element(mempool_t *pool, void *element) } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_local_page((struct page *)element); - __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); - kunmap_local(addr); +#ifdef CONFIG_HIGHMEM + for (int i = 0; i < (1 << order); i++) { + struct page *page = (struct page *)element; + void *addr = kmap_local_page(page + i); + + __check_element(pool, addr, PAGE_SIZE); + kunmap_local(addr); + } +#else + void *addr = page_address((struct page *)element); + + __check_element(pool, addr, PAGE_SIZE << order); +#endif } } @@ -83,7 +110,7 @@ static void __poison_element(void *element, size_t size) obj[size - 1] = POISON_END; } -static void poison_element(mempool_t *pool, void *element) +static void poison_element(struct mempool *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. 
*/ if (kasan_enabled()) @@ -97,22 +124,33 @@ static void poison_element(mempool_t *pool, void *element) } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_local_page((struct page *)element); - __poison_element(addr, 1UL << (PAGE_SHIFT + order)); - kunmap_local(addr); +#ifdef CONFIG_HIGHMEM + for (int i = 0; i < (1 << order); i++) { + struct page *page = (struct page *)element; + void *addr = kmap_local_page(page + i); + + __poison_element(addr, PAGE_SIZE); + kunmap_local(addr); + } +#else + void *addr = page_address((struct page *)element); + + __poison_element(addr, PAGE_SIZE << order); +#endif } } #else /* CONFIG_SLUB_DEBUG_ON */ -static inline void check_element(mempool_t *pool, void *element) +static inline void check_element(struct mempool *pool, void *element) { } -static inline void poison_element(mempool_t *pool, void *element) +static inline void poison_element(struct mempool *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ -static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) +static __always_inline bool kasan_poison_element(struct mempool *pool, + void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); @@ -122,7 +160,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) return true; } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(struct mempool *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); @@ -134,7 +172,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) (unsigned long)pool->pool_data); } -static __always_inline void add_element(mempool_t *pool, void *element) +static __always_inline void add_element(struct mempool *pool, void *element) { BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); @@ -142,7 +180,7 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(struct mempool *pool) { void *element = pool->elements[--pool->curr_nr]; @@ -163,7 +201,7 @@ static void *remove_element(mempool_t *pool) * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ -void mempool_exit(mempool_t *pool) +void mempool_exit(struct mempool *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -182,7 +220,7 @@ EXPORT_SYMBOL(mempool_exit); * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ -void mempool_destroy(mempool_t *pool) +void mempool_destroy(struct mempool *pool) { if (unlikely(!pool)) return; @@ -192,9 +230,9 @@ void mempool_destroy(mempool_t *pool) } EXPORT_SYMBOL(mempool_destroy); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; @@ -244,8 +282,9 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. 
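/*
 * The mempool_t -> struct mempool conversion above does not change the
 * constructor API; a typical embedded pool is still set up like this
 * (the cache name, object size and element count are illustrative):
 */
static struct kmem_cache *example_cache;
static struct mempool example_pool;

static int __init example_pool_setup(void)
{
        example_cache = kmem_cache_create("example_objs", 256, 0, 0, NULL);
        if (!example_cache)
                return -ENOMEM;

        /* Keep at least 16 preallocated elements for forward progress. */
        return mempool_init(&example_pool, 16, mempool_alloc_slab,
                            mempool_free_slab, example_cache);
}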
*/ -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); @@ -271,11 +310,11 @@ EXPORT_SYMBOL(mempool_init_noprof); * * Return: pointer to the created memory pool object or %NULL on error. */ -mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { - mempool_t *pool; + struct mempool *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) @@ -309,7 +348,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof); * * Return: %0 on success, negative error code otherwise. */ -int mempool_resize(mempool_t *pool, int new_min_nr) +int mempool_resize(struct mempool *pool, int new_min_nr) { void *element; void **new_elements; @@ -371,140 +410,227 @@ out: } EXPORT_SYMBOL(mempool_resize); -/** - * mempool_alloc - allocate an element from a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * @gfp_mask: the usual allocation bitmask. - * - * this function only sleeps if the alloc_fn() function sleeps or - * returns NULL. Note that due to preallocation, this function - * *never* fails when called from process contexts. (it might - * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. - * - * Return: pointer to the allocated element or %NULL on error. - */ -void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) +static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated, + gfp_t gfp_mask) { - void *element; unsigned long flags; - wait_queue_entry_t wait; - gfp_t gfp_temp; + unsigned int i; - VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_alloc(gfp_mask); - - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ - gfp_mask |= __GFP_NOWARN; /* failures are OK */ + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(pool->curr_nr < count - allocated)) + goto fail; + for (i = 0; i < count; i++) { + if (!elems[i]) { + elems[i] = remove_element(pool); + allocated++; + } + } + spin_unlock_irqrestore(&pool->lock, flags); - gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); + /* Paired with rmb in mempool_free(), read comment there. */ + smp_wmb(); -repeat_alloc: + /* + * Update the allocation stack trace as this is more useful for + * debugging. + */ + for (i = 0; i < count; i++) + kmemleak_update_trace(elems[i]); + return allocated; - element = pool->alloc(gfp_temp, pool->pool_data); - if (likely(element != NULL)) - return element; +fail: + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + DEFINE_WAIT(wait); - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); + /* - * Update the allocation stack trace as this is more useful - * for debugging. 
+ * Wait for someone else to return an element to @pool, but wake + * up occasionally as memory pressure might have reduced even + * and the normal allocation in alloc_fn could succeed even if + * no element was returned. */ - kmemleak_update_trace(element); - return element; - } - - /* - * We use gfp mask w/o direct reclaim or IO for the first round. If - * alloc failed with that and @pool was empty, retry immediately. - */ - if (gfp_temp != gfp_mask) { + io_schedule_timeout(5 * HZ); + finish_wait(&pool->wait, &wait); + } else { + /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */ spin_unlock_irqrestore(&pool->lock, flags); - gfp_temp = gfp_mask; - goto repeat_alloc; } - /* We must not sleep if !__GFP_DIRECT_RECLAIM */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { - spin_unlock_irqrestore(&pool->lock, flags); - return NULL; - } + return allocated; +} - /* Let's wait for someone else to return an element to @pool */ - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); +/* + * Adjust the gfp flags for mempool allocations, as we never want to dip into + * the global emergency reserves or retry in the page allocator. + * + * The first pass also doesn't want to go reclaim, but the next passes do, so + * return a separate subset for that first iteration. + */ +static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) +{ + *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); +} - spin_unlock_irqrestore(&pool->lock, flags); +/** + * mempool_alloc_bulk - allocate multiple elements from a memory pool + * @pool: pointer to the memory pool + * @elems: partially or fully populated elements array + * @count: number of entries in @elem that need to be allocated + * @allocated: number of entries in @elem already allocated + * + * Allocate elements for each slot in @elem that is non-%NULL. This is done by + * first calling into the alloc_fn supplied at pool initialization time, and + * dipping into the reserved pool when alloc_fn fails to allocate an element. + * + * On return all @count elements in @elems will be populated. + * + * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. + */ +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated) +{ + gfp_t gfp_mask = GFP_KERNEL; + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); + unsigned int i = 0; + + VM_WARN_ON_ONCE(count > pool->min_nr); + might_alloc(gfp_mask); /* - * FIXME: this should be io_schedule(). The timeout is there as a - * workaround for some DM problems in 2.6.18. + * If an error is injected, fail all elements in a bulk allocation so + * that we stress the multiple elements missing path. */ - io_schedule_timeout(5*HZ); + if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + goto use_pool; + } - finish_wait(&pool->wait, &wait); +repeat_alloc: + /* + * Try to allocate the elements using the allocation callback first as + * that might succeed even when the caller's bulk allocation did not. 
+ */ + for (i = 0; i < count; i++) { + if (elems[i]) + continue; + elems[i] = pool->alloc(gfp_temp, pool->pool_data); + if (unlikely(!elems[i])) + goto use_pool; + allocated++; + } + + return 0; + +use_pool: + allocated = mempool_alloc_from_pool(pool, elems, count, allocated, + gfp_temp); + gfp_temp = gfp_mask; goto repeat_alloc; } -EXPORT_SYMBOL(mempool_alloc_noprof); +EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); /** - * mempool_alloc_preallocated - allocate an element from preallocated elements - * belonging to a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * mempool_alloc - allocate an element from a memory pool + * @pool: pointer to the memory pool + * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. * - * This function is similar to mempool_alloc, but it only attempts allocating - * an element from the preallocated elements. It does not sleep and immediately - * returns if no preallocated elements are available. + * Allocate an element from @pool. This is done by first calling into the + * alloc_fn supplied at pool initialization time, and dipping into the reserved + * pool when alloc_fn fails to allocate an element. * - * Return: pointer to the allocated element or %NULL if no elements are - * available. + * This function only sleeps if the alloc_fn callback sleeps, or when waiting + * for elements to become available in the pool. + * + * Return: pointer to the allocated element or %NULL when failing to allocate + * an element. Allocation failure can only happen when @gfp_mask does not + * include %__GFP_DIRECT_RECLAIM. */ -void *mempool_alloc_preallocated(mempool_t *pool) +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) { + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); - spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); + VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + might_alloc(gfp_mask); + +repeat_alloc: + if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + element = NULL; + } else { + element = pool->alloc(gfp_temp, pool->pool_data); + } + + if (unlikely(!element)) { /* - * Update the allocation stack trace as this is more useful - * for debugging. + * Try to allocate an element from the pool. + * + * The first pass won't have __GFP_DIRECT_RECLAIM and won't + * sleep in mempool_alloc_from_pool. Retry the allocation + * with all flags set in that case. */ - kmemleak_update_trace(element); - return element; + if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { + if (gfp_temp != gfp_mask) { + gfp_temp = gfp_mask; + goto repeat_alloc; + } + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + goto repeat_alloc; + } + } } - spin_unlock_irqrestore(&pool->lock, flags); - return NULL; + return element; +} +EXPORT_SYMBOL(mempool_alloc_noprof); + +/** + * mempool_alloc_preallocated - allocate an element from preallocated elements + * belonging to a memory pool + * @pool: pointer to the memory pool + * + * This function is similar to mempool_alloc(), but it only attempts allocating + * an element from the preallocated elements. It only takes a single spinlock_t + * and immediately returns if no preallocated elements are available. + * + * Return: pointer to the allocated element or %NULL if no elements are + * available. 
+ */ +void *mempool_alloc_preallocated(struct mempool *pool) +{ + void *element = NULL; + + mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); + return element; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** - * mempool_free - return an element to the pool. - * @element: pool element pointer. - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * mempool_free_bulk - return elements to a mempool + * @pool: pointer to the memory pool + * @elems: elements to return + * @count: number of elements to return * - * this function only sleeps if the free_fn() function sleeps. + * Returns a number of elements from the start of @elem to @pool if @pool needs + * replenishing and sets their slots in @elem to NULL. Other elements are left + * in @elem. + * + * Return: number of elements transferred to @pool. Elements are always + * transferred from the beginning of @elem, so the return value can be used as + * an offset into @elem for the freeing the remaining elements in the caller. */ -void mempool_free(void *element, mempool_t *pool) +unsigned int mempool_free_bulk(struct mempool *pool, void **elems, + unsigned int count) { unsigned long flags; - - if (unlikely(element == NULL)) - return; + unsigned int freed = 0; + bool added = false; /* * Paired with the wmb in mempool_alloc(). The preceding read is @@ -538,21 +664,6 @@ void mempool_free(void *element, mempool_t *pool) * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. - */ - if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr < pool->min_nr)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; - } - spin_unlock_irqrestore(&pool->lock, flags); - } - - /* - * Handle the min_nr = 0 edge case: * * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, * so waiters sleeping on pool->wait would never be woken by the @@ -560,20 +671,45 @@ void mempool_free(void *element, mempool_t *pool) * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. */ - if (unlikely(pool->min_nr == 0 && + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { + spin_lock_irqsave(&pool->lock, flags); + while (pool->curr_nr < pool->min_nr && freed < count) { + add_element(pool, elems[freed++]); + added = true; + } + spin_unlock_irqrestore(&pool->lock, flags); + } else if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { + /* Handle the min_nr = 0 edge case: */ spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; + add_element(pool, elems[freed++]); + added = true; } spin_unlock_irqrestore(&pool->lock, flags); } - pool->free(element, pool->pool_data); + if (unlikely(added) && wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); + + return freed; +} +EXPORT_SYMBOL_GPL(mempool_free_bulk); + +/** + * mempool_free - return an element to the pool. + * @element: element to return + * @pool: pointer to the memory pool + * + * Returns @element to @pool if it needs replenishing, else frees it using + * the free_fn callback in @pool. + * + * This function only sleeps if the free_fn callback sleeps. 
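/*
 * Usage sketch for the new bulk interfaces above.  mempool_alloc_bulk()
 * is assumed to be the alloc_tag wrapper around
 * mempool_alloc_bulk_noprof(), mirroring the other *_noprof entry points;
 * the rest follows the kernel-doc in this hunk.
 */
static void example_bulk_roundtrip(struct mempool *pool)
{
        void *elems[4] = { NULL, NULL, NULL, NULL };
        unsigned int taken, i;

        /* On return every slot in elems[] is populated. */
        mempool_alloc_bulk(pool, elems, 4, 0);

        /* ... use the elements ... */

        /*
         * The pool takes back what it needs for replenishment, always from
         * the front of the array; the caller frees the remainder itself.
         */
        taken = mempool_free_bulk(pool, elems, 4);
        for (i = taken; i < 4; i++)
                pool->free(elems[i], pool->pool_data);
}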
+ */ +void mempool_free(void *element, struct mempool *pool) +{ + if (likely(element) && !mempool_free_bulk(pool, &element, 1)) + pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); @@ -612,19 +748,6 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kvmalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kvmalloc); - -void mempool_kvfree(void *element, void *pool_data) -{ - kvfree(element); -} -EXPORT_SYMBOL(mempool_kvfree); - /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. diff --git a/mm/memremap.c b/mm/memremap.c index 46cb1b0b6f72..4c2e0d68eb27 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -289,8 +289,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } - if (!pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); + if (!pgmap->ops->folio_free) { + WARN(1, "Missing folio_free method\n"); return ERR_PTR(-EINVAL); } if (!pgmap->owner) { @@ -299,8 +299,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_COHERENT: - if (!pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); + if (!pgmap->ops->folio_free) { + WARN(1, "Missing folio_free method\n"); return ERR_PTR(-EINVAL); } if (!pgmap->owner) { @@ -416,20 +416,19 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); void free_zone_device_folio(struct folio *folio) { struct dev_pagemap *pgmap = folio->pgmap; + unsigned long nr = folio_nr_pages(folio); + int i; if (WARN_ON_ONCE(!pgmap)) return; mem_cgroup_uncharge(folio); - /* - * Note: we don't expect anonymous compound pages yet. Once supported - * and we could PTE-map them similar to THP, we'd have to clear - * PG_anon_exclusive on all tail pages. - */ if (folio_test_anon(folio)) { - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - __ClearPageAnonExclusive(folio_page(folio, 0)); + for (i = 0; i < nr; i++) + __ClearPageAnonExclusive(folio_page(folio, i)); + } else { + VM_WARN_ON_ONCE(folio_test_large(folio)); } /* @@ -454,10 +453,10 @@ void free_zone_device_folio(struct folio *folio) switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_COHERENT: - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->page_free(folio_page(folio, 0)); - put_dev_pagemap(pgmap); + pgmap->ops->folio_free(folio); + percpu_ref_put_many(&folio->pgmap->ref, nr); break; case MEMORY_DEVICE_GENERIC: @@ -473,21 +472,26 @@ void free_zone_device_folio(struct folio *folio) break; case MEMORY_DEVICE_PCI_P2PDMA: - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->page_free(folio_page(folio, 0)); + pgmap->ops->folio_free(folio); break; } } -void zone_device_page_init(struct page *page) +void zone_device_page_init(struct page *page, unsigned int order) { + VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES); + /* * Drivers shouldn't be allocating pages after calling * memunmap_pages(). 
*/ - WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref)); + WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order)); set_page_count(page, 1); lock_page(page); + + if (order) + prep_compound_page(page, order); } EXPORT_SYMBOL_GPL(zone_device_page_init); diff --git a/mm/migrate.c b/mm/migrate.c index c0e9f15be2a2..5169f9717f60 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -16,7 +16,7 @@ #include <linux/migrate.h> #include <linux/export.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/mm_inline.h> @@ -307,6 +307,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(old_pte), page); + VM_WARN_ON_ONCE_FOLIO(folio_is_device_private(folio), folio); if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || mm_forbids_zeropage(pvmw->vma->vm_mm)) @@ -352,7 +353,7 @@ static bool remove_migration_pte(struct folio *folio, rmap_t rmap_flags = RMAP_NONE; pte_t old_pte; pte_t pte; - swp_entry_t entry; + softleaf_t entry; struct page *new; unsigned long idx = 0; @@ -378,22 +379,22 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - entry = pte_to_swp_entry(old_pte); - if (!is_migration_entry_young(entry)) + entry = softleaf_from_pte(old_pte); + if (!softleaf_is_migration_young(entry)) pte = pte_mkold(pte); - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pte = pte_mkdirty(pte); if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); else pte = pte_clear_soft_dirty(pte); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); - if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) + if (folio_test_anon(folio) && !softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; if (unlikely(is_device_private_page(new))) { @@ -403,7 +404,7 @@ static bool remove_migration_pte(struct folio *folio, else entry = make_readable_device_private_entry( page_to_pfn(new)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(old_pte)) @@ -482,7 +483,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, spinlock_t *ptl; pte_t *ptep; pte_t pte; - swp_entry_t entry; + softleaf_t entry; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) @@ -491,11 +492,11 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, pte = ptep_get(ptep); pte_unmap(ptep); - if (!is_swap_pte(pte)) + if (pte_none(pte) || pte_present(pte)) goto out; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) + entry = softleaf_from_pte(pte); + if (!softleaf_is_migration(entry)) goto out; migration_entry_wait_on_locked(entry, ptl); @@ -514,16 +515,18 @@ out: void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); + softleaf_t entry; pte_t pte; hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(vma->vm_mm, addr, ptep); - if (unlikely(!is_hugetlb_entry_migration(pte))) { - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - } 
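(Illustrative sketch, not part of the patch: the driver-facing side of the memremap.c changes above. The .folio_free callback replaces .page_free, and zone_device_page_init() now takes an allocation order so a compound device folio is set up in one call. All example_* names are hypothetical, the folio_free prototype is inferred from the pgmap->ops->folio_free(folio) call sites, and other dev_pagemap_ops callbacks are omitted for brevity.)

static void example_folio_free(struct folio *folio)
{
	/* Return the folio's backing device memory to the driver. */
}

static const struct dev_pagemap_ops example_pgmap_ops = {
	.folio_free	= example_folio_free,
	/* .migrate_to_ram etc. omitted */
};

static struct page *example_init_device_page(unsigned long pfn,
					     unsigned int order)
{
	struct page *page = pfn_to_page(pfn);

	/*
	 * Takes 1 << order pgmap references, locks the page and, for
	 * order > 0, turns it into a compound page.
	 */
	zone_device_page_init(page, order);
	return page;
}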
else { + if (huge_pte_none(pte)) + goto fail; + + entry = softleaf_from_pte(pte); + if (softleaf_is_migration(entry)) { /* * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the @@ -531,8 +534,13 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); + migration_entry_wait_on_locked(entry, ptl); + return; } + +fail: + spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); } #endif @@ -542,9 +550,9 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - if (!is_pmd_migration_entry(*pmd)) + if (!pmd_is_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); + migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); return; unlock: spin_unlock(ptl); @@ -562,7 +570,7 @@ unlock: static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; @@ -667,27 +675,27 @@ static int __folio_migrate_mapping(struct address_space *mapping, old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); - __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); - __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); + mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); + mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { - __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); - __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); + mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); + mod_lruvec_state(new_lruvec, NR_SHMEM, nr); if (folio_test_pmd_mappable(folio)) { - __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); - __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); + mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); + mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); } } #ifdef CONFIG_SWAP if (folio_test_swapcache(folio)) { - __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); - __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); + mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); + mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } #endif if (dirty && mapping_can_writeback(mapping)) { - __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); + mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); - __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); + mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } @@ -715,7 +723,7 @@ EXPORT_SYMBOL(folio_migrate_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { - XA_STATE(xas, &mapping->i_pages, folio_index(src)); + XA_STATE(xas, &mapping->i_pages, src->index); int rc, expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) @@ -2164,7 +2172,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_t gfp_mask; unsigned int order = 0; int nid; - int zidx; + enum zone_type zidx; mtc = (struct migration_target_control *)private; gfp_mask = mtc->gfp_mask; @@ -2190,7 
+2198,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_mask |= GFP_TRANSHUGE; order = folio_order(src); } - zidx = zone_idx(folio_zone(src)); + zidx = folio_zonenum(src); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index abd9f6850db6..23379663b1e1 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -13,7 +13,8 @@ #include <linux/oom.h> #include <linux/pagewalk.h> #include <linux/rmap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> +#include <linux/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start, if (!vma_is_anonymous(walk->vma)) return migrate_vma_collect_skip(start, end, walk); + if (thp_migration_supported() && + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + + /* + * Collect the remaining entries as holes, in case we + * need to split later + */ + return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + } + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; @@ -54,70 +72,214 @@ static int migrate_vma_collect_hole(unsigned long start, return 0; } -static int migrate_vma_collect_pmd(pmd_t *pmdp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) +/** + * migrate_vma_split_folio() - Helper function to split a THP folio + * @folio: the folio to split + * @fault_page: struct page associated with the fault if any + * + * Returns 0 on success + */ +static int migrate_vma_split_folio(struct folio *folio, + struct page *fault_page) +{ + int ret; + struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL; + struct folio *new_fault_folio = NULL; + + if (folio != fault_folio) { + folio_get(folio); + folio_lock(folio); + } + + ret = split_folio(folio); + if (ret) { + if (folio != fault_folio) { + folio_unlock(folio); + folio_put(folio); + } + return ret; + } + + new_fault_folio = fault_page ? page_folio(fault_page) : NULL; + + /* + * Ensure the lock is held on the correct + * folio after the split + */ + if (!new_fault_folio) { + folio_unlock(folio); + folio_put(folio); + } else if (folio != new_fault_folio) { + if (new_fault_folio != fault_folio) { + folio_get(new_fault_folio); + folio_lock(new_fault_folio); + } + folio_unlock(folio); + folio_put(folio); + } + + return 0; +} + +/** migrate_vma_collect_huge_pmd - collect THP pages without splitting the + * folio for device private pages. + * @pmdp: pointer to pmd entry + * @start: start address of the range for migration + * @end: end address of the range for migration + * @walk: mm_walk callback structure + * @fault_folio: folio associated with the fault if any + * + * Collect the huge pmd entry at @pmdp for migration and set the + * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that + * migration will occur at HPAGE_PMD granularity + */ +static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, + unsigned long end, struct mm_walk *walk, + struct folio *fault_folio) { + struct mm_struct *mm = walk->mm; + struct folio *folio; struct migrate_vma *migrate = walk->private; - struct folio *fault_folio = migrate->fault_page ? 
- page_folio(migrate->fault_page) : NULL; - struct vm_area_struct *vma = walk->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start, unmapped = 0; spinlock_t *ptl; - pte_t *ptep; + int ret; + unsigned long write = 0; -again: - if (pmd_none(*pmdp)) + ptl = pmd_lock(mm, pmdp); + if (pmd_none(*pmdp)) { + spin_unlock(ptl); return migrate_vma_collect_hole(start, end, -1, walk); + } if (pmd_trans_huge(*pmdp)) { - struct folio *folio; - - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { spin_unlock(ptl); - goto again; + return migrate_vma_collect_skip(start, end, walk); } folio = pmd_folio(*pmdp); if (is_huge_zero_folio(folio)) { spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - } else { - int ret; + return migrate_vma_collect_hole(start, end, -1, walk); + } + if (pmd_write(*pmdp)) + write = MIGRATE_PFN_WRITE; + } else if (!pmd_present(*pmdp)) { + const softleaf_t entry = softleaf_from_pmd(*pmdp); + + folio = softleaf_to_folio(entry); - folio_get(folio); + if (!softleaf_is_device_private(entry) || + !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + (folio->pgmap->owner != migrate->pgmap_owner)) { spin_unlock(ptl); - /* FIXME: we don't expect THP for fault_folio */ - if (WARN_ON_ONCE(fault_folio == folio)) - return migrate_vma_collect_skip(start, end, - walk); - if (unlikely(!folio_trylock(folio))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_folio(folio); - if (fault_folio != folio) - folio_unlock(folio); - folio_put(folio); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); + return migrate_vma_collect_skip(start, end, walk); + } + + if (softleaf_is_migration(entry)) { + migration_entry_wait_on_locked(entry, ptl); + spin_unlock(ptl); + return -EAGAIN; } + + if (softleaf_is_device_private_write(entry)) + write = MIGRATE_PFN_WRITE; + } else { + spin_unlock(ptl); + return -EAGAIN; } - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + folio_get(folio); + if (folio != fault_folio && unlikely(!folio_trylock(folio))) { + spin_unlock(ptl); + folio_put(folio); + return migrate_vma_collect_skip(start, end, walk); + } + + if (thp_migration_supported() && + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { + + struct page_vma_mapped_walk pvmw = { + .ptl = ptl, + .address = start, + .pmd = pmdp, + .vma = walk->vma, + }; + + unsigned long pfn = page_to_pfn(folio_page(folio, 0)); + + migrate->src[migrate->npages] = migrate_pfn(pfn) | write + | MIGRATE_PFN_MIGRATE + | MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages++] = 0; + migrate->cpages++; + ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0)); + if (ret) { + migrate->npages--; + migrate->cpages--; + migrate->src[migrate->npages] = 0; + migrate->dst[migrate->npages] = 0; + goto fallback; + } + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + spin_unlock(ptl); + return 0; + } + +fallback: + spin_unlock(ptl); + if (!folio_test_large(folio)) + goto done; + ret = split_folio(folio); + if (fault_folio != folio) + folio_unlock(folio); + folio_put(folio); + if (ret) + return migrate_vma_collect_skip(start, end, walk); + if (pmd_none(pmdp_get_lockless(pmdp))) + return migrate_vma_collect_hole(start, end, -1, walk); + +done: + return -ENOENT; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct 
vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + struct folio *fault_folio = migrate->fault_page ? + page_folio(migrate->fault_page) : NULL; + pte_t *ptep; + +again: + if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) { + int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio); + + if (ret == -EAGAIN) + goto again; + if (ret == 0) + return 0; + } + + ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); if (!ptep) goto again; arch_enter_lazy_mmu_mode(); + ptep += (addr - start) / PAGE_SIZE; for (; addr < end; addr += PAGE_SIZE, ptep++) { struct dev_pagemap *pgmap; unsigned long mpfn = 0, pfn; struct folio *folio; struct page *page; - swp_entry_t entry; + softleaf_t entry; pte_t pte; pte = ptep_get(ptep); @@ -136,20 +298,39 @@ again: * page table entry. Other special swap entries are not * migratable, and we ignore regular swapped page. */ - entry = pte_to_swp_entry(pte); - if (!is_device_private_entry(entry)) + entry = softleaf_from_pte(pte); + if (!softleaf_is_device_private(entry)) goto next; - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); pgmap = page_pgmap(page); if (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || pgmap->owner != migrate->pgmap_owner) goto next; + folio = page_folio(page); + if (folio_test_large(folio)) { + int ret; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep, ptl); + ret = migrate_vma_split_folio(folio, + migrate->fault_page); + + if (ret) { + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return migrate_vma_collect_skip(addr, end, walk); + } + + goto again; + } + mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { pfn = pte_pfn(pte); @@ -171,12 +352,29 @@ again: pgmap->owner != migrate->pgmap_owner) goto next; } + folio = page ? page_folio(page) : NULL; + if (folio && folio_test_large(folio)) { + int ret; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep, ptl); + ret = migrate_vma_split_folio(folio, + migrate->fault_page); + + if (ret) { + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return migrate_vma_collect_skip(addr, end, walk); + } + + goto again; + } mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { + if (!page || !page->mapping) { mpfn = 0; goto next; } @@ -347,14 +545,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) */ int extra = 1 + (page == fault_page); - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). 
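(Illustrative sketch, not part of the patch: how a driver might opt in to the PMD-sized collection path added above. The struct migrate_vma fields and the MIGRATE_VMA_SELECT_* flags appear in the hunks in this file; the setup/pages/finalize sequence is the usual migrate_vma flow, the destination allocation step is elided, and the example_* names are made up.)

static int example_migrate_huge_range(struct vm_area_struct *vma,
				      unsigned long start,
				      unsigned long *src_pfns,
				      unsigned long *dst_pfns,
				      void *owner)
{
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,	/* HPAGE_PMD_SIZE aligned */
		.end		= start + HPAGE_PMD_SIZE,
		.src		= src_pfns,	/* HPAGE_PMD_NR entries */
		.dst		= dst_pfns,
		.pgmap_owner	= owner,
		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
				  MIGRATE_VMA_SELECT_COMPOUND,
	};
	int ret;

	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	/* ... allocate destination pages and fill args.dst here ... */

	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}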
- */ - if (folio_test_large(folio)) - return false; - /* Page from ZONE_DEVICE have one extra reference */ if (folio_is_zone_device(folio)) extra++; @@ -385,17 +575,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, lru_add_drain(); - for (i = 0; i < npages; i++) { + for (i = 0; i < npages; ) { struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; + unsigned int nr = 1; if (!page) { if (src_pfns[i] & MIGRATE_PFN_MIGRATE) unmapped++; - continue; + goto next; } folio = page_folio(page); + nr = folio_nr_pages(folio); + + if (nr > 1) + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + + /* ZONE_DEVICE folios are not on LRU */ if (!folio_is_zone_device(folio)) { if (!folio_test_lru(folio) && allow_drain) { @@ -407,7 +604,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, if (!folio_isolate_lru(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; - continue; + goto next; } /* Drop the reference we took in collect */ @@ -426,10 +623,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; - continue; + goto next; } unmapped++; +next: + i += nr; } for (i = 0; i < npages && restore; i++) { @@ -575,6 +774,189 @@ int migrate_vma_setup(struct migrate_vma *args) } EXPORT_SYMBOL(migrate_vma_setup); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm + * at @addr. folio is already allocated as a part of the migration process with + * large page. + * + * @page needs to be initialized and setup after it's allocated. The code bits + * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does + * not support THP zero pages. + * + * @migrate: migrate_vma arguments + * @addr: address where the folio will be inserted + * @page: page to be inserted at @addr + * @src: src pfn which is being migrated + * @pmdp: pointer to the pmd + */ +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + pmd_t *pmdp) +{ + struct vm_area_struct *vma = migrate->vma; + gfp_t gfp = vma_thp_gfp_mask(vma); + struct folio *folio = page_folio(page); + int ret; + vm_fault_t csa_ret; + spinlock_t *ptl; + pgtable_t pgtable; + pmd_t entry; + bool flush = false; + unsigned long i; + + VM_WARN_ON_FOLIO(!folio, folio); + VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); + + if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) + return -EINVAL; + + ret = anon_vma_prepare(vma); + if (ret) + return ret; + + folio_set_order(folio, HPAGE_PMD_ORDER); + folio_set_large_rmappable(folio); + + if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + ret = -ENOMEM; + goto abort; + } + + __folio_mark_uptodate(folio); + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) + goto abort; + + if (folio_is_device_private(folio)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pmd(swp_entry); + } else { + if (folio_is_zone_device(folio) && + !folio_is_device_coherent(folio)) { + goto abort; + } + entry = folio_mk_pmd(folio, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pmd_mkwrite(pmd_mkdirty(entry), vma); + } + + ptl = 
pmd_lock(vma->vm_mm, pmdp); + csa_ret = check_stable_address_space(vma->vm_mm); + if (csa_ret) + goto abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + if (!pmd_none(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto unlock_abort; + flush = true; + } else if (!pmd_none(*pmdp)) + goto unlock_abort; + + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + if (!folio_is_zone_device(folio)) + folio_add_lru_vma(folio, vma); + folio_get(folio); + + if (flush) { + pte_free(vma->vm_mm, pgtable); + flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, addr, pmdp); + } else { + pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } + set_pmd_at(vma->vm_mm, addr, pmdp, entry); + update_mmu_cache_pmd(vma, addr, pmdp); + + spin_unlock(ptl); + + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + + return 0; + +unlock_abort: + spin_unlock(ptl); +abort: + for (i = 0; i < HPAGE_PMD_NR; i++) + src[i] &= ~MIGRATE_PFN_MIGRATE; + return 0; +} + +static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, + unsigned long idx, unsigned long addr, + struct folio *folio) +{ + unsigned long i; + unsigned long pfn; + unsigned long flags; + int ret = 0; + + folio_get(folio); + split_huge_pmd_address(migrate->vma, addr, true); + ret = folio_split_unmapped(folio, 0); + if (ret) + return ret; + migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND; + flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1); + pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT; + for (i = 1; i < HPAGE_PMD_NR; i++) + migrate->src[i+idx] = migrate_pfn(pfn + i) | flags; + return ret; +} +#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + pmd_t *pmdp) +{ + return 0; +} + +static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, + unsigned long idx, unsigned long addr, + struct folio *folio) +{ + return 0; +} +#endif + +static unsigned long migrate_vma_nr_pages(unsigned long *src) +{ + unsigned long nr = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (*src & MIGRATE_PFN_COMPOUND) + nr = HPAGE_PMD_NR; +#else + if (*src & MIGRATE_PFN_COMPOUND) + VM_WARN_ON_ONCE(true); +#endif + return nr; +} + /* * This code closely matches the code in: * __handle_mm_fault() @@ -585,9 +967,10 @@ EXPORT_SYMBOL(migrate_vma_setup); */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, - struct page *page, + unsigned long *dst, unsigned long *src) { + struct page *page = migrate_pfn_to_page(*dst); struct folio *folio = page_folio(page); struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; @@ -615,8 +998,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp)) - goto abort; + + if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) { + int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page, + src, pmdp); + if (ret) + goto abort; + return; + } + + if (!pmd_none(*pmdp)) { + if (pmd_trans_huge(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto abort; + split_huge_pmd(vma, pmdp, addr); + } else if (pmd_leaf(*pmdp)) + goto 
abort; + } + if (pte_alloc(mm, pmdp)) goto abort; if (unlikely(anon_vma_prepare(vma))) @@ -704,26 +1103,28 @@ static void __migrate_device_pages(unsigned long *src_pfns, struct migrate_vma *migrate) { struct mmu_notifier_range range; - unsigned long i; + unsigned long i, j; bool notified = false; + unsigned long addr; - for (i = 0; i < npages; i++) { + for (i = 0; i < npages; ) { struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; struct folio *newfolio, *folio; int r, extra_cnt = 0; + unsigned long nr = 1; if (!newpage) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } if (!page) { unsigned long addr; if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; + goto next; /* * The only time there is no vma is when called from @@ -741,15 +1142,57 @@ static void __migrate_device_pages(unsigned long *src_pfns, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); } - migrate_vma_insert_page(migrate, addr, newpage, - &src_pfns[i]); - continue; + + if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) && + (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { + nr = migrate_vma_nr_pages(&src_pfns[i]); + src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; + } else { + nr = 1; + } + + for (j = 0; j < nr && i + j < npages; j++) { + src_pfns[i+j] |= MIGRATE_PFN_MIGRATE; + migrate_vma_insert_page(migrate, + addr + j * PAGE_SIZE, + &dst_pfns[i+j], &src_pfns[i+j]); + } + goto next; } newfolio = page_folio(newpage); folio = page_folio(page); mapping = folio_mapping(folio); + /* + * If THP migration is enabled, check if both src and dst + * can migrate large pages + */ + if (thp_migration_supported()) { + if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && + (src_pfns[i] & MIGRATE_PFN_COMPOUND) && + !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) { + + if (!migrate) { + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND); + goto next; + } + nr = 1 << folio_order(folio); + addr = migrate->start + i * PAGE_SIZE; + if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) { + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND); + goto next; + } + } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && + (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && + !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + } + } + + if (folio_is_device_private(newfolio) || folio_is_device_coherent(newfolio)) { if (mapping) { @@ -762,7 +1205,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (!folio_test_anon(folio) || !folio_free_swap(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } } } else if (folio_is_zone_device(newfolio)) { @@ -770,18 +1213,25 @@ static void __migrate_device_pages(unsigned long *src_pfns, * Other types of ZONE_DEVICE page are not supported. 
*/ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } BUG_ON(folio_test_writeback(folio)); if (migrate && migrate->fault_page == page) extra_cnt = 1; - r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r) - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - else - folio_migrate_flags(newfolio, folio); + for (j = 0; j < nr && i + j < npages; j++) { + folio = page_folio(migrate_pfn_to_page(src_pfns[i+j])); + newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j])); + + r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); + if (r) + src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE; + else + folio_migrate_flags(newfolio, folio); + } +next: + i += nr; } if (notified) @@ -943,10 +1393,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) int migrate_device_range(unsigned long *src_pfns, unsigned long start, unsigned long npages) { - unsigned long i, pfn; + unsigned long i, j, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (pfn = start, i = 0; i < npages; pfn++, i++) src_pfns[i] = migrate_device_pfn_lock(pfn); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + pfn += j - 1; + } + } migrate_device_unmap(src_pfns, npages, NULL); @@ -964,10 +1427,22 @@ EXPORT_SYMBOL(migrate_device_range); */ int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) { - unsigned long i; + unsigned long i, j; + + for (i = 0; i < npages; i++) { + struct page *page = pfn_to_page(src_pfns[i]); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (i = 0; i < npages; i++) src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + } + } migrate_device_unmap(src_pfns, npages, NULL); diff --git a/mm/mincore.c b/mm/mincore.c index 8ec4719370e1..e5d13eea9234 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,7 +14,7 @@ #include <linux/mman.h> #include <linux/syscalls.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> @@ -32,11 +32,22 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, spinlock_t *ptl; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. */ - present = pte && !huge_pte_none_mostly(huge_ptep_get(walk->mm, addr, pte)); + if (!pte) { + present = 0; + } else { + const pte_t ptep = huge_ptep_get(walk->mm, addr, pte); + + if (huge_pte_none(ptep) || pte_is_marker(ptep)) + present = 0; + else + present = 1; + } + for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; @@ -63,7 +74,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem) * absent. Page table may contain migration or hwpoison * entries which are always uptodate. 
*/ - if (non_swap_entry(entry)) + if (!softleaf_is_swap(entry)) return !shmem; /* @@ -175,8 +186,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t pte = ptep_get(ptep); step = 1; - /* We need to do cache lookup too for pte markers */ - if (pte_none_mostly(pte)) + /* We need to do cache lookup too for markers */ + if (pte_none(pte) || pte_is_marker(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) { @@ -191,7 +202,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - *vec = mincore_swap(pte_to_swp_entry(pte), false); + const softleaf_t entry = softleaf_from_pte(pte); + + *vec = mincore_swap(entry, false); } vec += step; } diff --git a/mm/mlock.c b/mm/mlock.c index bb0776f5ef7c..2f699c3497a5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -478,7 +478,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; - vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; diff --git a/mm/mm_init.c b/mm/mm_init.c index 3db2dea7db4c..c6812b4dbb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarchy will be created via register_one_node() + * No sysfs hierarchy will be created via register_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of @@ -2469,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename, panic("Failed to allocate %s hash table\n", tablename); pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + tablename, 1UL << log2qty, get_order(size), size, virt ? (huge ? 
"vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) diff --git a/mm/mmap.c b/mm/mmap.c index 5fd3b80fda1d..4bdb9ffa9e25 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags) +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { - if (mm_flags_test(MMF_TOPDOWN, mm)) + if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { - addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) @@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } unsigned long -mm_get_unmapped_area(struct mm_struct *mm, struct file *file, - unsigned long addr, unsigned long len, +mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area_vmflags(mm, file, addr, len, - pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); @@ -1277,7 +1274,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX); mmap_read_unlock(mm); /* @@ -1451,8 +1448,10 @@ static struct vm_area_struct *__install_special_mapping( return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); - vm_flags_init(vma, (vm_flags | mm->def_flags | - VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); + vm_flags |= mm->def_flags | VM_DONTEXPAND; + if (pgtable_supports_soft_dirty()) + vm_flags |= VM_SOFTDIRTY; + vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; @@ -1750,7 +1749,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) for_each_vma(vmi, mpnt) { struct file *file; - vma_start_write(mpnt); + retval = vma_start_write_killable(mpnt); + if (retval < 0) + goto loop_out; if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); @@ -1761,14 +1762,6 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) continue; } charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 0a0db5849b8e..7421b7ea8001 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -45,10 +45,19 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +/* + * __vma_enter_locked() returns 0 immediately if the vma is not + * attached, otherwise it waits for any current readers to finish and + * returns 1. Returns -EINTR if a signal is received while waiting. + */ +static inline int __vma_enter_locked(struct vm_area_struct *vma, + bool detaching, int state) { + int err; unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + mmap_assert_write_locked(vma->vm_mm); + /* Additional refcnt if the vma is attached. */ if (!detaching) tgt_refcnt++; @@ -58,15 +67,27 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) - return false; + return 0; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); - rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, - TASK_UNINTERRUPTIBLE); + state); + if (err) { + if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) { + /* + * The wait failed, but the last reader went away + * as well. Tell the caller the VMA is detached. 
+ */ + WARN_ON_ONCE(!detaching); + err = 0; + } + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + return err; + } lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - return true; + return 1; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) @@ -75,16 +96,14 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state) { - bool locked; + int locked; - /* - * __vma_enter_locked() returns false immediately if the vma is not - * attached, otherwise it waits until refcnt is indicating that vma - * is attached with no readers. - */ - locked = __vma_enter_locked(vma, false); + locked = __vma_enter_locked(vma, false, state); + if (locked < 0) + return locked; /* * We should use WRITE_ONCE() here because we can have concurrent reads @@ -100,6 +119,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } + + return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); @@ -118,7 +139,7 @@ void vma_mark_detached(struct vm_area_struct *vma) */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true)) { + if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; __vma_exit_locked(vma, &detached); @@ -241,6 +262,7 @@ retry: if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ + mas_set(&mas, address); goto retry; } diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 374aa6f021c6..247e3f9db6c7 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -9,8 +9,8 @@ #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> +#include <linux/pgalloc.h> -#include <asm/pgalloc.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER diff --git a/mm/mprotect.c b/mm/mprotect.c index 113b48985834..283889e4f1ce 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,9 +29,7 @@ #include <linux/uaccess.h> #include <linux/mm_inline.h> #include <linux/pgtable.h> -#include <linux/sched/sysctl.h> #include <linux/userfaultfd_k.h> -#include <linux/memory-tiers.h> #include <uapi/linux/mman.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> @@ -118,62 +116,6 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags); } -static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr, - pte_t oldpte, pte_t *pte, int target_node, - struct folio *folio) -{ - bool ret = true; - bool toptier; - int nid; - - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) - goto skip; - - if (!folio) - goto skip; - - if (folio_is_zone_device(folio) || folio_test_ksm(folio)) - goto skip; - - /* Also skip shared copy-on-write pages */ - if (is_cow_mapping(vma->vm_flags) && - (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio))) - goto skip; - - /* - * While migration can move some dirty pages, - * it cannot move them all from MIGRATE_ASYNC - * context. - */ - if (folio_is_file_lru(folio) && folio_test_dirty(folio)) - goto skip; - - /* - * Don't mess with PTEs if page is already on the node - * a single-threaded process is running on. 
- */ - nid = folio_nid(folio); - if (target_node == nid) - goto skip; - - toptier = node_is_toptier(nid); - - /* - * Skip scanning top tier node if normal numa - * balancing is disabled - */ - if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) - goto skip; - - ret = false; - if (folio_use_access_time(folio)) - folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); - -skip: - return ret; -} - /* Set nr_ptes number of ptes, starting from idx */ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, @@ -276,7 +218,7 @@ static long change_pte_range(struct mmu_gather *tlb, pte_t *pte, oldpte; spinlock_t *ptl; long pages = 0; - int target_node = NUMA_NO_NODE; + bool is_private_single_threaded; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -287,10 +229,8 @@ static long change_pte_range(struct mmu_gather *tlb, if (!pte) return -EAGAIN; - /* Get target node for single threaded private VMAs */ - if (prot_numa && !(vma->vm_flags & VM_SHARED) && - atomic_read(&vma->vm_mm->mm_users) == 1) - target_node = numa_node_id(); + if (prot_numa) + is_private_single_threaded = vma_is_single_threaded_private(vma); flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); @@ -304,23 +244,26 @@ static long change_pte_range(struct mmu_gather *tlb, struct page *page; pte_t ptent; + /* Already in the desired state. */ + if (prot_numa && pte_protnone(oldpte)) + continue; + page = vm_normal_page(vma, addr, oldpte); if (page) folio = page_folio(page); + /* * Avoid trapping faults against the zero or KSM * pages. See similar comment in change_huge_pmd. */ - if (prot_numa) { - int ret = prot_numa_skip(vma, addr, oldpte, pte, - target_node, folio); - if (ret) { + if (prot_numa && + !folio_can_map_prot_numa(folio, vma, + is_private_single_threaded)) { - /* determine batch to skip */ - nr_ptes = mprotect_folio_pte_batch(folio, - pte, oldpte, max_nr_ptes, /* flags = */ 0); - continue; - } + /* determine batch to skip */ + nr_ptes = mprotect_folio_pte_batch(folio, + pte, oldpte, max_nr_ptes, /* flags = */ 0); + continue; } nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); @@ -354,12 +297,31 @@ static long change_pte_range(struct mmu_gather *tlb, prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); pages += nr_ptes; - } else if (is_swap_pte(oldpte)) { - swp_entry_t entry = pte_to_swp_entry(oldpte); + } else if (pte_none(oldpte)) { + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + continue; + + if (userfaultfd_wp_use_markers(vma)) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. 
+ */ + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } + } else { + softleaf_t entry = softleaf_from_pte(oldpte); pte_t newpte; - if (is_writable_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + if (softleaf_is_migration_write(entry)) { + const struct folio *folio = softleaf_to_folio(entry); /* * A protection check is difficult so @@ -373,7 +335,7 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - } else if (is_writable_device_private_entry(entry)) { + } else if (softleaf_is_device_private_write(entry)) { /* * We do not preserve soft-dirtiness. See * copy_nonpresent_pte() for explanation. @@ -383,14 +345,14 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { /* * Ignore error swap entries unconditionally, * because any access should sigbus/sigsegv * anyway. */ - if (is_poisoned_swp_entry(entry) || - is_guard_swp_entry(entry)) + if (softleaf_is_poison_marker(entry) || + softleaf_is_guard_marker(entry)) continue; /* * If this is uffd-wp pte marker and we'd like @@ -415,28 +377,6 @@ static long change_pte_range(struct mmu_gather *tlb, set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } - } else { - /* It must be an none page, or what else?.. */ - WARN_ON_ONCE(!pte_none(oldpte)); - - /* - * Nobody plays with any none ptes besides - * userfaultfd when applying the protections. - */ - if (likely(!uffd_wp)) - continue; - - if (userfaultfd_wp_use_markers(vma)) { - /* - * For file-backed mem, we need to be able to - * wr-protect a none pte, because even if the - * pte is none, the page/swap cache could - * exist. Doing that by install a marker. 
- */ - set_pte_at(vma->vm_mm, addr, pte, - make_pte_marker(PTE_MARKER_UFFD_WP)); - pages++; - } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -534,7 +474,7 @@ again: goto next; _pmd = pmdp_get_lockless(pmd); - if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) { + if (pmd_is_huge(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false); @@ -599,7 +539,7 @@ again: break; } - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (pud_none(pud)) continue; @@ -813,7 +753,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, newflags &= ~VM_ACCOUNT; } - vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; diff --git a/mm/mremap.c b/mm/mremap.c index bd7314898ec5..672264807db6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -17,7 +17,7 @@ #include <linux/swap.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/highmem.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -25,10 +25,10 @@ #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> +#include <linux/pgalloc.h> #include <asm/cacheflush.h> #include <asm/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" @@ -158,16 +158,20 @@ static void drop_rmap_locks(struct vm_area_struct *vma) static pte_t move_soft_dirty_pte(pte_t pte) { + if (pte_none(pte)) + return pte; + /* * Set soft dirty bit so we can notice * in userspace the ptes were moved. */ -#ifdef CONFIG_MEM_SOFT_DIRTY - if (pte_present(pte)) - pte = pte_mksoft_dirty(pte); - else if (is_swap_pte(pte)) - pte = pte_swp_mksoft_dirty(pte); -#endif + if (pgtable_supports_soft_dirty()) { + if (pte_present(pte)) + pte = pte_mksoft_dirty(pte); + else + pte = pte_swp_mksoft_dirty(pte); + } + return pte; } @@ -187,7 +191,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, ptep, pte, max_nr); + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE); } static int move_ptes(struct pagetable_move_control *pmc, @@ -288,13 +292,13 @@ static int move_ptes(struct pagetable_move_control *pmc, pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) pte_clear(mm, new_addr, new_ptep); else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = pte_clear_uffd_wp(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_clear_uffd_wp(pte); } set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); @@ -847,7 +851,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) if (!new_pmd) break; again: - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { + if (pmd_is_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; diff --git a/mm/mseal.c b/mm/mseal.c index e5b205562d2e..ae442683c5c0 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -66,12 +66,13 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { - unsigned long curr_end = MIN(vma->vm_end, end); + const unsigned long curr_end = MIN(vma->vm_end, end); if (!(vma->vm_flags & VM_SEALED)) { - vma = vma_modify_flags(&vmi, prev, 
vma, - curr_start, curr_end, - vma->vm_flags | VM_SEALED); + vm_flags_t vm_flags = vma->vm_flags | VM_SEALED; + + vma = vma_modify_flags(&vmi, prev, vma, curr_start, + curr_end, &vm_flags); if (IS_ERR(vma)) return PTR_ERR(vma); vm_flags_set(vma, VM_SEALED); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c145b0feecc1..5eb11fbba704 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -472,6 +472,7 @@ static void dump_header(struct oom_control *oc) if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } + mem_cgroup_show_protected_memory(oc->memcg); if (sysctl_oom_dump_tasks) dump_tasks(oc); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..ccdeb0e84d39 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } -static xa_mark_t wbc_to_tag(struct writeback_control *wbc) -{ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - return PAGECACHE_TAG_TOWRITE; - return PAGECACHE_TAG_DIRTY; -} static pgoff_t wbc_end(struct writeback_control *wbc) { @@ -2658,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio, inode_attach_wb(inode, folio); wb = inode_to_wb(inode); - __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..822e05f1a964 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -99,9 +99,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock); /* * On SMP, spin_trylock is sufficient protection. * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + * Pass flags to a no-op inline function to typecheck and silence the unused + * variable warning. */ -#define pcp_trylock_prepare(flags) do { } while (0) -#define pcp_trylock_finish(flag) do { } while (0) +static inline void __pcp_trylock_noop(unsigned long *flags) { } +#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags)) +#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags)) #else /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ @@ -129,15 +132,6 @@ static DEFINE_MUTEX(pcp_batch_high_lock); * Generic helper to lookup and a per-cpu variable with an embedded spinlock. * Return value should be used with equivalent unlock helper. */ -#define pcpu_spin_lock(type, member, ptr) \ -({ \ - type *_ret; \ - pcpu_task_pin(); \ - _ret = this_cpu_ptr(ptr); \ - spin_lock(&_ret->member); \ - _ret; \ -}) - #define pcpu_spin_trylock(type, member, ptr) \ ({ \ type *_ret; \ @@ -157,14 +151,21 @@ static DEFINE_MUTEX(pcp_batch_high_lock); }) /* struct per_cpu_pages specific helpers. 
*/ -#define pcp_spin_lock(ptr) \ - pcpu_spin_lock(struct per_cpu_pages, lock, ptr) - -#define pcp_spin_trylock(ptr) \ - pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) +#define pcp_spin_trylock(ptr, UP_flags) \ +({ \ + struct per_cpu_pages *__ret; \ + pcp_trylock_prepare(UP_flags); \ + __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \ + if (!__ret) \ + pcp_trylock_finish(UP_flags); \ + __ret; \ +}) -#define pcp_spin_unlock(ptr) \ - pcpu_spin_unlock(lock, ptr) +#define pcp_spin_unlock(ptr, UP_flags) \ +({ \ + pcpu_spin_unlock(lock, ptr); \ + pcp_trylock_finish(UP_flags); \ +}) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -1822,14 +1823,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * If memory tags should be zeroed * (which happens only when memory should be initialized as well). */ - if (zero_tags) { - /* Initialize both memory and memory tags. */ - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); + if (zero_tags) + init = !tag_clear_highpages(page, 1 << order); - /* Take note that memory was initialized by the loop above. */ - init = false; - } if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. */ @@ -2557,10 +2553,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to decay the PCP high. * Return whether there are addition works to do. */ -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { - int high_min, to_drain, batch; - int todo = 0; + int high_min, to_drain, to_drain_batched, batch; + bool todo = false; high_min = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); @@ -2573,15 +2569,18 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), pcp->high - (pcp->high >> 3), high_min); if (pcp->high > high_min) - todo++; + todo = true; } to_drain = pcp->count - pcp->high; - if (to_drain > 0) { + while (to_drain > 0) { + to_drain_batched = min(to_drain, batch); spin_lock(&pcp->lock); - free_pcppages_bulk(zone, to_drain, pcp, 0); + free_pcppages_bulk(zone, to_drain_batched, pcp, 0); spin_unlock(&pcp->lock); - todo++; + todo = true; + + to_drain -= to_drain_batched; } return todo; @@ -2815,12 +2814,22 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_frozen_page_commit(struct zone *zone, +/* + * Tune pcp alloc factor and adjust count & free_count. Free pages to bring the + * pcp's watermarks below high. + * + * May return a freed pcp, if during page freeing the pcp spinlock cannot be + * reacquired. Return true if pcp is locked, false otherwise. + */ +static bool free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order, fpi_t fpi_flags) + unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags) { int high, batch; + int to_free, to_free_batched; int pindex; + int cpu = smp_processor_id(); + int ret = true; bool free_high = false; /* @@ -2858,15 +2867,42 @@ static void free_frozen_page_commit(struct zone *zone, * Do not attempt to take a zone lock. Let pcp->count get * over high mark temporarily. 
*/ - return; + return true; } high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count < high) - return; + return true; + + to_free = nr_pcp_free(pcp, batch, high, free_high); + while (to_free > 0 && pcp->count > 0) { + to_free_batched = min(to_free, batch); + free_pcppages_bulk(zone, to_free_batched, pcp, pindex); + to_free -= to_free_batched; + + if (to_free == 0 || pcp->count == 0) + break; + + pcp_spin_unlock(pcp, *UP_flags); + + pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags); + if (!pcp) { + ret = false; + break; + } + + /* + * Check if this thread has been migrated to a different CPU. + * If that is the case, give up and indicate that the pcp is + * returned in an unlocked state. + */ + if (smp_processor_id() != cpu) { + pcp_spin_unlock(pcp, *UP_flags); + ret = false; + break; + } + } - free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), - pcp, pindex); if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && zone_watermark_ok(zone, 0, high_wmark_pages(zone), ZONE_MOVABLE, 0)) { @@ -2884,6 +2920,7 @@ static void free_frozen_page_commit(struct zone *zone, next_memory_node(pgdat->node_id) < MAX_NUMNODES) atomic_set(&pgdat->kswapd_failures, 0); } + return ret; } /* @@ -2892,7 +2929,7 @@ static void free_frozen_page_commit(struct zone *zone, static void __free_frozen_pages(struct page *page, unsigned int order, fpi_t fpi_flags) { - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); @@ -2928,15 +2965,15 @@ static void __free_frozen_pages(struct page *page, unsigned int order, add_page_to_zone_llist(zone, page, order); return; } - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); if (pcp) { - free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); - pcp_spin_unlock(pcp); + if (!free_frozen_page_commit(zone, pcp, page, migratetype, + order, fpi_flags, &UP_flags)) + return; + pcp_spin_unlock(pcp, UP_flags); } else { free_one_page(zone, page, pfn, order, fpi_flags); } - pcp_trylock_finish(UP_flags); } void free_frozen_pages(struct page *page, unsigned int order) @@ -2949,7 +2986,7 @@ void free_frozen_pages(struct page *page, unsigned int order) */ void free_unref_folios(struct folio_batch *folios) { - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int i, j; @@ -2992,8 +3029,7 @@ void free_unref_folios(struct folio_batch *folios) if (zone != locked_zone || is_migrate_isolate(migratetype)) { if (pcp) { - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); + pcp_spin_unlock(pcp, UP_flags); locked_zone = NULL; pcp = NULL; } @@ -3012,10 +3048,8 @@ void free_unref_folios(struct folio_batch *folios) * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
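The reworked free_frozen_page_commit() above frees pages in batches, drops the pcp lock between batches, and reports through its return value whether the lock is still held on return (the smp_processor_id() migration check is kernel-specific and omitted here). A small userspace sketch of that shape, using a pthread mutex and invented names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Work is done in batches, the lock is dropped between batches so others can
 * make progress, and the return value tells the caller whether the lock is
 * still held. Caller enters with *lock held.
 */
static bool drain_in_batches(pthread_mutex_t *lock, int *count, int batch)
{
	while (*count > 0) {
		int step = *count < batch ? *count : batch;

		*count -= step;			/* "free" one batch */
		if (*count == 0)
			break;

		pthread_mutex_unlock(lock);	/* let others in */
		if (pthread_mutex_trylock(lock) != 0)
			return false;		/* lock not reacquired */
	}
	return true;				/* still locked */
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	int count = 10;

	pthread_mutex_lock(&lock);
	if (drain_in_batches(&lock, &count, 3))
		pthread_mutex_unlock(&lock);
	printf("remaining: %d\n", count);
	return 0;
}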
*/ - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); if (unlikely(!pcp)) { - pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, pfn, order, FPI_NONE); continue; @@ -3031,14 +3065,15 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_frozen_page_commit(zone, pcp, &folio->page, migratetype, - order, FPI_NONE); + if (!free_frozen_page_commit(zone, pcp, &folio->page, + migratetype, order, FPI_NONE, &UP_flags)) { + pcp = NULL; + locked_zone = NULL; + } } - if (pcp) { - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); - } + if (pcp) + pcp_spin_unlock(pcp, UP_flags); folio_batch_reinit(folios); } @@ -3289,15 +3324,12 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); - if (!pcp) { - pcp_trylock_finish(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + if (!pcp) return NULL; - } /* * On allocation, reduce the number of pages that are batch freed. @@ -3307,8 +3339,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); + pcp_spin_unlock(pcp, UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -3941,6 +3972,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) filter &= ~SHOW_MEM_FILTER_NODES; __show_mem(filter, nodemask, gfp_zone(gfp_mask)); + mem_cgroup_show_protected_memory(NULL); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -4648,11 +4680,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (unlikely(nofail)) { /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); - /* * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM, * otherwise, we may result in lockup. */ @@ -4982,20 +5009,25 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * @nr_pages: The number of pages desired in the array * @page_array: Array to store the pages * - * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to the page_array. + * This is a batched version of the page allocator that attempts to allocate + * @nr_pages quickly. Pages are added to @page_array. * - * Note that only NULL elements are populated with pages and nr_pages - * is the maximum number of pages that will be stored in the array. + * Note that only the elements in @page_array that were cleared to %NULL on + * entry are populated with newly allocated pages. @nr_pages is the maximum + * number of pages that will be stored in the array. * - * Returns the number of pages in the array. + * Returns the number of pages in @page_array, including ones already + * allocated on entry. 
This can be less than the number requested in @nr_pages, + * but all empty slots are filled from the beginning. I.e., if all slots in + * @page_array were set to %NULL on entry, the slots from 0 to the return value + * - 1 will be filled. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct page **page_array) { struct page *page; - unsigned long __maybe_unused UP_flags; + unsigned long UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5089,10 +5121,9 @@ retry_this_zone: goto failed; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock(zone->per_cpu_pageset); + pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); if (!pcp) - goto failed_irq; + goto failed; /* Attempt the batch allocation */ pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; @@ -5109,8 +5140,8 @@ retry_this_zone: if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock(pcp); - goto failed_irq; + pcp_spin_unlock(pcp, UP_flags); + goto failed; } break; } @@ -5121,8 +5152,7 @@ retry_this_zone: page_array[nr_populated++] = page; } - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); + pcp_spin_unlock(pcp, UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); @@ -5130,9 +5160,6 @@ retry_this_zone: out: return nr_populated; -failed_irq: - pcp_trylock_finish(UP_flags); - failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); if (page) @@ -5860,15 +5887,14 @@ static int zone_batchsize(struct zone *zone) int batch; /* - * The number of pages to batch allocate is either ~0.1% - * of the zone or 1MB, whichever is smaller. The batch + * The number of pages to batch allocate is either ~0.025% + * of the zone or 256KB, whichever is smaller. The batch * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; + batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE); + if (batch <= 1) + return 1; /* * Clamp the batch to a 2^n - 1 value. Having a power @@ -6019,7 +6045,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { int new_high_min, new_high_max, new_batch; - new_batch = max(1, zone_batchsize(zone)); + new_batch = zone_batchsize(zone); if (percpu_pagelist_high_fraction) { new_high_min = zone_highsize(zone, new_batch, cpu_online, percpu_pagelist_high_fraction); @@ -6285,10 +6311,21 @@ static void calculate_totalreserve_pages(void) long max = 0; unsigned long managed_pages = zone_managed_pages(zone); - /* Find valid and maximum lowmem_reserve in the zone */ - for (j = i; j < MAX_NR_ZONES; j++) - max = max(max, zone->lowmem_reserve[j]); + /* + * lowmem_reserve[j] is monotonically non-decreasing + * in j for a given zone (see + * setup_per_zone_lowmem_reserve()). The maximum + * valid reserve lives at the highest index with a + * non-zero value, so scan backwards and stop at the + * first hit. + */ + for (j = MAX_NR_ZONES - 1; j > i; j--) { + if (!zone->lowmem_reserve[j]) + continue; + max = zone->lowmem_reserve[j]; + break; + } /* we treat the high watermark as reserved pages. 
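The calculate_totalreserve_pages() hunk above replaces the forward max() scan with a backward scan that stops at the first non-zero entry, relying on the monotonicity spelled out in the setup_per_zone_lowmem_reserve() comment that follows. A tiny standalone illustration with made-up reserve values:

#include <stdio.h>

#define NR_ZONES 5

/*
 * The populated lowmem_reserve[] entries above index i are non-decreasing;
 * empty higher zones leave zero entries, which the backward scan skips, so
 * the first non-zero hit from the top is the maximum.
 */
static long max_reserve(const long reserve[NR_ZONES], int i)
{
	for (int j = NR_ZONES - 1; j > i; j--)
		if (reserve[j])
			return reserve[j];
	return 0;
}

int main(void)
{
	const long reserve[NR_ZONES] = { 0, 32, 32, 768, 0 };

	/* the trailing zero is skipped; 768 is the maximum valid entry above i=0 */
	printf("max above zone 0: %ld\n", max_reserve(reserve, 0));
	return 0;
}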
*/ max += high_wmark_pages(zone); @@ -6313,7 +6350,21 @@ static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; enum zone_type i, j; - + /* + * For a given zone node_zones[i], lowmem_reserve[j] (j > i) + * represents how many pages in zone i must effectively be kept + * in reserve when deciding whether an allocation class that is + * allowed to allocate from zones up to j may fall back into + * zone i. + * + * As j increases, the allocation class can use a strictly larger + * set of fallback zones and therefore must not be allowed to + * deplete low zones more aggressively than a less flexible one. + * As a result, lowmem_reserve[j] is required to be monotonically + * non-decreasing in j for each zone i. Callers such as + * calculate_totalreserve_pages() rely on this monotonicity when + * selecting the maximum reserve entry. + */ for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES - 1; i++) { struct zone *zone = &pgdat->node_zones[i]; diff --git a/mm/page_idle.c b/mm/page_idle.c index a82b340dc204..96bb94c7b6c3 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -71,8 +71,11 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte); referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) - referenced = true; + pmd_t pmdval = pmdp_get(pvmw.pmd); + + if (likely(pmd_present(pmdval))) + referenced |= pmdp_clear_young_notify(vma, addr, pvmw.pmd); + referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); } else { /* unexpected pmd-mapped page? */ WARN_ON_ONCE(1); @@ -101,19 +104,15 @@ static void page_idle_clear_pte_refs(struct folio *folio) .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) return; - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, diff --git a/mm/page_owner.c b/mm/page_owner.c index 589ec37c94aa..a70245684206 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -45,6 +45,15 @@ static struct stack failure_stack; static struct stack *stack_list; static DEFINE_SPINLOCK(stack_list_lock); +#define STACK_PRINT_FLAG_STACK 0x1 +#define STACK_PRINT_FLAG_PAGES 0x2 +#define STACK_PRINT_FLAG_HANDLE 0x4 + +struct stack_print_ctx { + struct stack *stack; + u8 flags; +}; + static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -760,7 +769,7 @@ static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) return file->f_pos; } -static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) +static void init_pages_in_zone(struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); @@ -827,31 +836,18 @@ ext_put_continue: } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", - pgdat->node_id, zone->name, count); -} - -static void init_zones_in_node(pg_data_t *pgdat) -{ - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - - init_pages_in_zone(pgdat, 
zone); - } + zone->zone_pgdat->node_id, zone->name, count); } static void init_early_allocated_pages(void) { - pg_data_t *pgdat; + struct zone *zone; - for_each_online_pgdat(pgdat) - init_zones_in_node(pgdat); + for_each_populated_zone(zone) + init_pages_in_zone(zone); } -static const struct file_operations proc_page_owner_operations = { +static const struct file_operations page_owner_fops = { .read = read_page_owner, .llseek = lseek_page_owner, }; @@ -859,6 +855,7 @@ static const struct file_operations proc_page_owner_operations = { static void *stack_start(struct seq_file *m, loff_t *ppos) { struct stack *stack; + struct stack_print_ctx *ctx = m->private; if (*ppos == -1UL) return NULL; @@ -870,9 +867,9 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) * value of stack_list. */ stack = smp_load_acquire(&stack_list); - m->private = stack; + ctx->stack = stack; } else { - stack = m->private; + stack = ctx->stack; } return stack; @@ -881,10 +878,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) { struct stack *stack = v; + struct stack_print_ctx *ctx = m->private; stack = stack->next; *ppos = stack ? *ppos + 1 : -1UL; - m->private = stack; + ctx->stack = stack; return stack; } @@ -898,20 +896,28 @@ static int stack_print(struct seq_file *m, void *v) unsigned long *entries; unsigned long nr_entries; struct stack_record *stack_record = stack->stack_record; + struct stack_print_ctx *ctx = m->private; if (!stack->stack_record) return 0; - nr_entries = stack_record->size; - entries = stack_record->entries; nr_base_pages = refcount_read(&stack_record->count) - 1; - if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) + if (ctx->flags & STACK_PRINT_FLAG_PAGES && + (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)) return 0; - for (i = 0; i < nr_entries; i++) - seq_printf(m, " %pS\n", (void *)entries[i]); - seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); + if (ctx->flags & STACK_PRINT_FLAG_STACK) { + nr_entries = stack_record->size; + entries = stack_record->entries; + for (i = 0; i < nr_entries; i++) + seq_printf(m, " %pS\n", (void *)entries[i]); + } + if (ctx->flags & STACK_PRINT_FLAG_HANDLE) + seq_printf(m, "handle: %d\n", stack_record->handle.handle); + if (ctx->flags & STACK_PRINT_FLAG_PAGES) + seq_printf(m, "nr_base_pages: %d\n", nr_base_pages); + seq_putc(m, '\n'); return 0; } @@ -929,10 +935,20 @@ static const struct seq_operations page_owner_stack_op = { static int page_owner_stack_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &page_owner_stack_op, 0); + int ret = seq_open_private(file, &page_owner_stack_op, + sizeof(struct stack_print_ctx)); + + if (!ret) { + struct seq_file *m = file->private_data; + struct stack_print_ctx *ctx = m->private; + + ctx->flags = (uintptr_t) inode->i_private; + } + + return ret; } -static const struct file_operations page_owner_stack_operations = { +static const struct file_operations page_owner_stack_fops = { .open = page_owner_stack_open, .read = seq_read, .llseek = seq_lseek, @@ -951,7 +967,7 @@ static int page_owner_threshold_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, +DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get, &page_owner_threshold_set, "%llu"); @@ -964,14 +980,22 @@ static int __init pageowner_init(void) return 0; } - debugfs_create_file("page_owner", 0400, NULL, NULL, - 
&proc_page_owner_operations); + debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops); dir = debugfs_create_dir("page_owner_stacks", NULL); - debugfs_create_file("show_stacks", 0400, dir, NULL, - &page_owner_stack_operations); + debugfs_create_file("show_stacks", 0400, dir, + (void *)(STACK_PRINT_FLAG_STACK | + STACK_PRINT_FLAG_PAGES), + &page_owner_stack_fops); + debugfs_create_file("show_handles", 0400, dir, + (void *)(STACK_PRINT_FLAG_HANDLE | + STACK_PRINT_FLAG_PAGES), + &page_owner_stack_fops); + debugfs_create_file("show_stacks_handles", 0400, dir, + (void *)(STACK_PRINT_FLAG_STACK | + STACK_PRINT_FLAG_HANDLE), + &page_owner_stack_fops); debugfs_create_file("count_threshold", 0600, dir, NULL, - &proc_page_owner_threshold); - + &page_owner_threshold_fops); return 0; } late_initcall(pageowner_init) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 4eeca782b888..741884645ab0 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -8,7 +8,7 @@ #include <linux/mm.h> #include <linux/page_table_check.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt @@ -179,18 +179,21 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ -static inline bool swap_cached_writable(swp_entry_t entry) +static inline bool softleaf_cached_writable(softleaf_t entry) { - return is_writable_device_private_entry(entry) || - is_writable_migration_entry(entry); + return softleaf_is_device_private_write(entry) || + softleaf_is_migration_write(entry); } -static inline void page_table_check_pte_flags(pte_t pte) +static void page_table_check_pte_flags(pte_t pte) { - if (pte_present(pte) && pte_uffd_wp(pte)) - WARN_ON_ONCE(pte_write(pte)); - else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) - WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); + if (pte_present(pte)) { + WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte)); + } else if (pte_swp_uffd_wp(pte)) { + const softleaf_t entry = softleaf_from_pte(pte); + + WARN_ON_ONCE(softleaf_cached_writable(entry)); + } } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, @@ -212,10 +215,14 @@ EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) { - if (pmd_present(pmd) && pmd_uffd_wp(pmd)) - WARN_ON_ONCE(pmd_write(pmd)); - else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) - WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); + if (pmd_present(pmd)) { + if (pmd_uffd_wp(pmd)) + WARN_ON_ONCE(pmd_write(pmd)); + } else if (pmd_swp_uffd_wp(pmd)) { + const softleaf_t entry = softleaf_from_pmd(pmd); + + WARN_ON_ONCE(softleaf_cached_writable(entry)); + } } void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c498a91b6706..b38a1d00c971 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -3,7 +3,7 @@ #include <linux/rmap.h> #include <linux/hugetlb.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include "internal.h" @@ -16,6 +16,7 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, spinlock_t **ptlp) { + bool is_migration; pte_t ptent; if (pvmw->flags & PVMW_SYNC) { @@ -26,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, 
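The new show_stacks, show_handles and show_stacks_handles files above select what stack_print() emits by stashing flag bits in the debugfs inode's i_private pointer and unpacking them in page_owner_stack_open(). A toy round-trip of that cast idiom, without debugfs and with illustrative flag names:

#include <stdint.h>
#include <stdio.h>

#define FLAG_STACK	0x1
#define FLAG_PAGES	0x2
#define FLAG_HANDLE	0x4

/* Flag bits travel through an opaque pointer, as with inode->i_private. */
static void open_file(void *private)
{
	uint8_t flags = (uintptr_t)private;

	if (flags & FLAG_STACK)
		printf("would print stack trace\n");
	if (flags & FLAG_HANDLE)
		printf("would print handle\n");
	if (flags & FLAG_PAGES)
		printf("would print nr_base_pages\n");
}

int main(void)
{
	open_file((void *)(uintptr_t)(FLAG_STACK | FLAG_PAGES));
	return 0;
}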
pmd_t *pmdvalp, return !!pvmw->pte; } + is_migration = pvmw->flags & PVMW_MIGRATION; again: /* * It is important to return the ptl corresponding to pte, @@ -41,11 +43,14 @@ again: ptent = ptep_get(pvmw->pte); - if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(ptent)) + if (pte_none(ptent)) { + return false; + } else if (pte_present(ptent)) { + if (is_migration) return false; - } else if (is_swap_pte(ptent)) { - swp_entry_t entry; + } else if (!is_migration) { + softleaf_t entry; + /* * Handle un-addressable ZONE_DEVICE memory. * @@ -62,12 +67,10 @@ again: * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). */ - entry = pte_to_swp_entry(ptent); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) + entry = softleaf_from_pte(ptent); + if (!softleaf_is_device_private(entry) && + !softleaf_is_device_exclusive(entry)) return false; - } else if (!pte_present(ptent)) { - return false; } spin_lock(*ptlp); if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { @@ -107,30 +110,23 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { - swp_entry_t entry; - if (!is_swap_pte(ptent)) - return false; - entry = pte_to_swp_entry(ptent); + const softleaf_t entry = softleaf_from_pte(ptent); - if (!is_migration_entry(entry)) + if (!softleaf_is_migration(entry)) return false; - pfn = swp_offset_pfn(entry); - } else if (is_swap_pte(ptent)) { - swp_entry_t entry; + pfn = softleaf_to_pfn(entry); + } else if (pte_present(ptent)) { + pfn = pte_pfn(ptent); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(ptent); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) - return false; - - pfn = swp_offset_pfn(entry); - } else { - if (!pte_present(ptent)) + if (!softleaf_is_device_private(entry) && + !softleaf_is_device_exclusive(entry)) return false; - pfn = pte_pfn(ptent); + pfn = softleaf_to_pfn(entry); } if ((pfn + pte_nr - 1) < pvmw->pfn) @@ -246,18 +242,19 @@ restart: */ pmde = pmdp_get_lockless(pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + if (pmd_trans_huge(pmde) || pmd_is_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); pmde = *pvmw->pmd; if (!pmd_present(pmde)) { - swp_entry_t entry; + softleaf_t entry; if (!thp_migration_supported() || !(pvmw->flags & PVMW_MIGRATION)) return not_found(pvmw); - entry = pmd_to_swp_entry(pmde); - if (!is_migration_entry(entry) || - !check_pmd(swp_offset_pfn(entry), pvmw)) + entry = softleaf_from_pmd(pmde); + + if (!softleaf_is_migration(entry) || + !check_pmd(softleaf_to_pfn(entry), pvmw)) return not_found(pvmw); return true; } @@ -277,6 +274,13 @@ restart: * cannot return prematurely, while zap_huge_pmd() has * cleared *pmd but not decremented compound_mapcount(). 
*/ + const softleaf_t entry = softleaf_from_pmd(pmde); + + if (softleaf_is_device_private(entry)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + return true; + } + if ((pvmw->flags & PVMW_SYNC) && thp_vma_suitable_order(vma, pvmw->address, PMD_ORDER) && diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9f91cf85a5be..90cc346a6ecf 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -5,7 +5,7 @@ #include <linux/hugetlb.h> #include <linux/mmu_context.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <asm/tlbflush.h> @@ -452,7 +452,7 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma, * We usually restrict the ability to install PTEs, but this functionality is * available to internal memory management code and provided in mm/internal.h. */ -int walk_page_range_mm(struct mm_struct *mm, unsigned long start, +int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { @@ -518,10 +518,10 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, * This check is performed on all functions which are parameterised by walk * operations and exposed in include/linux/pagewalk.h. * - * Internal memory management code can use the walk_page_range_mm() function to - * be able to use all page walking operations. + * Internal memory management code can use *_unsafe() functions to be able to + * use all page walking operations. */ -static bool check_ops_valid(const struct mm_walk_ops *ops) +static bool check_ops_safe(const struct mm_walk_ops *ops) { /* * The installation of PTEs is solely under the control of memory @@ -579,10 +579,10 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; - return walk_page_range_mm(mm, start, end, ops, private); + return walk_page_range_mm_unsafe(mm, start, end, ops, private); } /** @@ -639,7 +639,7 @@ int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end if (start >= end) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; return walk_pgd_range(start, end, &walk); @@ -678,7 +678,7 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, pgd, private); if (start >= end || !walk.mm) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; /* @@ -694,9 +694,8 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, return walk_pgd_range(start, end, &walk); } -int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, - unsigned long end, const struct mm_walk_ops *ops, - void *private) +int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, @@ -709,14 +708,22 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - if (!check_ops_valid(ops)) - return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + if (!check_ops_safe(ops)) + return -EINVAL; + + return walk_page_range_vma_unsafe(vma, 
start, end, ops, private); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { @@ -729,7 +736,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); @@ -780,7 +787,7 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, unsigned long start_addr, end_addr; int err = 0; - if (!check_ops_valid(ops)) + if (!check_ops_safe(ops)) return -EINVAL; lockdep_assert_held(&mapping->i_mmap_rwsem); @@ -966,10 +973,10 @@ pmd_table: goto found; } } else if ((flags & FW_MIGRATION) && - is_pmd_migration_entry(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + pmd_is_migration_entry(pmd)) { + const softleaf_t entry = softleaf_from_pmd(pmd); - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); expose_page = false; goto found; } @@ -1000,11 +1007,10 @@ pte_table: goto found; } } else if (!pte_none(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - if ((flags & FW_MIGRATION) && - is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { + page = softleaf_to_page(entry); expose_page = false; goto found; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index cd69caf6aa8d..4f5937090590 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), - PAGE_KERNEL, pages, PAGE_SHIFT); + PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL); } /** diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 567e2d084071..d3aec7a9926a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,7 +13,9 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> -#include <asm/pgalloc.h> +#include <linux/iommu.h> +#include <linux/pgalloc.h> + #include <asm/tlb.h> /* @@ -290,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) if (pmdvalp) *pmdvalp = pmdval; - if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval))) goto nomap; @@ -406,3 +408,41 @@ again: pte_unmap_unlock(pte, ptl); goto again; } + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +static void kernel_pgtable_work_func(struct work_struct *work); + +static struct { + struct list_head list; + /* protect above ptdesc lists */ + spinlock_t lock; + struct work_struct work; +} kernel_pgtable_work = { + .list = LIST_HEAD_INIT(kernel_pgtable_work.list), + .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), + .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), +}; + +static void kernel_pgtable_work_func(struct work_struct *work) +{ + struct ptdesc *pt, *next; + LIST_HEAD(page_list); + + spin_lock(&kernel_pgtable_work.lock); + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); +} + +void pagetable_free_kernel(struct ptdesc *pt) +{ + spin_lock(&kernel_pgtable_work.lock); + list_add(&pt->pt_list, &kernel_pgtable_work.list); + 
spin_unlock(&kernel_pgtable_work.lock); + + schedule_work(&kernel_pgtable_work.work); +} +#endif diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c index 7e9455a18aae..0d9cfbf4fe5d 100644 --- a/mm/pt_reclaim.c +++ b/mm/pt_reclaim.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/hugetlb.h> +#include <linux/pgalloc.h> + #include <asm-generic/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" diff --git a/mm/ptdump.c b/mm/ptdump.c index b600c7f864b8..973020000096 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -31,7 +31,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pgd_t val = READ_ONCE(*pgd); + pgd_t val = pgdp_get(pgd); #if CONFIG_PGTABLE_LEVELS > 4 && \ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) @@ -54,7 +54,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - p4d_t val = READ_ONCE(*p4d); + p4d_t val = p4dp_get(p4d); #if CONFIG_PGTABLE_LEVELS > 3 && \ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) @@ -77,7 +77,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pud_t val = READ_ONCE(*pud); + pud_t val = pudp_get(pud); #if CONFIG_PGTABLE_LEVELS > 2 && \ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) @@ -100,7 +100,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pmd_t val = READ_ONCE(*pmd); + pmd_t val = pmdp_get(pmd); #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) @@ -121,7 +121,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pte_t val = ptep_get_lockless(pte); + pte_t val = ptep_get(pte); if (st->effective_prot_pte) st->effective_prot_pte(st, val); diff --git a/mm/readahead.c b/mm/readahead.c index 3a4b5d58eeb6..b415c9969176 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl, { struct folio *folio; - folio = filemap_alloc_folio(gfp_mask, order); + folio = filemap_alloc_folio(gfp_mask, order, NULL); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); diff --git a/mm/rmap.c b/mm/rmap.c index ac4f783d6ec2..f955f02d570e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -57,7 +57,7 @@ #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/ksm.h> @@ -489,17 +489,15 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. * - * NOTE: the caller should normally hold folio lock when calling this. If - * not, the caller needs to double check the anon_vma didn't change after - * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it - * concurrently without folio lock protection). See folio_lock_anon_vma_read() - * which has already covered that, and comment above remap_pages(). + * NOTE: the caller should hold folio lock when calling this. 
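The CONFIG_ASYNC_KERNEL_PGTABLE_FREE code added to pgtable-generic.c above queues page tables on a locked list and lets a worker splice the whole list out, run the expensive IOMMU KVA invalidation once per batch, and free the entries outside the lock. A minimal userspace sketch of that splice-and-process shape, with invented names and no kernel APIs:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	int payload;
};

static struct item *pending;
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

static void queue_item(struct item *it)
{
	pthread_mutex_lock(&pending_lock);
	it->next = pending;
	pending = it;
	pthread_mutex_unlock(&pending_lock);
	/* the kernel would schedule_work() here */
}

static void worker(void)
{
	struct item *batch, *next;

	pthread_mutex_lock(&pending_lock);
	batch = pending;			/* splice the list out ... */
	pending = NULL;
	pthread_mutex_unlock(&pending_lock);

	/* ... do the one expensive per-batch operation here ... */

	for (; batch; batch = next) {		/* then free everything outside the lock */
		next = batch->next;
		printf("freeing item %d\n", batch->payload);
		free(batch);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->payload = i;
		queue_item(it);
	}
	worker();
	return 0;
}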
*/ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) @@ -546,7 +544,8 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct anon_vma *root_anon_vma; unsigned long anon_mapping; -retry: + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) @@ -558,17 +557,6 @@ retry: root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* - * folio_move_anon_rmap() might have changed the anon_vma as we - * might not hold the folio lock here. - */ - if (unlikely((unsigned long)READ_ONCE(folio->mapping) != - anon_mapping)) { - up_read(&root_anon_vma->rwsem); - rcu_read_unlock(); - goto retry; - } - - /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). @@ -602,18 +590,6 @@ retry: rcu_read_unlock(); anon_vma_lock_read(anon_vma); - /* - * folio_move_anon_rmap() might have changed the anon_vma as we might - * not hold the folio lock here. - */ - if (unlikely((unsigned long)READ_ONCE(folio->mapping) != - anon_mapping)) { - anon_vma_unlock_read(anon_vma); - put_anon_vma(anon_vma); - anon_vma = NULL; - goto retry; - } - if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock @@ -988,7 +964,7 @@ int folio_referenced(struct folio *folio, int is_locked, if (!folio_raw_mapping(folio)) return 0; - if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { + if (!is_locked) { we_locked = folio_trylock(folio); if (!we_locked) return 1; @@ -1046,9 +1022,16 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; - pmd_t entry; + pmd_t entry = pmdp_get(pmd); - if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) + /* + * Please see the comment above (!pte_present). + * A non present PMD is not writable from a CPU + * perspective. + */ + if (!pmd_present(entry)) + continue; + if (!pmd_dirty(entry) && !pmd_write(entry)) continue; flush_cache_range(vma, address, @@ -1229,12 +1212,12 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, nr); + lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; - __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? @@ -1757,9 +1740,13 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. + * + * Device private folios do not support deferred splitting and + * shrinker based scanning of the folios to free. 
*/ if (partially_mapped && folio_test_anon(folio) && - !folio_test_partially_mapped(folio)) + !folio_test_partially_mapped(folio) && + !folio_is_device_private(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); @@ -1982,7 +1969,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + const softleaf_t entry = softleaf_from_pte(pteval); + + pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2339,6 +2328,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { /* PMD-mapped THP migration entry */ if (!pvmw.pte) { + __maybe_unused unsigned long pfn; + __maybe_unused pmd_t pmdval; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, true); @@ -2347,8 +2339,14 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - subpage = folio_page(folio, - pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); + pmdval = pmdp_get(pvmw.pmd); + if (likely(pmd_present(pmdval))) + pfn = pmd_pfn(pmdval); + else + pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); + + subpage = folio_page(folio, pfn - folio_pfn(folio)); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); @@ -2372,7 +2370,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + const softleaf_t entry = softleaf_from_pte(pteval); + + pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2457,8 +2457,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, folio_mark_dirty(folio); writable = pte_write(pteval); } else { + const softleaf_t entry = softleaf_from_pte(pteval); + pte_clear(mm, address, pvmw.pte); - writable = is_writable_device_private_entry(pte_to_swp_entry(pteval)); + + writable = softleaf_is_device_private_write(entry); } VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && @@ -2828,6 +2831,12 @@ static void rmap_walk_anon(struct folio *folio, pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; + /* + * The folio lock ensures that folio->mapping can't be changed under us + * to an anon_vma with different root. + */ + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? 
*/ diff --git a/mm/secretmem.c b/mm/secretmem.c index 60137305bc20..edf111e0a1bb 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -82,13 +82,13 @@ retry: __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { - folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(folio_page(folio, 0)); + folio_put(folio); if (err == -EEXIST) goto retry; @@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file) static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - const unsigned long len = desc->end - desc->start; + const unsigned long len = vma_desc_size(desc); if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; @@ -224,10 +224,7 @@ err_free_inode: SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { - struct file *file; - int fd, err; - - /* make sure local flags do not confict with global fcntl.h */ + /* make sure local flags do not conflict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) @@ -238,22 +235,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) if (atomic_read(&secretmem_users) < 0) return -ENFILE; - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - file = secretmem_file_create(flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto err_put_fd; - } - - fd_install(fd, file); - return fd; - -err_put_fd: - put_unused_fd(fd); - return err; + return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags)); } static int secretmem_init_fs_context(struct fs_context *fc) diff --git a/mm/shmem.c b/mm/shmem.c index b9081b817d28..d578d8e765d7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> @@ -131,8 +131,7 @@ struct shmem_options { #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 -#define SHMEM_SEEN_NOSWAP 16 -#define SHMEM_SEEN_QUOTA 32 +#define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -570,8 +569,37 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; + +#undef SHMEM_HUGE_DEFAULT + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif 
defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; + +#undef TMPFS_HUGE_DEFAULT static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, @@ -616,34 +644,23 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * - * For tmpfs mmap()'s huge order, we still use PMD-sized order to - * allocate huge pages due to lack of a write size hint. - * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - if (vma) - return maybe_pmd_order; - return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: - if (vma) - within_size_orders = maybe_pmd_order; - else - within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT; - - within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, - index, write_end); + within_size_orders = shmem_get_orders_within_size(inode, + THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) - return maybe_pmd_order; + return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: return 0; @@ -853,9 +870,9 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* @@ -1076,7 +1093,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 
*/ -static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, +static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; @@ -1133,7 +1150,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); @@ -1227,7 +1244,7 @@ whole_folios: shmem_recalc_inode(inode, 0, -nr_swaps_freed); } -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); @@ -1617,7 +1634,7 @@ try_split: folio_mark_uptodate(folio); } - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; @@ -1882,6 +1899,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; + pgoff_t aligned_index; long pages; int error, order; @@ -1895,10 +1913,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; - index = round_down(index, pages); - folio = shmem_alloc_folio(gfp, order, info, index); - if (folio) + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; goto allocated; + } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); @@ -2254,7 +2274,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? 
vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); - swp_entry_t swap, index_entry; + swp_entry_t swap; + softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; @@ -2266,7 +2287,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(index_entry)) + if (softleaf_is_poison_marker(index_entry)) return -EIO; si = get_swap_device(index_entry); @@ -2756,8 +2777,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, - flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2835,8 +2855,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, - inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -2924,16 +2943,17 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +static int shmem_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) - vma->vm_ops = &shmem_vm_ops; + desc->vm_ops = &shmem_vm_ops; else - vma->vm_ops = &shmem_anon_vm_ops; + desc->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -3858,12 +3878,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); - - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); return error; out_iput: @@ -3924,7 +3939,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - int ret = 0; + int ret; /* * No ordinary (disk based) filesystem counts links as inodes; @@ -3936,29 +3951,19 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) - goto out; + return ret; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); - goto out; + return ret; } dir->i_size += BOGO_DIRENT_SIZE; - inode_set_mtime_to_ts(dir, - inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inode_inc_iversion(dir); - inc_nlink(inode); - ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ - if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); -out: - return ret; + return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) @@ -3971,11 +3976,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; - inode_set_mtime_to_ts(dir, - inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inode_inc_iversion(dir); - drop_nlink(inode); - 
dput(dentry); /* Undo the count from "create" - does all the work */ + simple_unlink(dir, dentry); /* * For now, VFS can't deal with case-insensitive negative dentries, so @@ -4130,11 +4132,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); return 0; out_remove_offset: @@ -4677,7 +4675,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; - ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) @@ -4827,14 +4824,15 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Current inum too high to switch to 32-bit inums"; goto out; } - if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { + + /* + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. + */ + if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } - if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { - err = "Cannot enable swap on remount if it was disabled on first mount"; - goto out; - } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { @@ -5203,7 +5201,7 @@ static const struct address_space_operations shmem_aops = { }; static const struct file_operations shmem_file_operations = { - .mmap = shmem_mmap, + .mmap_prepare = shmem_mmap_prepare, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS @@ -5334,7 +5332,7 @@ static struct file_system_type shmem_fs_type = { #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; @@ -5772,11 +5770,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } @@ -5878,14 +5876,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -5893,7 +5886,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). 
*/ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -5906,6 +5910,25 @@ int shmem_zero_setup(struct vm_area_struct *vma) } /** + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + +/** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index diff --git a/mm/slab.h b/mm/slab.h index 078daecc7cf5..f730e012553c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -40,13 +40,29 @@ typedef u64 freelist_full_t; * Freelist pointer and counter to cmpxchg together, avoids the typical ABA * problems with cmpxchg of just a pointer. */ -typedef union { - struct { - void *freelist; - unsigned long counter; +struct freelist_counters { + union { + struct { + void *freelist; + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + /* + * If slab debugging is enabled then the + * frozen bit can be reused to indicate + * that the slab was corrupted + */ + unsigned frozen:1; + }; + }; + }; +#ifdef system_has_freelist_aba + freelist_full_t freelist_counters; +#endif }; - freelist_full_t full; -} freelist_aba_t; +}; /* Reuses the bits in struct page */ struct slab { @@ -69,27 +85,7 @@ struct slab { #endif }; /* Double-word boundary */ - union { - struct { - void *freelist; /* first free object */ - union { - unsigned long counters; - struct { - unsigned inuse:16; - unsigned objects:15; - /* - * If slab debugging is enabled then the - * frozen bit can be reused to indicate - * that the slab was corrupted - */ - unsigned frozen:1; - }; - }; - }; -#ifdef system_has_freelist_aba - freelist_aba_t freelist_counter; -#endif - }; + struct freelist_counters; }; struct rcu_head rcu_head; }; @@ -114,23 +110,10 @@ SLAB_MATCH(_unused_slab_obj_exts, obj_exts); #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); #if defined(system_has_freelist_aba) -static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters))); #endif /** - * folio_slab - Converts from folio to slab. - * @folio: The folio. - * - * Currently struct slab is a different representation of a folio where - * folio_test_slab() is true. - * - * Return: The slab which contains this folio. - */ -#define folio_slab(folio) (_Generic((folio), \ - const struct folio *: (const struct slab *)(folio), \ - struct folio *: (struct slab *)(folio))) - -/** * slab_folio - The folio allocated for a slab * @s: The slab. * @@ -146,20 +129,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) struct slab *: (struct folio *)s)) /** - * page_slab - Converts from first struct page to slab. 
- * @p: The first (either head of compound or single) page of slab. - * - * A temporary wrapper to convert struct page to struct slab in situations where - * we know the page is the compound head, or single order-0 page. - * - * Long-term ideally everything would work with struct slab directly or go - * through folio to struct slab. + * page_slab - Converts from struct page to its slab. + * @page: A page which may or may not belong to a slab. * - * Return: The slab which contains this page + * Return: The slab which contains this page or NULL if the page does + * not belong to a slab. This includes pages returned from large kmalloc. */ -#define page_slab(p) (_Generic((p), \ - const struct page *: (const struct slab *)(p), \ - struct page *: (struct slab *)(p))) +static inline struct slab *page_slab(const struct page *page) +{ + unsigned long head; + + head = READ_ONCE(page->compound_head); + if (head & 1) + page = (struct page *)(head - 1); + if (data_race(page->page_type >> 24) != PGTY_slab) + page = NULL; + + return (struct slab *)page; +} /** * slab_page - The first struct page allocated for a slab @@ -188,12 +175,7 @@ static inline pg_data_t *slab_pgdat(const struct slab *slab) static inline struct slab *virt_to_slab(const void *addr) { - struct folio *folio = virt_to_folio(addr); - - if (!folio_test_slab(folio)) - return NULL; - - return folio_slab(folio); + return page_slab(virt_to_page(addr)); } static inline int slab_order(const struct slab *slab) @@ -236,10 +218,8 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { -#ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; struct lock_class_key lock_key; -#endif struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; @@ -601,6 +581,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->size; } +static inline unsigned int large_kmalloc_order(const struct page *page) +{ + return page[1].flags.f & 0xff; +} + +static inline size_t large_kmalloc_size(const struct page *page) +{ + return PAGE_SIZE << large_kmalloc_order(page); +} + #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index 932d13ada36c..b613533b29e7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -259,7 +259,7 @@ out: * @object_size: The size of objects to be created in this cache. * @args: Additional arguments for the cache creation (see * &struct kmem_cache_args). - * @flags: See the desriptions of individual flags. The common ones are listed + * @flags: See the descriptions of individual flags. The common ones are listed * in the description below. 
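To make the new page_slab() above easier to follow, here is a standalone userspace sketch (not part of the patch) that mirrors its two steps: hop from a tail page to the compound head via the low bit of compound_head, then accept the page only when the type byte in the top bits of page_type says "slab". Every model_* name and the PGTY_SLAB_MODEL value are invented for the illustration.

/* illustrative model only -- builds with any C99 compiler */
#include <stdint.h>
#include <stdio.h>

#define PGTY_SLAB_MODEL 0xf5u	/* assumed stand-in for PGTY_slab */

struct model_page {
	uintptr_t compound_head;	/* head pointer | 1 on tail pages, else 0 */
	uint32_t page_type;		/* type id lives in the top byte */
};

struct model_slab { struct model_page page; };

static struct model_slab *model_page_slab(struct model_page *page)
{
	uintptr_t head = page->compound_head;

	if (head & 1)				/* tail page: hop to the head */
		page = (struct model_page *)(head - 1);
	if ((page->page_type >> 24) != PGTY_SLAB_MODEL)
		return NULL;			/* large kmalloc or other page type */
	return (struct model_slab *)page;
}

int main(void)
{
	struct model_page head = { 0, PGTY_SLAB_MODEL << 24 };
	struct model_page tail = { (uintptr_t)&head | 1, 0 };

	/* resolves the tail page back to the head and confirms it is a slab */
	printf("%p == %p\n", (void *)model_page_slab(&tail), (void *)&head);
	return 0;
}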
* * Not to be called directly, use the kmem_cache_create() wrapper with the same @@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void) */ size_t __ksize(const void *object) { - struct folio *folio; + const struct page *page; + const struct slab *slab; if (unlikely(object == ZERO_SIZE_PTR)) return 0; - folio = virt_to_folio(object); + page = virt_to_page(object); - if (unlikely(!folio_test_slab(folio))) { - if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) - return 0; - if (WARN_ON(object != folio_address(folio))) - return 0; - return folio_size(folio); - } + if (unlikely(PageLargeKmalloc(page))) + return large_kmalloc_size(page); + + slab = page_slab(page); + /* Delete this after we're sure there are no users */ + if (WARN_ON(!slab)) + return page_size(page); #ifdef CONFIG_SLUB_DEBUG - skip_orig_size_check(folio_slab(folio)->slab_cache, object); + skip_orig_size_check(slab->slab_cache, object); #endif - return slab_ksize(folio_slab(folio)->slab_cache); + return slab_ksize(slab->slab_cache); } gfp_t kmalloc_fix_flags(gfp_t flags) @@ -1614,17 +1615,15 @@ static void kfree_rcu_work(struct work_struct *work) static bool kfree_rcu_sheaf(void *obj) { struct kmem_cache *s; - struct folio *folio; struct slab *slab; if (is_vmalloc_addr(obj)) return false; - folio = virt_to_folio(obj); - if (unlikely(!folio_test_slab(folio))) + slab = virt_to_slab(obj); + if (unlikely(!slab)) return false; - slab = folio_slab(folio); s = slab->slab_cache; if (s->cpu_sheaves) { if (likely(!IS_ENABLED(CONFIG_NUMA) || diff --git a/mm/slub.c b/mm/slub.c index d4367f25b20d..e6a330e24145 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -410,19 +410,22 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -#ifndef CONFIG_SLUB_TINY -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { +struct freelist_tid { union { struct { - void **freelist; /* Pointer to next available object */ + void *freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ }; - freelist_aba_t freelist_tid; + freelist_full_t freelist_tid; }; +}; + +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. 
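struct freelist_tid above keeps the per-cpu freelist pointer and the transaction id in one double word so that a single cmpxchg updates both, which is what defeats ABA reuse of a pointer value on the fast path. A minimal userspace model of the same idea, with the pointer shrunk to a 32-bit index so an ordinary 64-bit compare-and-swap suffices; all names here are invented for the sketch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct freelist_tid_model {
	_Atomic uint64_t word;		/* low 32 bits: object index, high 32: tid */
};

static uint64_t pack(uint32_t obj, uint32_t tid)
{
	return (uint64_t)tid << 32 | obj;
}

static bool try_pop(struct freelist_tid_model *f, uint32_t next_obj)
{
	uint64_t old = atomic_load(&f->word);
	uint32_t tid = (uint32_t)(old >> 32);
	uint64_t new = pack(next_obj, tid + 1);	/* tid moves on every update */

	/* fails if either the index or the tid changed since the snapshot */
	return atomic_compare_exchange_strong(&f->word, &old, new);
}

int main(void)
{
	struct freelist_tid_model f = { pack(1, 0) };

	printf("pop %s\n", try_pop(&f, 2) ? "ok" : "retry");
	return 0;
}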
+ */ +struct kmem_cache_cpu { + struct freelist_tid; struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ @@ -432,7 +435,6 @@ struct kmem_cache_cpu { unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif }; -#endif /* CONFIG_SLUB_TINY */ static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -469,7 +471,10 @@ struct slab_sheaf { struct rcu_head rcu_head; struct list_head barn_list; /* only used for prefilled sheafs */ - unsigned int capacity; + struct { + unsigned int capacity; + bool pfmemalloc; + }; }; struct kmem_cache *cache; unsigned int size; @@ -594,12 +599,10 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } -#endif /* * When running under KMSAN, get_freepointer_safe() may return an uninitialized @@ -711,10 +714,12 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) return s->cpu_partial_slabs; } #else +#ifdef SLAB_SUPPORTS_SYSFS static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } +#endif static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) { @@ -755,32 +760,29 @@ static __always_inline void slab_unlock(struct slab *slab) } static inline bool -__update_freelist_fast(struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new) +__update_freelist_fast(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { #ifdef system_has_freelist_aba - freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; - freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; - - return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); + return try_cmpxchg_freelist(&slab->freelist_counters, + &old->freelist_counters, + new->freelist_counters); #else return false; #endif } static inline bool -__update_freelist_slow(struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new) +__update_freelist_slow(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { bool ret = false; slab_lock(slab); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; + if (slab->freelist == old->freelist && + slab->counters == old->counters) { + slab->freelist = new->freelist; + slab->counters = new->counters; ret = true; } slab_unlock(slab); @@ -796,22 +798,18 @@ __update_freelist_slow(struct slab *slab, * interrupt the operation. 
*/ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) + struct freelist_counters *old, struct freelist_counters *new, const char *n) { bool ret; if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); - if (s->flags & __CMPXCHG_DOUBLE) { - ret = __update_freelist_fast(slab, freelist_old, counters_old, - freelist_new, counters_new); - } else { - ret = __update_freelist_slow(slab, freelist_old, counters_old, - freelist_new, counters_new); - } + if (s->flags & __CMPXCHG_DOUBLE) + ret = __update_freelist_fast(slab, old, new); + else + ret = __update_freelist_slow(slab, old, new); + if (likely(ret)) return true; @@ -826,21 +824,17 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla } static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) + struct freelist_counters *old, struct freelist_counters *new, const char *n) { bool ret; if (s->flags & __CMPXCHG_DOUBLE) { - ret = __update_freelist_fast(slab, freelist_old, counters_old, - freelist_new, counters_new); + ret = __update_freelist_fast(slab, old, new); } else { unsigned long flags; local_irq_save(flags); - ret = __update_freelist_slow(slab, freelist_old, counters_old, - freelist_new, counters_new); + ret = __update_freelist_slow(slab, old, new); local_irq_restore(flags); } if (likely(ret)) @@ -978,7 +972,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; static slab_flags_t slub_debug; #endif -static char *slub_debug_string; +static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; /* @@ -1785,8 +1779,8 @@ static inline int free_consistency_checks(struct kmem_cache *s, * * returns the start of next block if there's any, or NULL */ -static char * -parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) +static const char * +parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) { bool higher_order_disable = false; @@ -1863,17 +1857,17 @@ check_slabs: return NULL; } -static int __init setup_slub_debug(char *str) +static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) { slab_flags_t flags; slab_flags_t global_flags; - char *saved_str; - char *slab_list; + const char *saved_str; + const char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; global_flags = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) + if (!str || !*str) /* * No options specified. Switch on full debugging. 
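After the refactor above, callers hand __slab_update_freelist() and slab_update_freelist() whole "old" and "new" snapshots instead of four loose scalars. The standalone sketch below models the slow-path contract (compare the snapshot under a lock, commit only if nothing changed) and the snapshot-derive-retry loop the callers use; it is a simplified illustration, not the kernel code, and it takes the lock even for the snapshot where the kernel does a plain read.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct fc_model {			/* ~ struct freelist_counters */
	void *freelist;
	unsigned long counters;
};

struct slab_model {
	pthread_mutex_t lock;
	struct fc_model fc;
};

static bool update_freelist(struct slab_model *s, const struct fc_model *old,
			    const struct fc_model *new)
{
	bool ok = false;

	pthread_mutex_lock(&s->lock);
	if (s->fc.freelist == old->freelist && s->fc.counters == old->counters) {
		s->fc = *new;		/* nobody raced us: commit */
		ok = true;
	}
	pthread_mutex_unlock(&s->lock);
	return ok;
}

/* caller pattern: snapshot, derive the new state, retry on a lost race */
static void take_freelist(struct slab_model *s)
{
	struct fc_model old, new;

	do {
		pthread_mutex_lock(&s->lock);
		old = s->fc;
		pthread_mutex_unlock(&s->lock);
		new.freelist = NULL;
		new.counters = old.counters + 1;
	} while (!update_freelist(s, &old, &new));
}

int main(void)
{
	struct slab_model s = { PTHREAD_MUTEX_INITIALIZER, { &s, 0 } };

	take_freelist(&s);
	return s.fc.freelist != NULL;
}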
*/ @@ -1917,11 +1911,15 @@ out: static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); - return 1; + return 0; } -__setup("slab_debug", setup_slub_debug); -__setup_param("slub_debug", slub_debug, setup_slub_debug, 0); +static const struct kernel_param_ops param_ops_slab_debug __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slub_debug, +}; +__core_param_cb(slab_debug, ¶m_ops_slab_debug, NULL, 0); +__core_param_cb(slub_debug, ¶m_ops_slab_debug, NULL, 0); /* * kmem_cache_flags - apply debugging options to the cache @@ -1935,9 +1933,9 @@ __setup_param("slub_debug", slub_debug, setup_slub_debug, 0); */ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { - char *iter; + const char *iter; size_t len; - char *next_block; + const char *next_block; slab_flags_t block_flags; slab_flags_t slub_debug_local = slub_debug; @@ -1961,7 +1959,7 @@ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) continue; /* Found a block that has a slab list, search it */ while (*iter) { - char *end, *glob; + const char *end, *glob; size_t cmplen; end = strchrnul(iter, ','); @@ -2023,15 +2021,21 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } -#endif #endif /* CONFIG_SLUB_DEBUG */ +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + #ifdef CONFIG_SLAB_OBJ_EXT #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG @@ -2046,7 +2050,11 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) if (slab_exts) { unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, obj_exts_slab, obj_exts); - /* codetag should be NULL */ + + if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + return; + + /* codetag should be NULL here */ WARN_ON(slab_exts[offs].ref.ct); set_codetag_empty(&slab_exts[offs].ref); } @@ -2082,14 +2090,6 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. 
- */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ - __GFP_ACCOUNT | __GFP_NOFAIL) - static inline void init_slab_obj_exts(struct slab *slab) { slab->obj_exts = 0; @@ -2369,33 +2369,34 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) { struct slabobj_ext *slab_exts; struct kmem_cache *s; - struct folio *folio; + struct page *page; struct slab *slab; unsigned long off; - folio = virt_to_folio(p); - if (!folio_test_slab(folio)) { + page = virt_to_page(p); + if (PageLargeKmalloc(page)) { + unsigned int order; int size; - if (folio_memcg_kmem(folio)) + if (PageMemcgKmem(page)) return true; - if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, - folio_order(folio))) + order = large_kmalloc_order(page); + if (__memcg_kmem_charge_page(page, flags, order)) return false; /* - * This folio has already been accounted in the global stats but + * This page has already been accounted in the global stats but * not in the memcg stats. So, subtract from the global and use * the interface which adds to both global and memcg stats. */ - size = folio_size(folio); - node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); + size = PAGE_SIZE << order; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); return true; } - slab = folio_slab(folio); + slab = page_slab(page); s = slab->slab_cache; /* @@ -2529,7 +2530,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, memset((char *)kasan_reset_tag(x) + inuse, 0, s->size - inuse - rsize); /* - * Restore orig_size, otherwize kmalloc redzone overwritten + * Restore orig_size, otherwise kmalloc redzone overwritten * would be reported */ set_orig_size(s, x, orig_size); @@ -2597,8 +2598,24 @@ static void *setup_object(struct kmem_cache *s, void *object) static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) { - struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, - s->sheaf_capacity), gfp); + struct slab_sheaf *sheaf; + size_t sheaf_size; + + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + + /* + * Prevent recursion to the same cache, or a deep stack of kmallocs of + * varying sizes (sheaf capacity might differ for each kmalloc size + * bucket) + */ + if (s->flags & SLAB_KMALLOC) + gfp |= __GFP_NO_OBJ_EXT; + + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) return NULL; @@ -2651,7 +2668,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) if (!sheaf) return NULL; - if (refill_sheaf(s, sheaf, gfp)) { + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { free_empty_sheaf(s, sheaf); return NULL; } @@ -2729,12 +2746,13 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) sheaf->size = 0; } -static void __rcu_free_sheaf_prepare(struct kmem_cache *s, +static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, struct slab_sheaf *sheaf) { bool init = slab_want_init_on_free(s); void **p = &sheaf->objects[0]; unsigned int i = 0; + bool pfmemalloc = false; while (i < sheaf->size) { struct slab *slab = virt_to_slab(p[i]); @@ -2747,8 +2765,13 @@ static void __rcu_free_sheaf_prepare(struct kmem_cache *s, continue; } + if (slab_test_pfmemalloc(slab)) + pfmemalloc = true; + i++; } + + return pfmemalloc; } static void rcu_free_sheaf_nobarn(struct rcu_head *head) @@ -3011,14 +3034,11 @@ static void barn_init(struct 
node_barn *barn) static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) { - struct list_head empty_list; - struct list_head full_list; + LIST_HEAD(empty_list); + LIST_HEAD(full_list); struct slab_sheaf *sheaf, *sheaf2; unsigned long flags; - INIT_LIST_HEAD(&empty_list); - INIT_LIST_HEAD(&full_list); - spin_lock_irqsave(&barn->lock, flags); list_splice_init(&barn->sheaves_full, &full_list); @@ -3044,24 +3064,24 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct kmem_cache_order_objects oo, bool allow_spin) { - struct folio *folio; + struct page *page; struct slab *slab; unsigned int order = oo_order(oo); if (unlikely(!allow_spin)) - folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, node, order); else if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_frozen_pages(flags, order); + page = alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); + page = __alloc_frozen_pages(flags, order, node, NULL); - if (!folio) + if (!page) return NULL; - slab = folio_slab(folio); - __folio_set_slab(folio); - if (folio_is_pfmemalloc(folio)) + __SetPageSlab(page); + slab = page_slab(page); + if (page_is_pfmemalloc(page)) slab_set_pfmemalloc(slab); return slab; @@ -3285,16 +3305,16 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) static void __free_slab(struct kmem_cache *s, struct slab *slab) { - struct folio *folio = slab_folio(slab); - int order = folio_order(folio); + struct page *page = slab_page(slab); + int order = compound_order(page); int pages = 1 << order; __slab_clear_pfmemalloc(slab); - folio->mapping = NULL; - __folio_clear_slab(folio); + page->mapping = NULL; + __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(&folio->page, order); + free_frozen_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -3614,8 +3634,6 @@ static struct slab *get_partial(struct kmem_cache *s, int node, return get_any_partial(s, pc); } -#ifndef CONFIG_SLUB_TINY - #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@ -3719,8 +3737,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; - struct slab new; - struct slab old; + struct freelist_counters old, new; if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); @@ -3769,10 +3786,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } else { new.freelist = old.freelist; } - } while (!slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); /* * Stage three: Manipulate the slab list based on the updated state. 
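The barn_shrink() hunk above replaces two INIT_LIST_HEAD() calls with LIST_HEAD() definitions, which initialize the on-stack list heads at the point of declaration. A tiny userspace model (list_head_model and its helpers are invented for the example) showing that both forms yield the same self-referencing head:

#include <assert.h>

struct list_head_model { struct list_head_model *next, *prev; };

/* ~ LIST_HEAD(): initialized as part of the definition */
#define LIST_HEAD_MODEL(name) \
	struct list_head_model name = { &(name), &(name) }

/* ~ INIT_LIST_HEAD(): initialized by hand afterwards */
static void init_list_head_model(struct list_head_model *h)
{
	h->next = h;
	h->prev = h;
}

int main(void)
{
	LIST_HEAD_MODEL(a);
	struct list_head_model b;

	init_list_head_model(&b);
	assert(a.next == &a && a.prev == &a);
	assert(b.next == &b && b.prev == &b);
	return 0;
}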
@@ -4015,12 +4029,6 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) return c->slab || slub_percpu_partial(c); } -#else /* CONFIG_SLUB_TINY */ -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } -static inline void flush_this_cpu_slab(struct kmem_cache *s) { } -#endif /* CONFIG_SLUB_TINY */ - static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; @@ -4361,17 +4369,16 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -#ifndef CONFIG_SLUB_TINY static inline bool __update_cpu_freelist_fast(struct kmem_cache *s, void *freelist_old, void *freelist_new, unsigned long tid) { - freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; - freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, - &old.full, new.full); + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, + &old.freelist_tid, new.freelist_tid); } /* @@ -4384,27 +4391,24 @@ __update_cpu_freelist_fast(struct kmem_cache *s, */ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) { - struct slab new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; + new.freelist = NULL; + new.counters = old.counters; - new.inuse = slab->objects; - new.frozen = freelist != NULL; + new.inuse = old.objects; + new.frozen = old.freelist != NULL; - } while (!__slab_update_freelist(s, slab, - freelist, counters, - NULL, new.counters, - "get_freelist")); - return freelist; + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); + + return old.freelist; } /* @@ -4412,26 +4416,22 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) */ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) { - struct slab new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; + new.freelist = NULL; + new.counters = old.counters; VM_BUG_ON(new.frozen); - new.inuse = slab->objects; + new.inuse = old.objects; new.frozen = 1; - } while (!slab_update_freelist(s, slab, - freelist, counters, - NULL, new.counters, - "freeze_slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - return freelist; + return old.freelist; } /* @@ -4625,7 +4625,7 @@ new_objects: pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { freelist = pc.object; /* * For debug caches here we had to go through @@ -4663,11 +4663,15 @@ new_objects: stat(s, ALLOC_SLAB); - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - if (unlikely(!freelist)) + if (unlikely(!freelist)) { + /* This could cause an endless loop. Fail instead. 
*/ + if (!allow_spin) + return NULL; goto new_objects; + } if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr, @@ -4871,32 +4875,6 @@ redo: return object; } -#else /* CONFIG_SLUB_TINY */ -static void *__slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) -{ - struct partial_context pc; - struct slab *slab; - void *object; - - pc.flags = gfpflags; - pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - - if (slab) - return pc.object; - - slab = new_slab(s, gfpflags, node); - if (unlikely(!slab)) { - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - - object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - - return object; -} -#endif /* CONFIG_SLUB_TINY */ /* * If the object has been wiped upon free, make sure it's fully initialized by @@ -5037,7 +5015,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; if (empty) { - if (!refill_sheaf(s, empty, gfp)) { + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { full = empty; } else { /* @@ -5148,7 +5126,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * be false because of cpu migration during an unlocked part of * the current allocation or previous freeing process. */ - if (folio_nid(virt_to_folio(object)) != node) { + if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); return NULL; } @@ -5337,6 +5315,26 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod } EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); +static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, + struct slab_sheaf *sheaf, gfp_t gfp) +{ + int ret = 0; + + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); + + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) + return ret; + + /* + * if we are allowed to, refill sheaf with pfmemalloc but then remember + * it for when it's returned + */ + ret = refill_sheaf(s, sheaf, gfp); + sheaf->pfmemalloc = true; + + return ret; +} + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -5371,6 +5369,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) sheaf->cache = s; sheaf->capacity = size; + /* + * we do not need to care about pfmemalloc here because oversize + * sheaves area always flushed and freed when returned + */ if (!__kmem_cache_alloc_bulk(s, gfp, size, &sheaf->objects[0])) { kfree(sheaf); @@ -5407,17 +5409,18 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) if (!sheaf) sheaf = alloc_empty_sheaf(s, gfp); - if (sheaf && sheaf->size < size) { - if (refill_sheaf(s, sheaf, gfp)) { + if (sheaf) { + sheaf->capacity = s->sheaf_capacity; + sheaf->pfmemalloc = false; + + if (sheaf->size < size && + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { sheaf_flush_unused(s, sheaf); free_empty_sheaf(s, sheaf); sheaf = NULL; } } - if (sheaf) - sheaf->capacity = s->sheaf_capacity; - return sheaf; } @@ -5437,7 +5440,8 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, struct slub_percpu_sheaves *pcs; struct node_barn *barn; - if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + if (unlikely((sheaf->capacity != s->sheaf_capacity) + || sheaf->pfmemalloc)) { sheaf_flush_unused(s, sheaf); kfree(sheaf); return; @@ -5503,7 +5507,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, if (likely(sheaf->capacity >= size)) { if (likely(sheaf->capacity == s->sheaf_capacity)) - 
return refill_sheaf(s, sheaf, gfp); + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, &sheaf->objects[sheaf->size])) { @@ -5536,6 +5540,9 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, * * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT * memcg charging is forced over limit if necessary, to avoid failure. + * + * It is possible that the allocation comes from kfence and then the sheaf + * size is not decreased. */ void * kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, @@ -5547,7 +5554,10 @@ kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, if (sheaf->size == 0) goto out; - ret = sheaf->objects[--sheaf->size]; + ret = kfence_alloc(s, s->object_size, gfp); + + if (likely(!ret)) + ret = sheaf->objects[--sheaf->size]; init = slab_want_init_on_alloc(gfp, s); @@ -5570,7 +5580,7 @@ unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) */ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct folio *folio; + struct page *page; void *ptr = NULL; unsigned int order = get_order(size); @@ -5580,15 +5590,15 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) flags |= __GFP_COMP; if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); + page = alloc_frozen_pages_noprof(flags, order); else - folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); - if (folio) { - ptr = folio_address(folio); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); - __folio_set_large_kmalloc(folio); + __SetPageLargeKmalloc(page); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -5715,9 +5725,7 @@ retry: * it did local_lock_irqsave(&s->cpu_slab->lock, flags). * In this case fast path with __update_cpu_freelist_fast() is not safe. */ -#ifndef CONFIG_SLUB_TINY if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) -#endif ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); if (PTR_ERR(ret) == -EBUSY) { @@ -5855,10 +5863,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - void *prior; - int was_frozen; - struct slab new; - unsigned long counters; + bool was_frozen, was_full; + struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; bool on_node_partial; @@ -5870,20 +5876,43 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } + /* + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) + * is the only other reason it can be false, and it is already handled + * above. 
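__prefill_sheaf_pfmemalloc() above refills with __GFP_NOMEMALLOC first and only falls back to reserve-backed (pfmemalloc) pages when the caller is entitled to them, marking the sheaf so that kmem_cache_return_sheaf() flushes it instead of caching it. A simplified userspace model of that policy; the toy refill() merely stands in for refill_sheaf() and every name is invented.

#include <stdbool.h>
#include <stdio.h>

struct sheaf_model {
	int size, capacity;
	bool pfmemalloc;		/* holds reserve-backed objects */
};

/* toy refill: succeeds only when allowed to dip into the reserves */
static bool refill(struct sheaf_model *s, bool use_reserves, bool reserves_only)
{
	if (reserves_only && !use_reserves)
		return false;
	s->size = s->capacity;
	return true;
}

static bool prefill(struct sheaf_model *s, bool pfmemalloc_allowed,
		    bool reserves_only)
{
	if (refill(s, false, reserves_only))	/* ~ gfp | __GFP_NOMEMALLOC */
		return true;
	if (!pfmemalloc_allowed)
		return false;
	if (!refill(s, true, reserves_only))	/* last resort: the reserves */
		return false;
	s->pfmemalloc = true;			/* flush, don't cache, on return */
	return true;
}

int main(void)
{
	struct sheaf_model s = { .capacity = 8 };
	bool ok = prefill(&s, true, true);

	printf("filled=%d tainted=%d\n", ok, s.pfmemalloc);
	return 0;
}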
+ */ + do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - prior = slab->freelist; - counters = slab->counters; - set_freepointer(s, tail, prior); - new.counters = counters; - was_frozen = new.frozen; + + old.freelist = slab->freelist; + old.counters = slab->counters; + + was_full = (old.freelist == NULL); + was_frozen = old.frozen; + + set_freepointer(s, tail, old.freelist); + + new.freelist = head; + new.counters = old.counters; new.inuse -= cnt; - if ((!new.inuse || !prior) && !was_frozen) { - /* Needs to be taken off a list */ - if (!kmem_cache_has_cpu_partial(s) || prior) { + + /* + * Might need to be taken off (due to becoming empty) or added + * to (due to not being full anymore) the partial list. + * Unless it's frozen. + */ + if ((!new.inuse || was_full) && !was_frozen) { + /* + * If slab becomes non-full and we have cpu partial + * lists, we put it there unconditionally to avoid + * taking the list_lock. Otherwise we need it. + */ + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { n = get_node(s, slab_nid(slab)); /* @@ -5900,10 +5929,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, } } - } while (!slab_update_freelist(s, slab, - prior, counters, - head, new.counters, - "__slab_free")); + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { @@ -5913,7 +5939,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (kmem_cache_has_cpu_partial(s) && !prior) { + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { /* * If we started with a full slab then put it onto the * per cpu partial list. @@ -5922,6 +5948,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, stat(s, CPU_PARTIAL_FREE); } + /* + * In other cases we didn't take the list_lock because the slab + * was already on the partial list and will remain there. + */ + return; } @@ -5929,19 +5960,24 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * This slab was partially empty but not on the per-node partial list, * in which case we shouldn't manipulate its list, just return. */ - if (prior && !on_node_partial) { + if (!was_full && !on_node_partial) { spin_unlock_irqrestore(&n->list_lock, flags); return; } + /* + * If slab became empty, should we add/keep it on the partial list or we + * have enough? + */ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before - * then add it. + * then add it. This can only happen when cache has no per cpu partial + * list otherwise we would have put it there. */ - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -5949,10 +5985,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; slab_empty: - if (prior) { - /* - * Slab on the partial list. - */ + /* + * The slab could have a single object and thus go from full to empty in + * a single free, but more likely it was on the partial list. Remove it. + */ + if (likely(!was_full)) { remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); } @@ -6177,8 +6214,12 @@ static void rcu_free_sheaf(struct rcu_head *head) * handles it fine. The only downside is that sheaf will serve fewer * allocations when reused. 
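The rewritten __slab_free() above makes the slab's destination after a free explicit: frozen slabs are left to their owning CPU, a slab that just stopped being full goes to the per-cpu partial list when that is configured, and a slab that became empty is discarded once the node already has enough partial slabs. A compact model of that decision, deliberately ignoring the on_node_partial and locking corner cases:

#include <stdbool.h>
#include <stdio.h>

enum slab_dest { STAY_PUT, CPU_PARTIAL, NODE_PARTIAL, DISCARD };

static enum slab_dest after_free(bool was_frozen, bool was_full, bool now_empty,
				 bool have_cpu_partial, bool node_has_spare)
{
	if (was_frozen)
		return STAY_PUT;	/* the owning CPU will deal with it */
	if (was_full && have_cpu_partial)
		return CPU_PARTIAL;	/* lockless per-cpu partial list */
	if (now_empty && node_has_spare)
		return DISCARD;		/* enough partial slabs kept already */
	if (was_full)
		return NODE_PARTIAL;	/* no cpu partial list: needs list_lock */
	return STAY_PUT;		/* already on the node partial list */
}

int main(void)
{
	printf("%d\n", after_free(false, true, false, true, false));
	return 0;
}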
It only happens due to debugging, which is a * performance hit anyway. + * + * If it returns true, there was at least one object from pfmemalloc + * slab so simply flush everything. */ - __rcu_free_sheaf_prepare(s, sheaf); + if (__rcu_free_sheaf_prepare(s, sheaf)) + goto flush; n = get_node(s, sheaf->node); if (!n) @@ -6328,12 +6369,11 @@ next_remote_batch: if (unlikely(!slab_free_hook(s, p[i], init, false))) { p[i] = p[--size]; - if (!size) - goto flush_remote; continue; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) + || slab_test_pfmemalloc(slab))) { remote_objects[remote_nr] = p[i]; p[i] = p[--size]; if (++remote_nr >= PCS_BATCH_MAX) @@ -6344,6 +6384,9 @@ next_remote_batch: i++; } + if (!size) + goto flush_remote; + next_batch: if (!local_trylock(&s->cpu_sheaves->lock)) goto fallback; @@ -6398,6 +6441,9 @@ do_free: goto next_batch; } + if (remote_nr) + goto flush_remote; + return; no_empty: @@ -6475,14 +6521,10 @@ static void free_deferred_objects(struct irq_work *work) llist_for_each_safe(pos, t, llnode) { struct slab *slab = container_of(pos, struct slab, llnode); -#ifdef CONFIG_SLUB_TINY - free_slab(slab->slab_cache, slab); -#else if (slab->frozen) deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); else free_slab(slab->slab_cache, slab); -#endif } } @@ -6518,7 +6560,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@ -6611,14 +6652,6 @@ redo: } stat_add(s, FREE_FASTPATH, cnt); } -#else /* CONFIG_SLUB_TINY */ -static void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - __slab_free(s, slab, head, tail, cnt, addr); -} -#endif /* CONFIG_SLUB_TINY */ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, @@ -6631,7 +6664,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, return; if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) { + slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { if (likely(free_to_pcs(s, object))) return; } @@ -6741,12 +6775,12 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -static void free_large_kmalloc(struct folio *folio, void *object) +static void free_large_kmalloc(struct page *page, void *object) { - unsigned int order = folio_order(folio); + unsigned int order = compound_order(page); - if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { - dump_page(&folio->page, "Not a kmalloc allocation"); + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { + dump_page(page, "Not a kmalloc allocation"); return; } @@ -6757,10 +6791,10 @@ static void free_large_kmalloc(struct folio *folio, void *object) kasan_kfree_large(object); kmsan_kfree_large(object); - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); - __folio_clear_large_kmalloc(folio); - free_frozen_pages(&folio->page, order); + __ClearPageLargeKmalloc(page); + free_frozen_pages(page, order); } /* @@ -6770,7 +6804,7 @@ static void free_large_kmalloc(struct folio *folio, void *object) void kvfree_rcu_cb(struct rcu_head *head) { void *obj = head; - struct folio *folio; + struct page 
*page; struct slab *slab; struct kmem_cache *s; void *slab_addr; @@ -6781,20 +6815,20 @@ void kvfree_rcu_cb(struct rcu_head *head) return; } - folio = virt_to_folio(obj); - if (!folio_test_slab(folio)) { + page = virt_to_page(obj); + slab = page_slab(page); + if (!slab) { /* * rcu_head offset can be only less than page size so no need to - * consider folio order + * consider allocation order */ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); - free_large_kmalloc(folio, obj); + free_large_kmalloc(page, obj); return; } - slab = folio_slab(folio); s = slab->slab_cache; - slab_addr = folio_address(folio); + slab_addr = slab_address(slab); if (is_kfence_address(obj)) { obj = kfence_object_start(obj); @@ -6816,7 +6850,7 @@ void kvfree_rcu_cb(struct rcu_head *head) */ void kfree(const void *object) { - struct folio *folio; + struct page *page; struct slab *slab; struct kmem_cache *s; void *x = (void *)object; @@ -6826,13 +6860,13 @@ void kfree(const void *object) if (unlikely(ZERO_OR_NULL_PTR(object))) return; - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, (void *)object); + page = virt_to_page(object); + slab = page_slab(page); + if (!slab) { + free_large_kmalloc(page, (void *)object); return; } - slab = folio_slab(folio); s = slab->slab_cache; slab_free(s, slab, x, _RET_IP_); } @@ -6849,7 +6883,6 @@ EXPORT_SYMBOL(kfree); */ void kfree_nolock(const void *object) { - struct folio *folio; struct slab *slab; struct kmem_cache *s; void *x = (void *)object; @@ -6857,13 +6890,12 @@ void kfree_nolock(const void *object) if (unlikely(ZERO_OR_NULL_PTR(object))) return; - folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) { + slab = virt_to_slab(object); + if (unlikely(!slab)) { WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); return; } - slab = folio_slab(folio); s = slab->slab_cache; memcg_slab_free_hook(s, slab, &x, 1); @@ -6895,11 +6927,7 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); -#ifndef CONFIG_SLUB_TINY do_slab_free(s, slab, x, x, 0, _RET_IP_); -#else - defer_free(s, x); -#endif } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -6931,16 +6959,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { - struct folio *folio; + struct page *page = virt_to_page(p); + struct slab *slab = page_slab(page); - folio = virt_to_folio(p); - if (unlikely(!folio_test_slab(folio))) { + if (!slab) { /* Big kmalloc object */ - WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); - WARN_ON(p != folio_address(folio)); - ks = folio_size(folio); + ks = page_size(page); + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != page_address(page)); } else { - s = folio_slab(folio)->slab_cache; + s = slab->slab_cache; orig_size = get_orig_size(s, (void *)p); ks = s->object_size; } @@ -7082,7 +7110,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. 
* @@ -7091,6 +7119,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, gfp_t flags, int node) { + bool allow_block; void *ret; /* @@ -7103,10 +7132,6 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, if (ret || size <= PAGE_SIZE) return ret; - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); @@ -7114,13 +7139,23 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, } /* + * For non-blocking the VM_ALLOW_HUGE_VMAP is not used + * because the huge-mapping path in vmalloc contains at + * least one might_sleep() call. + * + * TODO: Revise huge-mapping path to support non-blocking + * flags. + */ + allow_block = gfpflags_allow_blocking(flags); + + /* * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, * since the callers already cannot assume anything * about the resulting pointer, and cannot play * protection games. */ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0, node, __builtin_return_address(0)); } EXPORT_SYMBOL(__kvmalloc_node_noprof); @@ -7244,23 +7279,25 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, { int lookahead = 3; void *object; - struct folio *folio; + struct page *page; + struct slab *slab; size_t same; object = p[--size]; - folio = virt_to_folio(object); + page = virt_to_page(object); + slab = page_slab(page); if (!s) { /* Handle kalloc'ed objects */ - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); + if (!slab) { + free_large_kmalloc(page, object); df->slab = NULL; return size; } /* Derive kmem_cache from object */ - df->slab = folio_slab(folio); - df->s = df->slab->slab_cache; + df->slab = slab; + df->s = slab->slab_cache; } else { - df->slab = folio_slab(folio); + df->slab = slab; df->s = cache_from_obj(s, object); /* Support for memcg */ } @@ -7349,7 +7386,6 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -#ifndef CONFIG_SLUB_TINY static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) @@ -7367,14 +7403,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_lock_irqsave(&s->cpu_slab->lock, irqflags); for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); - - if (unlikely(object)) { - p[i] = object; - continue; - } + void *object = c->freelist; - object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -7420,41 +7450,13 @@ error: return 0; } -#else /* CONFIG_SLUB_TINY */ -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - int i; - - for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); - - if (unlikely(object)) { - p[i] = object; - continue; - } - - p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, - _RET_IP_, s->object_size); - if (unlikely(!p[i])) - goto error; - - maybe_wipe_obj_freeptr(s, p[i]); - } - - return i; - -error: - __kmem_cache_free_bulk(s, i, p); - return 0; -} -#endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. 
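The kvmalloc hunk above lets non-blocking allocations reach the vmalloc fallback too, but without VM_ALLOW_HUGE_VMAP because the huge-mapping path can sleep. A standalone sketch of that control flow, with the gfp details boiled down to a single can_block flag and the kmalloc attempt reduced to a boolean; it illustrates the decision, not the kernel routine.

#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum backing { KMALLOC, VMALLOC_HUGE, VMALLOC_SMALL, FAIL };

static enum backing kvmalloc_model(size_t size, bool can_block, bool kmalloc_ok)
{
	if (kmalloc_ok)
		return KMALLOC;			/* fast path succeeded */
	if (size > (size_t)INT_MAX)
		return FAIL;			/* refuse absurd sizes */
	return can_block ? VMALLOC_HUGE		/* may use VM_ALLOW_HUGE_VMAP */
			 : VMALLOC_SMALL;	/* atomic context: no huge maps */
}

int main(void)
{
	printf("%d\n", kvmalloc_model((size_t)1 << 20, false, false));
	return 0;
}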
*/ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { unsigned int i = 0; + void *kfence_obj; if (!size) return 0; @@ -7463,6 +7465,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; + /* + * to make things simpler, only assume at most once kfence allocated + * object per bulk allocation and choose its index randomly + */ + kfence_obj = kfence_alloc(s, s->object_size, flags); + + if (unlikely(kfence_obj)) { + if (unlikely(size == 1)) { + p[0] = kfence_obj; + goto out; + } + size--; + } + if (s->cpu_sheaves) i = alloc_from_pcs_bulk(s, size, p); @@ -7474,10 +7490,23 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { if (i > 0) __kmem_cache_free_bulk(s, i, p); + if (kfence_obj) + __kfence_free(kfence_obj); return 0; } } + if (unlikely(kfence_obj)) { + int idx = get_random_u32_below(size + 1); + + if (idx != size) + p[size] = p[idx]; + p[idx] = kfence_obj; + + size++; + } + +out: /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. @@ -7639,7 +7668,6 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < @@ -7660,12 +7688,6 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } -#else -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) -{ - return 1; -} -#endif /* CONFIG_SLUB_TINY */ static int init_percpu_sheaves(struct kmem_cache *s) { @@ -7755,13 +7777,11 @@ void __kmem_cache_release(struct kmem_cache *s) cache_random_seq_destroy(s); if (s->cpu_sheaves) pcs_destroy(s); -#ifndef CONFIG_SLUB_TINY #ifdef CONFIG_PREEMPT_RT if (s->cpu_slab) lockdep_unregister_key(&s->lock_key); #endif free_percpu(s->cpu_slab); -#endif free_kmem_cache_nodes(s); } @@ -7886,11 +7906,11 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) * permitted to overwrite the first word of the object on * kmem_cache_free. * - * This is the case if we do RCU, have a constructor or - * destructor, are poisoning the objects, or are - * redzoning an object smaller than sizeof(void *) or are - * redzoning an object with slub_debug_orig_size() enabled, - * in which case the right redzone may be extended. + * This is the case if we do RCU, have a constructor, are + * poisoning the objects, or are redzoning an object smaller + * than sizeof(void *) or are redzoning an object with + * slub_debug_orig_size() enabled, in which case the right + * redzone may be extended. 
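kmem_cache_alloc_bulk_noprof() above draws at most one kfence object per call, bulk-allocates the rest as usual, and then splices the kfence object into the result array at a random index so it is not always the last element. A userspace model of just that placement step, with rand() standing in for get_random_u32_below():

#include <stdio.h>
#include <stdlib.h>

static void place_special(void **p, size_t size, void *special)
{
	size_t idx = (size_t)rand() % (size + 1);	/* 0..size inclusive */

	if (idx != size)
		p[size] = p[idx];	/* displaced object becomes the new tail */
	p[idx] = special;
}

int main(void)
{
	char a, b, c, kf;
	void *objs[4] = { &a, &b, &c, NULL };

	place_special(objs, 3, &kf);	/* 3 normal objects plus one special */
	for (int i = 0; i < 4; i++)
		printf("%p\n", objs[i]);
	return 0;
}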
* * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the @@ -8127,46 +8147,53 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) * Kmalloc subsystem *******************************************************************/ -static int __init setup_slub_min_order(char *str) +static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_min_order); + int ret; + + ret = kstrtouint(str, 0, &slub_min_order); + if (ret) + return ret; if (slub_min_order > slub_max_order) slub_max_order = slub_min_order; - return 1; + return 0; } -__setup("slab_min_order=", setup_slub_min_order); -__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); - +static const struct kernel_param_ops param_ops_slab_min_order __initconst = { + .set = setup_slub_min_order, +}; +__core_param_cb(slab_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); +__core_param_cb(slub_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); -static int __init setup_slub_max_order(char *str) +static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_max_order); + int ret; + + ret = kstrtouint(str, 0, &slub_max_order); + if (ret) + return ret; + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); if (slub_min_order > slub_max_order) slub_min_order = slub_max_order; - return 1; + return 0; } -__setup("slab_max_order=", setup_slub_max_order); -__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); - -static int __init setup_slub_min_objects(char *str) -{ - get_option(&str, (int *)&slub_min_objects); - - return 1; -} +static const struct kernel_param_ops param_ops_slab_max_order __initconst = { + .set = setup_slub_max_order, +}; +__core_param_cb(slab_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); +__core_param_cb(slub_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); -__setup("slab_min_objects=", setup_slub_min_objects); -__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); +core_param(slab_min_objects, slub_min_objects, uint, 0); +core_param(slub_min_objects, slub_min_objects, uint, 0); #ifdef CONFIG_NUMA -static int __init setup_slab_strict_numa(char *str) +static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) { if (nr_node_ids > 1) { static_branch_enable(&strict_numa); @@ -8175,10 +8202,14 @@ static int __init setup_slab_strict_numa(char *str) pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); } - return 1; + return 0; } -__setup("slab_strict_numa", setup_slab_strict_numa); +static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slab_strict_numa, +}; +__core_param_cb(slab_strict_numa, ¶m_ops_slab_strict_numa, NULL, 0); #endif @@ -8504,10 +8535,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { -#ifndef CONFIG_SLUB_TINY flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); WARN_ON(!flushwq); -#endif } struct kmem_cache * diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index dbd8daccade2..37522d6cb398 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -439,7 +439,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, return -ENOMEM; pmd = pmd_offset(pud, addr); - if (pmd_none(READ_ONCE(*pmd))) { + if (pmd_none(pmdp_get(pmd))) { void *p; p = 
vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); diff --git a/mm/sparse.c b/mm/sparse.c index 17c50a6415c2..b5b2b6f7041b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - if (!altmap || !altmap->inaccessible) - page_init_poison(memmap, sizeof(struct page) * nr_pages); + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); diff --git a/mm/swap.h b/mm/swap.h index 8d8efdf1297a..d034c13d8dd2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -445,25 +445,4 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) return 0; } #endif /* CONFIG_SWAP */ - -/** - * folio_index - File index of a folio. - * @folio: The folio. - * - * For a folio which is either in the page cache or the swap cache, - * return its index within the address_space it belongs to. If you know - * the folio is definitely in the page cache, you can look at the folio's - * index directly. - * - * Return: The index (offset in units of pages) of a folio in its file. - */ -static inline pgoff_t folio_index(struct folio *folio) -{ -#ifdef CONFIG_SWAP - if (unlikely(folio_test_swapcache(folio))) - return swp_offset(folio->swap); -#endif - return folio->index; -} - #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index b13e9c4baa90..5f97c6ae70a2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -12,7 +12,7 @@ #include <linux/kernel_stat.h> #include <linux/mempolicy.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/pagevec.h> @@ -509,10 +509,6 @@ put_and_return: * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. - * - * get/put_swap_device() aren't needed to call this function, because - * __read_swap_cache_async() call them and swap_read_folio() holds the - * swap cache folio lock. */ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, @@ -736,7 +732,6 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_t *pte = NULL, pentry; int win; unsigned long start, end, addr; - swp_entry_t entry; pgoff_t ilx; bool page_allocated; @@ -748,21 +743,34 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, blk_start_plug(&plug); for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { + struct swap_info_struct *si = NULL; + softleaf_t entry; + if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) break; } pentry = ptep_get_lockless(pte); - if (!is_swap_pte(pentry)) - continue; - entry = pte_to_swp_entry(pentry); - if (unlikely(non_swap_entry(entry))) + entry = softleaf_from_pte(pentry); + + if (!softleaf_is_swap(entry)) continue; pte_unmap(pte); pte = NULL; + /* + * Readahead entry may come from a device that we are not + * holding a reference to, try to grab a reference, or skip. 
+ */ + if (swp_type(entry) != swp_type(targ_entry)) { + si = get_swap_device(entry); + if (!si) + continue; + } folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); + if (si) + put_swap_device(si); if (!folio) continue; if (page_allocated) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 10760240a3a2..46d2008e4b99 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -44,7 +44,7 @@ #include <linux/plist.h> #include <asm/tlbflush.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/swap_cgroup.h> #include "swap_table.h" #include "internal.h" @@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority = -1; +#define DEF_SWAP_PRIO -1 unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static struct plist_head *swap_avail_heads; +static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -236,11 +236,10 @@ again: ret = -nr_pages; /* - * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming folios. So we hold a folio lock - * here. We have to use trylock for avoiding deadlock. This is a special - * case and you should use folio_free_swap() with explicit folio_lock() - * in usual operations. + * We hold a folio lock here. We have to use trylock for + * avoiding deadlock. This is a special case and you should + * use folio_free_swap() with explicit folio_lock() in usual + * operations. */ if (!folio_trylock(folio)) goto out; @@ -594,7 +593,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. */ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list, int order) + struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *found = NULL; @@ -751,14 +750,14 @@ static void relocate_cluster(struct swap_info_struct *si, } /* - * The cluster corresponding to page_nr will be used. The cluster will not be - * added to free cluster list and its usage counter will be increased by 1. - * Only used for initialization. + * The cluster corresponding to @offset will be accounted as having one bad + * slot. The cluster will not be added to the free cluster list, and its + * usage counter will be increased by 1. Only used for initialization. 
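The readahead hunk above pins a swap device with get_swap_device() only when the entry belongs to a device other than the one the caller already holds, and skips the entry when the reference cannot be taken (the device may be in the middle of swapoff). A minimal refcount model of that rule, with all names invented for the sketch:

#include <stdbool.h>
#include <stdio.h>

struct swapdev_model {
	int id;
	int users;		/* 0 means the device is going away */
};

static bool get_dev(struct swapdev_model *d)
{
	if (d->users == 0)
		return false;	/* mid-swapoff: skip this entry */
	d->users++;
	return true;
}

static void put_dev(struct swapdev_model *d)
{
	d->users--;
}

static void handle_entry(struct swapdev_model *entry_dev,
			 struct swapdev_model *target)
{
	struct swapdev_model *pinned = NULL;

	if (entry_dev->id != target->id) {	/* foreign device: pin it */
		if (!get_dev(entry_dev))
			return;
		pinned = entry_dev;
	}
	printf("read ahead from device %d\n", entry_dev->id);
	if (pinned)
		put_dev(pinned);
}

int main(void)
{
	struct swapdev_model a = { 0, 1 }, b = { 1, 1 };

	handle_entry(&b, &a);
	return 0;
}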
*/ -static int inc_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, + unsigned long offset) { - unsigned long idx = page_nr / SWAPFILE_CLUSTER; + unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_table *table; struct swap_cluster_info *ci; @@ -772,8 +771,8 @@ static int inc_cluster_info_page(struct swap_info_struct *si, ci->count++; - VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); - VM_BUG_ON(ci->flags); + WARN_ON(ci->count > SWAPFILE_CLUSTER); + WARN_ON(ci->flags); return 0; } @@ -957,7 +956,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, unsigned int found = SWAP_ENTRY_INVALID; do { - struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); + struct swap_cluster_info *ci = isolate_lock_cluster(si, list); unsigned long offset; if (!ci) @@ -982,7 +981,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; @@ -1101,13 +1100,6 @@ new_cluster: goto done; } - /* - * We don't have free cluster but have some clusters in discarding, - * do discard now and reclaim them. - */ - if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) - goto new_cluster; - if (order) goto done; @@ -1137,7 +1129,6 @@ done: /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { - int nid; unsigned long pages; spin_lock(&swap_avail_lock); @@ -1166,8 +1157,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) goto skip; } - for_each_node(nid) - plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1176,7 +1166,6 @@ skip: /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. 
*/ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { - int nid; long val; unsigned long pages; @@ -1209,8 +1198,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) goto skip; } - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1350,54 +1338,79 @@ static bool swap_alloc_fast(swp_entry_t *entry, } /* Rotate the device and switch to a new cluster */ -static bool swap_alloc_slow(swp_entry_t *entry, +static void swap_alloc_slow(swp_entry_t *entry, int order) { - int node; unsigned long offset; struct swap_info_struct *si, *next; - node = numa_node_id(); spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ - plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); + plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); put_swap_device(si); if (offset) { *entry = swp_entry(si->type, offset); - return true; + return; } if (order) - return false; + return; } spin_lock(&swap_avail_lock); /* * if we got here, it's likely that si was almost full before, - * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si - * filled up between us dropping swap_avail_lock and taking - * si->lock. Since we dropped the swap_avail_lock, the - * swap_avail_head list may have been modified; so if next is - * still in the swap_avail_head list then try it, otherwise - * start over if we have not gotten any slots. + * filled up between us dropping swap_avail_lock. + * Since we dropped the swap_avail_lock, the swap_avail_list + * may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over if we + * have not gotten any slots. */ - if (plist_node_empty(&next->avail_lists[node])) + if (plist_node_empty(&next->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); +} + +/* + * Discard pending clusters in a synchronized way when under high pressure. + * Return: true if any cluster is discarded. + */ +static bool swap_sync_discard(void) +{ + bool ret = false; + struct swap_info_struct *si, *next; + + spin_lock(&swap_lock); +start_over: + plist_for_each_entry_safe(si, next, &swap_active_head, list) { + spin_unlock(&swap_lock); + if (get_swap_device_info(si)) { + if (si->flags & SWP_PAGE_DISCARD) + ret = swap_do_scheduled_discard(si); + put_swap_device(si); + } + if (ret) + return true; + + spin_lock(&swap_lock); + if (plist_node_empty(&next->list)) + goto start_over; + } + spin_unlock(&swap_lock); + return false; } /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap - * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. @@ -1405,7 +1418,7 @@ start_over: * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. 
*/ -int folio_alloc_swap(struct folio *folio, gfp_t gfp) +int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; @@ -1432,11 +1445,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) } } +again: local_lock(&percpu_swap_cluster.lock); if (!swap_alloc_fast(&entry, order)) swap_alloc_slow(&entry, order); local_unlock(&percpu_swap_cluster.lock); + if (unlikely(!order && !entry.val)) { + if (swap_sync_discard()) + goto again; + } + /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ if (mem_cgroup_try_charge_swap(folio, entry)) goto out_free; @@ -1677,7 +1696,7 @@ static bool swap_entries_put_map_nr(struct swap_info_struct *si, /* * Check if it's the last ref of swap entry in the freeing path. - * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. + * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. */ static inline bool __maybe_unused swap_is_last_ref(unsigned char count) { @@ -2005,10 +2024,8 @@ swp_entry_t get_swap_page_of_type(int type) local_lock(&percpu_swap_cluster.lock); offset = cluster_alloc_swap_entry(si, 0, 1); local_unlock(&percpu_swap_cluster.lock); - if (offset) { + if (offset) entry = swp_entry(si->type, offset); - atomic_long_dec(&nr_swap_pages); - } } put_swap_device(si); } @@ -2241,7 +2258,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct folio *folio; unsigned long offset; unsigned char swp_count; - swp_entry_t entry; + softleaf_t entry; int ret; pte_t ptent; @@ -2252,11 +2269,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } ptent = ptep_get_lockless(pte); + entry = softleaf_from_pte(ptent); - if (!is_swap_pte(ptent)) + if (!softleaf_is_swap(entry)) continue; - - entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; @@ -2684,44 +2700,18 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *si) -{ - struct block_device *bdev; - - if (si->bdev) - bdev = si->bdev; - else - bdev = si->swap_file->f_inode->i_sb->s_bdev; - - return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; -} - static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - int i; - - if (prio >= 0) - si->prio = prio; - else - si->prio = --least_priority; + si->prio = prio; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; - for_each_node(i) { - if (si->prio >= 0) - si->avail_lists[i].prio = -si->prio; - else { - if (swap_node(si) == i) - si->avail_lists[i].prio = 1; - else - si->avail_lists[i].prio = -si->prio; - } - } + si->avail_list.prio = -si->prio; si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; @@ -2733,16 +2723,7 @@ static void _enable_swap_info(struct swap_info_struct *si) total_swap_pages += si->pages; assert_spin_locked(&swap_lock); - /* - * both lists are plists, and thus priority ordered. - * swap_active_head needs to be priority ordered for swapoff(), - * which on removal of any swap_info_struct with an auto-assigned - * (i.e. negative) priority increments the auto-assigned priority - * of any lower-priority swap_info_structs. 
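A quick worked example of the negated plist priorities set up in setup_swap_info() above: a swap device registered with priority 5 ends up with si->list.prio == si->avail_list.prio == -5, while one left at the default DEF_SWAP_PRIO (-1) ends up with prio 1. Plists sort low-to-high, so -5 sorts ahead of 1 and swap_alloc_slow() visits the higher-priority device first when walking swap_avail_head.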
- * swap_avail_head needs to be priority ordered for folio_alloc_swap(), - * which allocates swap pages from the highest available priority - * swap_info_struct. - */ + plist_add(&si->list, &swap_active_head); /* Add back to available list */ @@ -2892,20 +2873,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } spin_lock(&p->lock); del_from_avail_list(p, true); - if (p->prio < 0) { - struct swap_info_struct *si = p; - int nid; - - plist_for_each_entry_continue(si, &swap_active_head, list) { - si->prio++; - si->list.prio--; - for_each_node(nid) { - if (si->avail_lists[nid].prio != 1) - si->avail_lists[nid].prio--; - } - } - least_priority++; - } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; @@ -2944,7 +2911,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); - if (!p->bdev || !bdev_nonrot(p->bdev)) + if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); @@ -3143,9 +3110,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; - int i; - p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); + p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -3184,8 +3150,7 @@ static struct swap_info_struct *alloc_swap_info(void) } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); - for_each_node(i) - plist_node_init(&p->avail_lists[i], 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { @@ -3238,8 +3203,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) */ unsigned long generic_max_swapfile_size(void) { - return swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + swp_entry_t entry = swp_entry(0, ~0UL); + const pte_t pte = softleaf_to_pte(entry); + + /* + * Since the PTE can be an invalid softleaf entry (e.g. the none PTE), + * we need to do this manually. + */ + entry = __pte_to_swp_entry(pte); + entry = swp_entry(__swp_type(entry), __swp_offset(entry)); + + return swp_offset(entry) + 1; } /* Can be overridden by an architecture for additional checks. */ @@ -3357,7 +3331,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); if (!si->global_cluster) - goto err_free; + goto err; for (i = 0; i < SWAP_NR_ORDERS; i++) si->global_cluster->next[i] = SWAP_ENTRY_INVALID; spin_lock_init(&si->global_cluster_lock); @@ -3370,7 +3344,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. 
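To make the round trip in generic_max_swapfile_size() concrete: on a hypothetical architecture whose swap PTE encoding preserves only 50 offset bits, the all-ones offset written via softleaf_to_pte() comes back from __pte_to_swp_entry()/__swp_offset() truncated to 2^50 - 1, so the function reports 2^50 as the maximum number of swappable pages. The manual __swp_type()/__swp_offset() decode is used because, as the comment notes, the intermediate PTE need not be a valid softleaf entry.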
*/ - err = inc_cluster_info_page(si, cluster_info, 0); + err = swap_cluster_setup_bad_slot(cluster_info, 0); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { @@ -3378,12 +3352,12 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (page_nr >= maxpages) continue; - err = inc_cluster_info_page(si, cluster_info, page_nr); + err = swap_cluster_setup_bad_slot(cluster_info, page_nr); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { - err = inc_cluster_info_page(si, cluster_info, i); + err = swap_cluster_setup_bad_slot(cluster_info, i); if (err) goto err; } @@ -3410,9 +3384,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } return cluster_info; -err_free: - free_cluster_info(cluster_info, maxpages); err: + free_cluster_info(cluster_info, maxpages); return ERR_PTR(err); } @@ -3442,9 +3415,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!swap_avail_heads) - return -ENOMEM; - si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -3621,7 +3591,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } mutex_lock(&swapon_mutex); - prio = -1; + prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); @@ -4053,8 +4023,7 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { - struct swap_info_struct *si, *next; - int nid = folio_nid(folio); + struct swap_info_struct *si; if (!(gfp & __GFP_IO)) return; @@ -4073,8 +4042,7 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], - avail_lists[nid]) { + plist_for_each_entry(si, &swap_avail_head, avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; @@ -4086,18 +4054,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) static int __init swapfile_init(void) { - int nid; - - swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), - GFP_KERNEL); - if (!swap_avail_heads) { - pr_emerg("Not enough memory for swap heads, swap is disabled\n"); - return -ENOMEM; - } - - for_each_node(nid) - plist_head_init(&swap_avail_heads[nid]); - swapfile_maximum_size = arch_max_swapfile_size(); /* diff --git a/mm/truncate.c b/mm/truncate.c index 91eb92a5ce4f..12467c1bd711 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -46,7 +46,7 @@ static void clear_shadow_entries(struct address_space *mapping, xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } @@ -111,7 +111,7 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping, xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); out: folio_batch_remove_exceptionals(fbatch); @@ -177,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) return 0; } +static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, + unsigned long min_order) +{ + enum ttu_flags ttu_flags = + TTU_SYNC | + TTU_SPLIT_HUGE_PMD | + TTU_IGNORE_MLOCK; + int ret; + + ret = try_folio_split_to_order(folio, split_at, 
min_order); + + /* + * If the split fails, unmap the folio, so it will be refaulted + * with PTEs to respect SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + if (ret && !shmem_mapping(folio->mapping)) { + try_to_unmap(folio, ttu_flags); + WARN_ON(folio_mapped(folio)); + } + + return ret; +} + /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. If not, we zero the part of the @@ -194,6 +220,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; + unsigned int min_order; if (pos < start) offset = start - pos; @@ -223,8 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_test_large(folio)) return true; + min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - if (!try_folio_split(folio, split_at, NULL)) { + if (!try_folio_split_or_unmap(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste @@ -248,13 +276,10 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_trylock(folio2)) goto out; - /* - * make sure folio2 is large and does not change its mapping. - * Its split result does not matter here. - */ + /* make sure folio2 is large and does not change its mapping */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) - try_folio_split(folio2, split_at2, NULL); + try_folio_split_or_unmap(folio2, split_at2, min_order); folio_unlock(folio2); out: @@ -339,7 +364,7 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) * page aligned properly. 
*/ void truncate_inode_pages_range(struct address_space *mapping, - loff_t lstart, loff_t lend) + loff_t lstart, uoff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ @@ -387,7 +412,7 @@ void truncate_inode_pages_range(struct address_space *mapping, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) @@ -622,7 +647,7 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); diff --git a/mm/usercopy.c b/mm/usercopy.c index dbdcc43964fb..5de7a518b1b1 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -164,7 +164,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, { unsigned long addr = (unsigned long)ptr; unsigned long offset; - struct folio *folio; + struct page *page; + struct slab *slab; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); @@ -189,16 +190,23 @@ static inline void check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return; - folio = virt_to_folio(ptr); - - if (folio_test_slab(folio)) { + page = virt_to_page(ptr); + slab = page_slab(page); + if (slab) { /* Check slab allocator for flags and size. */ - __check_heap_object(ptr, n, folio_slab(folio), to_user); - } else if (folio_test_large(folio)) { - offset = ptr - folio_address(folio); - if (n > folio_size(folio) - offset) + __check_heap_object(ptr, n, slab, to_user); + } else if (PageCompound(page)) { + page = compound_head(page); + offset = ptr - page_address(page); + if (n > page_size(page) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } + + /* + * We cannot check non-compound pages. They might be part of + * a large allocation, in which case crossing a page boundary + * is fine. + */ } DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af61b95c89e4..e6dfd5f28acd 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> @@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, spinlock_t *ptl; struct folio *folio = page_folio(page); bool page_in_cache = folio_mapping(folio); + pte_t dst_ptep; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, } ret = -EEXIST; + + dst_ptep = ptep_get(dst_pte); + /* - * We allow to overwrite a pte marker: consider when both MISSING|WP - * registered, we firstly wr-protect a none pte which has no page cache - * page backing it, then access the page. + * We are allowed to overwrite a UFFD pte marker: consider when both + * MISSING|WP registered, we firstly wr-protect a none pte which has no + * page cache page backing it, then access the page. 
*/ - if (!pte_none_mostly(ptep_get(dst_pte))) + if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_unlock; if (page_in_cache) { @@ -583,12 +587,15 @@ retry: goto out_unlock; } - if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && - !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { - err = -EEXIST; - hugetlb_vma_unlock_read(dst_vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - goto out_unlock; + if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { + const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); + + if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { + err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } } err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, @@ -1035,8 +1042,7 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, */ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, unsigned long src_addr, - pte_t *src_pte, pte_t *dst_pte, - struct anon_vma *src_anon_vma) + pte_t *src_pte, pte_t *dst_pte) { pte_t orig_dst_pte, orig_src_pte; struct folio *folio; @@ -1052,8 +1058,7 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !folio_trylock(folio)) return NULL; - if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) || - folio_anon_vma(folio) != src_anon_vma) { + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { folio_unlock(folio); return NULL; } @@ -1061,9 +1066,8 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, } /* - * Moves src folios to dst in a batch as long as they share the same - * anon_vma as the first folio, are not large, and can successfully - * take the lock via folio_trylock(). + * Moves src folios to dst in a batch as long as they are not large, and can + * successfully take the lock via folio_trylock(). 
*/ static long move_present_ptes(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1073,8 +1077,7 @@ static long move_present_ptes(struct mm_struct *mm, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio **first_src_folio, unsigned long len, - struct anon_vma *src_anon_vma) + struct folio **first_src_folio, unsigned long len) { int err = 0; struct folio *src_folio = *first_src_folio; @@ -1116,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm, orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); /* Set soft dirty bit so userspace can notice the pte was moved */ -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); -#endif + if (pgtable_supports_soft_dirty()) + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); if (pte_dirty(orig_src_pte)) orig_dst_pte = pte_mkdirty(orig_dst_pte); orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); @@ -1132,8 +1134,8 @@ static long move_present_ptes(struct mm_struct *mm, src_pte++; folio_unlock(src_folio); - src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, - dst_pte, src_anon_vma); + src_folio = check_ptes_for_batched_move(src_vma, src_addr, + src_pte, dst_pte); if (!src_folio) break; } @@ -1205,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); -#endif + if (pgtable_supports_soft_dirty()) + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1253,7 +1254,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd unsigned long dst_addr, unsigned long src_addr, unsigned long len, __u64 mode) { - swp_entry_t entry; struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; @@ -1263,7 +1263,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd pmd_t dummy_pmdval; pmd_t dst_pmdval; struct folio *src_folio = NULL; - struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; long ret = 0; @@ -1347,9 +1346,9 @@ retry: } /* - * Pin and lock both source folio and anon_vma. Since we are in - * RCU read section, we can't block, so on contention have to - * unmap the ptes, obtain the lock and retry. + * Pin and lock source folio. Since we are in RCU read section, + * we can't block, so on contention have to unmap the ptes, + * obtain the lock and retry. */ if (!src_folio) { struct folio *folio; @@ -1423,46 +1422,25 @@ retry: goto retry; } - if (!src_anon_vma) { - /* - * folio_referenced walks the anon_vma chain - * without the folio lock. Serialize against it with - * the anon_vma lock, the folio lock is not enough. 
- */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - /* page was unmapped from under us */ - ret = -EAGAIN; - goto out; - } - if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(src_pte); - pte_unmap(dst_pte); - src_pte = dst_pte = NULL; - /* now we can block and wait */ - anon_vma_lock_write(src_anon_vma); - goto retry; - } - } - ret = move_present_ptes(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, &src_folio, - len, src_anon_vma); - } else { + len); + } else { /* !pte_present() */ struct folio *folio = NULL; + const softleaf_t entry = softleaf_from_pte(orig_src_pte); - entry = pte_to_swp_entry(orig_src_pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - pte_unmap(src_pte); - pte_unmap(dst_pte); - src_pte = dst_pte = NULL; - migration_entry_wait(mm, src_pmd, src_addr); - ret = -EAGAIN; - } else - ret = -EFAULT; + if (softleaf_is_migration(entry)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + migration_entry_wait(mm, src_pmd, src_addr); + + ret = -EAGAIN; + goto out; + } else if (!softleaf_is_swap(entry)) { + ret = -EFAULT; goto out; } @@ -1515,10 +1493,6 @@ retry: } out: - if (src_anon_vma) { - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); - } if (src_folio) { folio_unlock(src_folio); folio_put(src_folio); @@ -1578,7 +1552,7 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, /* * For now, we keep it simple and only move between writable VMAs. - * Access flags are equal, therefore cheching only the source is enough. + * Access flags are equal, therefore checking only the source is enough. */ if (!(src_vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1792,15 +1766,6 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma, * virtual regions without knowing if there are transparent hugepage * in the regions or not, but preventing the risk of having to split * the hugepmd during the remap. - * - * If there's any rmap walk that is taking the anon_vma locks without - * first obtaining the folio lock (the only current instance is - * folio_referenced), they will have to verify if the folio->mapping - * has changed after taking the anon_vma lock. If it changed they - * should release the lock and retry obtaining a new anon_vma, because - * it means the anon_vma was changed by move_pages() before the lock - * could be obtained. This is the only additional complexity added to - * the rmap code to provide this anonymous page remapping functionality. */ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode) diff --git a/mm/util.c b/mm/util.c index 8989d5767528..97cae40c0209 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** - * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * __compat_vma_mmap() - See description for compat_vma_mmap() * for details. This is the same operation, only with a specific file operations * struct which may or may not be the same as vma->vm_file->f_op. * @f_op: The file operations whose .mmap_prepare() hook is specified. @@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio); * @vma: The VMA to apply the .mmap_prepare() hook to. * Returns: 0 on success or error. 
*/ -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { @@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -EXPORT_SYMBOL(__compat_vma_mmap_prepare); +EXPORT_SYMBOL(__compat_vma_mmap); /** - * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA. + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * @@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * - * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * compat_vma_mmap() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * @@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * * Returns: 0 on success or error. */ -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } -EXPORT_SYMBOL(compat_vma_mmap_prepare); +EXPORT_SYMBOL(compat_vma_mmap); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) @@ -1280,6 +1283,127 @@ again: } } +static int mmap_action_finish(struct mmap_action *action, + const struct vm_area_struct *vma, int err) +{ + /* + * If an error occurs, unmap the VMA altogether and return an error. We + * only clear the newly allocated VMA, since this function is only + * invoked if we do NOT merge, so we only clean up the VMA we created. + */ + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); + } + return err; + } + + if (action->success_hook) + return action->success_hook(vma); + + return 0; +} + +#ifdef CONFIG_MMU +/** + * mmap_action_prepare - Perform preparatory setup for an VMA descriptor + * action which need to be performed. + * @desc: The VMA descriptor to prepare for @action. + * @action: The action to perform. 
+ */ +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + remap_pfn_range_prepare(desc, action->remap.start_pfn); + break; + case MMAP_IO_REMAP_PFN: + io_remap_pfn_range_prepare(desc, action->remap.start_pfn, + action->remap.size); + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +/** + * mmap_action_complete - Execute VMA descriptor action. + * @action: The action to perform. + * @vma: The VMA to perform the action upon. + * + * Similar to mmap_action_prepare(). + * + * Return: 0 on success, or error, at which point the VMA will be unmapped. + */ +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + err = remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + case MMAP_IO_REMAP_PFN: + err = io_remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#else +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle these. */ + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle this. */ + + err = -EINVAL; + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#endif + #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio @@ -34,7 +34,9 @@ struct mmap_state { struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ - bool check_ksm_early; + bool check_ksm_early :1; + /* If we map new, hold the file rmap lock on mapping. */ + bool hold_file_rmap_lock :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -87,15 +89,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - /* - * VM_SOFTDIRTY should not prevent from VMA merging, if we - * match the flags but dirty bit -- the caller should mark - * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressure on the memory system forcing - * the kernel to generate new VMAs when old one could be - * extended instead. - */ - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) return false; if (vma->vm_file != vmg->file) return false; @@ -109,7 +103,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; - struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ + struct vm_area_struct *src = vmg->middle; /* existing merge case. 
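To make the calling convention concrete, here is a rough editorial sketch of a driver-side .mmap_prepare() hook that defers its PFN remap to the action machinery added above. The struct and field names follow the hunks above; the device region (my_dev_phys, MYDRV_REGION_SIZE) and the bounds check are invented for illustration, and the exact split of work between the hook and mmap_action_prepare()/mmap_action_complete() is an assumption.

#include <linux/mm.h>
#include <linux/sizes.h>

/* Hypothetical device region; not part of the patch. */
static phys_addr_t my_dev_phys;
#define MYDRV_REGION_SIZE	SZ_1M

static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
	const unsigned long size = desc->end - desc->start;

	if (size > MYDRV_REGION_SIZE)
		return -EINVAL;

	desc->page_prot = pgprot_noncached(desc->page_prot);

	/* Ask the core to perform remap_pfn_range() on our behalf. */
	desc->action.type = MMAP_REMAP_PFN;
	desc->action.remap.start = desc->start;
	desc->action.remap.start_pfn = PHYS_PFN(my_dev_phys) + desc->pgoff;
	desc->action.remap.size = size;
	desc->action.remap.pgprot = desc->page_prot;

	return 0;
}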
*/ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; @@ -481,8 +475,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, - /* mm_wr_locked = */ true); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, @@ -798,7 +791,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: - * - The caller must assign the VMA to be modifed to @vmg->middle. + * - The caller must assign the VMA to be modified to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. @@ -807,6 +800,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { + vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -899,11 +893,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (merge_right) { vma_start_write(next); vmg->target = next; + sticky_flags |= (next->vm_flags & VM_STICKY); } if (merge_left) { vma_start_write(prev); vmg->target = prev; + sticky_flags |= (prev->vm_flags & VM_STICKY); } if (merge_both) { @@ -973,6 +969,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; + vm_flags_set(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1123,6 +1120,10 @@ int vma_expand(struct vma_merge_struct *vmg) bool remove_next = false; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + vm_flags_t sticky_flags; + + sticky_flags = vmg->vm_flags & VM_STICKY; + sticky_flags |= target->vm_flags & VM_STICKY; VM_WARN_ON_VMG(!target, vmg); @@ -1132,6 +1133,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (next && (target != next) && (vmg->end == next->vm_end)) { int ret; + sticky_flags |= next->vm_flags & VM_STICKY; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); @@ -1158,6 +1160,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (commit_merge(vmg)) goto nomem; + vm_flags_set(target, sticky_flags); return 0; nomem: @@ -1226,7 +1229,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, - vms->vma_count, mm_wr_locked); + vms->vma_count); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. 
*/ @@ -1637,25 +1640,35 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) return vma; } -struct vm_area_struct *vma_modify_flags( - struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t vm_flags) +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + vm_flags_t *vm_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + const vm_flags_t vm_flags = *vm_flags_ptr; + struct vm_area_struct *ret; vmg.vm_flags = vm_flags; - return vma_modify(&vmg); + ret = vma_modify(&vmg); + if (IS_ERR(ret)) + return ret; + + /* + * For a merge to succeed, the flags must match those + * requested. However, sticky flags may have been retained, so propagate + * them to the caller. + */ + if (vmg.state == VMA_MERGE_SUCCESS) + *vm_flags_ptr = ret->vm_flags; + return ret; } -struct vm_area_struct -*vma_modify_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - struct anon_vma_name *new_name) +struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); @@ -1664,12 +1677,10 @@ struct vm_area_struct return vma_modify(&vmg); } -struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct mempolicy *new_pol) +struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); @@ -1678,14 +1689,10 @@ struct vm_area_struct return vma_modify(&vmg); } -struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, - bool give_up_on_oom) +struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t vm_flags, + struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); @@ -1754,24 +1761,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) unlink_file_vma_batch_process(vb); } -/* - * Unlink a file-based vm structure from its interval tree, to hide - * vma from rmap and vmtruncate before freeing its page tables. 
- */ -void unlink_file_vma(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - - if (file) { - struct address_space *mapping = file->f_mapping; - - i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, mapping); - i_mmap_unlock_write(mapping); - } -} - -void vma_link_file(struct vm_area_struct *vma) +static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; @@ -1780,11 +1770,12 @@ void vma_link_file(struct vm_area_struct *vma) mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); + if (!hold_rmap_lock) + i_mmap_unlock_write(mapping); } } -int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); @@ -1794,7 +1785,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) vma_start_write(vma); vma_iter_store_new(&vmi, vma); - vma_link_file(vma); + vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; @@ -1917,7 +1908,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } @@ -2328,17 +2319,33 @@ static void update_ksm_flags(struct mmap_state *map) map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } +static void set_desc_from_map(struct vm_area_desc *desc, + const struct mmap_state *map) +{ + desc->start = map->addr; + desc->end = map->end; + + desc->pgoff = map->pgoff; + desc->vm_file = map->file; + desc->vm_flags = map->vm_flags; + desc->page_prot = map->page_prot; +} + /* - * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be + * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * + * As a result it sets up the @map and @desc objects. + * * @map: Mapping state. + * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ -static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) +static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, + struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; @@ -2395,6 +2402,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) */ vms_clean_up_area(vms, &map->mas_detach); + set_desc_from_map(desc, map); return 0; } @@ -2496,7 +2504,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma); + vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2551,11 +2559,23 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * then new mapped in-place (which must be aimed as * a completely new data area). 
*/ - vm_flags_set(vma, VM_SOFTDIRTY); + if (pgtable_supports_soft_dirty()) + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } +static void call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + + mmap_action_prepare(action, desc); + + if (action->hide_from_rmap_until_complete) + map->hold_file_rmap_lock = true; +} + /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. @@ -2567,34 +2587,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * * Returns 0 on success, or an error code otherwise. */ -static int call_mmap_prepare(struct mmap_state *map) +static int call_mmap_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { int err; - struct vm_area_desc desc = { - .mm = map->mm, - .file = map->file, - .start = map->addr, - .end = map->end, - - .pgoff = map->pgoff, - .vm_file = map->file, - .vm_flags = map->vm_flags, - .page_prot = map->page_prot, - }; /* Invoke the hook. */ - err = vfs_mmap_prepare(map->file, &desc); + err = vfs_mmap_prepare(map->file, desc); if (err) return err; + call_action_prepare(map, desc); + /* Update fields permitted to be changed. */ - map->pgoff = desc.pgoff; - map->file = desc.vm_file; - map->vm_flags = desc.vm_flags; - map->page_prot = desc.page_prot; + map->pgoff = desc->pgoff; + map->file = desc->vm_file; + map->vm_flags = desc->vm_flags; + map->page_prot = desc->page_prot; /* User-defined fields. */ - map->vm_ops = desc.vm_ops; - map->vm_private_data = desc.private_data; + map->vm_ops = desc->vm_ops; + map->vm_private_data = desc->private_data; return 0; } @@ -2636,22 +2648,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } +static int call_action_complete(struct mmap_state *map, + struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ + struct mmap_action *action = &desc->action; + int ret; + + ret = mmap_action_complete(action, vma); + + /* If we held the file rmap we need to release it. */ + if (map->hold_file_rmap_lock) { + struct file *file = vma->vm_file; + + i_mmap_unlock_write(file->f_mapping); + } + return ret; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + struct vm_area_desc desc = { + .mm = mm, + .file = file, + .action = { + .type = MMAP_NOTHING, /* Default to no further action. 
 */
+	},
+	};
+	bool allocated_new = false;
+	int error;
 
 	map.check_ksm_early = can_set_ksm_flags_early(&map);
 
-	error = __mmap_prepare(&map, uf);
+	error = __mmap_setup(&map, &desc, uf);
 	if (!error && have_mmap_prepare)
-		error = call_mmap_prepare(&map);
+		error = call_mmap_prepare(&map, &desc);
 	if (error)
 		goto abort_munmap;
@@ -2670,6 +2708,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 		error = __mmap_new_vma(&map, &vma);
 		if (error)
 			goto unacct_error;
+		allocated_new = true;
 	}
 
 	if (have_mmap_prepare)
@@ -2677,9 +2716,16 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 
 	__mmap_complete(&map, vma);
 
+	if (have_mmap_prepare && allocated_new) {
+		error = call_action_complete(&map, &desc, vma);
+
+		if (error)
+			return error;
+	}
+
 	return addr;
 
-	/* Accounting was done by __mmap_prepare(). */
+	/* Accounting was done by __mmap_setup(). */
 unacct_error:
 	if (map.charged)
 		vm_unacct_memory(map.charged);
@@ -2819,7 +2865,8 @@ out:
 	mm->data_vm += len >> PAGE_SHIFT;
 	if (vm_flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
-	vm_flags_set(vma, VM_SOFTDIRTY);
+	if (pgtable_supports_soft_dirty())
+		vm_flags_set(vma, VM_SOFTDIRTY);
 	return 0;
 
 mas_store_fail:
@@ -263,47 +263,113 @@ void remove_vma(struct vm_area_struct *vma);
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
 
-/* We are about to modify the VMA's flags. */
-__must_check struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
+/**
+ * vma_modify_flags() - Perform any necessary split/merge in preparation for
+ * setting VMA flags to *@vm_flags_ptr in the range @start to @end contained
+ * within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is
+ * about to be set to. On merge, this will be updated to include sticky flags.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points
+ * to the requested flags which are then updated so the caller, should they
+ * overwrite any existing flags, correctly retains these.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * flags altered to *@vm_flags_ptr.
+ */
+__must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+		struct vm_area_struct *prev, struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		vm_flags_t *vm_flags_ptr);
+
+/**
+ * vma_modify_name() - Perform any necessary split/merge in preparation for
+ * setting anonymous VMA name to @new_name in the range @start to @end
+ * contained within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @new_name: The anonymous VMA name that the @start to @end range is about to
+ * be set to.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * anonymous VMA name changed to @new_name.
+ */
+__must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
 	struct vm_area_struct *prev, struct vm_area_struct *vma,
 	unsigned long start, unsigned long end,
-	vm_flags_t vm_flags);
-
-/* We are about to modify the VMA's anon_name. */
-__must_check struct vm_area_struct
-*vma_modify_name(struct vma_iterator *vmi,
-		 struct vm_area_struct *prev,
-		 struct vm_area_struct *vma,
-		 unsigned long start,
-		 unsigned long end,
-		 struct anon_vma_name *new_name);
-
-/* We are about to modify the VMA's memory policy. */
-__must_check struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
-		   struct vm_area_struct *prev,
-		   struct vm_area_struct *vma,
+	struct anon_vma_name *new_name);
+
+/**
+ * vma_modify_policy() - Perform any necessary split/merge in preparation for
+ * setting NUMA policy to @new_pol in the range @start to @end contained
+ * within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @new_pol: The NUMA policy that the @start to @end range is about to be set
+ * to.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * NUMA policy changed to @new_pol.
+ */
+__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
+	struct vm_area_struct *prev, struct vm_area_struct *vma,
 	unsigned long start, unsigned long end,
 	struct mempolicy *new_pol);
 
-/* We are about to modify the VMA's flags and/or uffd context. */
-__must_check struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
-		       struct vm_area_struct *prev,
-		       struct vm_area_struct *vma,
-		       unsigned long start, unsigned long end,
-		       vm_flags_t vm_flags,
-		       struct vm_userfaultfd_ctx new_ctx,
-		       bool give_up_on_oom);
+/**
+ * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for
+ * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range
+ * @start to @end contained within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @vm_flags: The VMA flags that the @start to @end range is about to be set to.
+ * @new_ctx: The userfaultfd context that the @start to @end range is about to
+ * be set to.
+ * @give_up_on_oom: If an out of memory condition occurs on merge, simply give
+ * up on it and treat the merge as best-effort.
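A minimal sketch of the caller-side pattern the vma_modify_flags() kernel-doc above describes, assuming an mprotect-style caller inside mm/ (the helper name is hypothetical and error handling is trimmed):

static int example_apply_flags(struct vma_iterator *vmi,
		struct vm_area_struct *prev, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, vm_flags_t newflags)
{
	/* May come back augmented with sticky flags after a merge. */
	vm_flags_t flags = newflags;

	vma = vma_modify_flags(vmi, prev, vma, start, end, &flags);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/* The split/merge is done; the caller applies the flags itself. */
	vma_start_write(vma);
	vm_flags_reset(vma, flags);
	return 0;
}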
+ * + * IMPORTANT: The actual modification being requested here is NOT applied, + * rather the VMA is perhaps split, perhaps merged to accommodate the change, + * and the caller is expected to perform the actual modification. + * + * Returns: A VMA which contains the range @start to @end ready to have its VMA + * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx. + */ +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t vm_flags, + struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); -__must_check struct vm_area_struct -*vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); -__must_check struct vm_area_struct -*vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); @@ -312,12 +378,6 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); -void unlink_file_vma(struct vm_area_struct *vma); - -void vma_link_file(struct vm_area_struct *vma); - -int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); - struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 922ee51747a6..8134e1afca68 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -107,6 +107,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p) { + unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; int err; struct vm_area_struct *vma = vm_area_alloc(mm); @@ -137,7 +138,9 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); + if (pgtable_supports_soft_dirty()) + flags |= VM_SOFTDIRTY; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 798b2ed21e46..ecbac900c35f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -100,6 +100,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct page *page; unsigned long size = PAGE_SIZE; + if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr))) + return -EINVAL; + pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) @@ -167,6 +170,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, { pmd_t *pmd; unsigned long next; + int err = 0; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) @@ -180,10 +184,11 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, continue; } - if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) - return -ENOMEM; + err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask); + if (err) + break; } while (pmd++, 
phys_addr += (next - addr), addr = next, addr != end); - return 0; + return err; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, @@ -217,6 +222,7 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, { pud_t *pud; unsigned long next; + int err = 0; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) @@ -230,11 +236,11 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, continue; } - if (vmap_pmd_range(pud, addr, next, phys_addr, prot, - max_page_shift, mask)) - return -ENOMEM; + err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask); + if (err) + break; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); - return 0; + return err; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, @@ -268,6 +274,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, { p4d_t *p4d; unsigned long next; + int err = 0; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) @@ -281,11 +288,11 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, continue; } - if (vmap_pud_range(p4d, addr, next, phys_addr, prot, - max_page_shift, mask)) - return -ENOMEM; + err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask); + if (err) + break; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); - return 0; + return err; } static int vmap_range_noflush(unsigned long addr, unsigned long end, @@ -671,16 +678,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, - page_shift); + page_shift, gfp_mask); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } +static int __vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); + flush_cache_vmap(addr, end); + return err; +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map @@ -696,11 +715,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; + return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL); } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, @@ -2017,6 +2032,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long freed; unsigned long addr; unsigned int vn_id; + bool allow_block; int purged = 0; int ret; @@ -2028,7 +2044,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, /* Only reclaim behaviour flags are relevant. */ gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - might_sleep(); + allow_block = gfpflags_allow_blocking(gfp_mask); + might_sleep_if(allow_block); /* * If a VA is obtained from a global heap(if it fails here) @@ -2062,7 +2079,8 @@ retry: * This is not a fast path. Check if yielding is needed. 
This * is the only reschedule point in the vmalloc() path. */ - cond_resched(); + if (allow_block) + cond_resched(); } trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); @@ -2071,8 +2089,16 @@ retry: * If an allocation fails, the error value is * returned. Therefore trigger the overflow path. */ - if (IS_ERR_VALUE(addr)) - goto overflow; + if (IS_ERR_VALUE(addr)) { + if (allow_block) + goto overflow; + + /* + * We cannot trigger any reclaim logic because + * sleeping is not allowed, so fail the allocation. + */ + goto out_free_va; + } va->va_start = addr; va->va_end = addr + size; @@ -2122,6 +2148,7 @@ overflow: pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", size, vstart, vend); +out_free_va: kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } @@ -2672,8 +2699,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) node = numa_node_id(); - vb = kmalloc_node(sizeof(struct vmap_block), - gfp_mask & GFP_RECLAIM_MASK, node); + vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); @@ -3587,13 +3613,58 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ +/* + * Helper for vmalloc to adjust the gfp flags for certain allocations. + */ +static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large) +{ + flags |= __GFP_NOWARN; + if (large) + flags &= ~__GFP_NOFAIL; + return flags; +} + static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; + unsigned int nr_remaining = nr_pages; + unsigned int max_attempt_order = MAX_PAGE_ORDER; struct page *page; int i; + unsigned int large_order = ilog2(nr_remaining); + gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM; + + large_order = min(max_attempt_order, large_order); + + /* + * Initially, attempt to have the page allocator give us large order + * pages. Do not attempt allocating smaller than order chunks since + * __vmap_pages_range() expects physically contiguous pages of exactly + * order-long chunks. + */ + while (large_order > order && nr_remaining) { + if (nid == NUMA_NO_NODE) + page = alloc_pages_noprof(large_gfp, large_order); + else + page = alloc_pages_node_noprof(nid, large_gfp, large_order); + + if (unlikely(!page)) { + max_attempt_order = --large_order; + continue; + } + + split_page(page, large_order); + for (i = 0; i < (1U << large_order); i++) + pages[nr_allocated + i] = page + i; + + nr_allocated += 1U << large_order; + nr_remaining = nr_pages - nr_allocated; + + large_order = ilog2(nr_remaining); + large_order = min(max_attempt_order, large_order); + } /* * For order-0 pages we make use of bulk allocator, if @@ -3675,6 +3746,71 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static LLIST_HEAD(pending_vm_area_cleanup); +static void cleanup_vm_area_work(struct work_struct *work) +{ + struct vm_struct *area, *tmp; + struct llist_node *head; + + head = llist_del_all(&pending_vm_area_cleanup); + if (!head) + return; + + llist_for_each_entry_safe(area, tmp, head, llnode) { + if (!area->pages) + free_vm_area(area); + else + vfree(area->addr); + } +} + +/* + * Helper for __vmalloc_area_node() to defer cleanup + * of partially initialized vm_struct in error paths. 
+ */ +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); +static void defer_vm_area_cleanup(struct vm_struct *area) +{ + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) + schedule_work(&cleanup_vm_area); +} + +/* + * Page table allocations ignore the external GFP mask. Enforce it by + * the memalloc scope API. It is used by vmalloc internals and + * KASAN shadow population only. + * + * GFP to scope mapping: + * + * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() + * GFP_NOFS - memalloc_nofs_save() + * GFP_NOIO - memalloc_noio_save() + * + * Returns a flag cookie to pair with memalloc_restore_scope(). + */ +unsigned int +memalloc_apply_gfp_scope(gfp_t gfp_mask) +{ + unsigned int flags = 0; + + if (!gfpflags_allow_blocking(gfp_mask)) + flags = memalloc_noreclaim_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + /* 0 - no scope applied. */ + return flags; +} + +void +memalloc_restore_scope(unsigned int flags) +{ + if (flags) + memalloc_flags_restore(flags); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3691,6 +3827,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, array_size = (unsigned long)nr_small_pages * sizeof(struct page *); + /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */ + if (!gfpflags_allow_blocking(gfp_mask)) + nofail = false; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -3706,8 +3846,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3721,9 +3860,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * Please note, the __vmalloc_node_range_noprof() falls-back * to order-0 pages if high-order attempt is unsuccessful. */ - area->nr_pages = vm_area_alloc_pages((page_order ? - gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, - node, page_order, nr_small_pages, area->pages); + area->nr_pages = vm_area_alloc_pages( + vmalloc_gfp_adjust(gfp_mask, page_order), node, + page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); /* All pages of vm should be charged to same memcg, so use first one. 
*/ @@ -3757,22 +3896,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); + ret = __vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift, nested_gfp); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, @@ -3784,10 +3915,32 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - vfree(area->addr); + defer_vm_area_cleanup(area); return NULL; } +/* + * See __vmalloc_node_range() for a clear list of supported vmalloc flags. + * This GFP mask lists all flags currently passed through vmalloc: + * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm + * and BPF also use GFP_USER. Additionally, various users pass + * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP. + */ +#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\ + __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\ + GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ + GFP_USER | __GFP_NOLOCKDEP) + +static gfp_t vmalloc_fix_flags(gfp_t flags) +{ + gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED; + + flags &= GFP_VMALLOC_SUPPORTED; + WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); + return flags; +} + /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size @@ -3801,19 +3954,20 @@ fail: * @caller: caller's return address * * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all - * supported. - * Zone modifiers are not supported. From the reclaim modifiers - * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) - * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and - * __GFP_RETRY_MAYFAIL are not supported). + * allocator with @gfp_mask flags and map them into a contiguous + * virtual range with protection @prot. + * + * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT, + * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported. + * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only + * by __vmalloc(). * - * __GFP_NOWARN can be used to suppress failures messages. + * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY + * and %__GFP_RETRY_MAYFAIL are not supported. * - * Map them into contiguous kernel virtual space, using a pagetable - * protection of @prot. + * %__GFP_NOWARN can be used to suppress failure messages. * + * Cannot be called from interrupt or NMI context. * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, @@ -3946,11 +4100,8 @@ fail: * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. 
Map them into contiguous kernel virtual space. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL - * and __GFP_NOFAIL are not supported - * - * Any use of gfp flags outside of GFP_KERNEL should be consulted - * with mm people. + * Semantics of @gfp_mask (including reclaim/retry modifiers such as + * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof(). * * Return: pointer to the allocated memory or %NULL on error */ @@ -3971,6 +4122,8 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { + if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) + gfp_mask = vmalloc_fix_flags(gfp_mask); return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } @@ -4010,6 +4163,8 @@ EXPORT_SYMBOL(vmalloc_noprof); */ void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { + if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) + gfp_mask = vmalloc_fix_flags(gfp_mask); return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); @@ -5055,7 +5210,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p) unsigned int *counters; if (IS_ENABLED(CONFIG_NUMA)) - counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3d..900c74b6aa62 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -477,17 +477,6 @@ static int reclaimer_offset(struct scan_control *sc) return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } -static inline int is_page_cache_freeable(struct folio *folio) -{ - /* - * A freeable page cache folio is referenced only by the caller - * that isolated the folio, the page cache and optional filesystem - * private data at folio->private. - */ - return folio_ref_count(folio) - folio_test_private(folio) == - 1 + folio_nr_pages(folio); -} - /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent @@ -696,24 +685,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. + * + * A freeable shmem or swapcache folio is referenced only by the + * caller that isolated the folio and the page cache. */ - if (!is_page_cache_freeable(folio)) - return PAGE_KEEP; - if (!mapping) { - /* - * Some data journaling orphaned folios can have - * folio->mapping == NULL while being dirty with clean buffers. 
- */ - if (folio_test_private(folio)) { - if (try_to_free_buffers(folio)) { - folio_clear_dirty(folio); - pr_info("%s: orphaned folio\n", __func__); - return PAGE_CLEAN; - } - } + if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping) return PAGE_KEEP; - } - if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; if (!folio_clear_dirty_for_io(folio)) @@ -811,7 +788,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) @@ -1054,7 +1031,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, * When this happens, 'page' will likely just be discarded * instead of migrated. */ - .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, .nmask = &allowed_mask, @@ -1318,7 +1295,7 @@ retry: split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1334,7 +1311,7 @@ retry: } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) + if (folio_alloc_swap(folio)) goto activate_locked_split; } /* @@ -1409,21 +1386,7 @@ retry: mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { - /* - * Only kswapd can writeback filesystem folios - * to avoid risk of stack overflow. But avoid - * injecting inefficient single-folio I/O into - * flusher writeback as much as possible: only - * write folios when we've encountered many - * dirty folios, and when we've already scanned - * the rest of the LRU for clean folios and see - * the same dirty folios again (with the reclaim - * flag set). - */ - if (folio_is_file_lru(folio) && - (!current_is_kswapd() || - !folio_test_reclaim(folio) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. 
* Similar in principle to folio_deactivate() @@ -1432,7 +1395,8 @@ retry: */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); - folio_set_reclaim(folio); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); goto activate_locked; } @@ -2054,7 +2018,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); @@ -3773,7 +3737,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, pud = pud_offset(p4d, start & P4D_MASK); restart: for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { - pud_t val = READ_ONCE(pud[i]); + pud_t val = pudp_get(pud + i); next = pud_addr_end(addr, end); @@ -4780,7 +4744,7 @@ retry: reset_batch_size(walk); } - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); @@ -6127,11 +6091,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty && - sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it @@ -6872,7 +6831,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } @@ -7169,7 +7127,12 @@ restart: goto restart; } - if (!sc.nr_reclaimed) + /* + * If the reclaim was boosted, we might still be far from the + * watermark_high at this point. We need to avoid increasing the + * failure count to prevent the kswapd thread from stopping. + */ + if (!sc.nr_reclaimed && !boosted) atomic_inc(&pgdat->kswapd_failures); out: @@ -7623,9 +7586,11 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); - /* If we can't clean pages, remove dirty pages from consideration */ - if (!(node_reclaim_mode & RECLAIM_WRITE)) - delta += node_page_state(pgdat, NR_FILE_DIRTY); + /* + * Since we can't clean folios through reclaim, remove dirty file + * folios from consideration. 
+ */ + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) diff --git a/mm/vmstat.c b/mm/vmstat.c index bb09c032eecf..65de88cdf40e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -392,7 +392,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, delta >>= PAGE_SHIFT; } - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -438,7 +438,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_inc_return(*p); @@ -461,7 +461,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_inc_return(*p); @@ -494,7 +494,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_dec_return(*p); @@ -517,7 +517,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); - /* See __mod_node_page_state */ + /* See __mod_zone_page_state() */ preempt_disable_nested(); v = __this_cpu_dec_return(*p); @@ -771,25 +771,28 @@ EXPORT_SYMBOL(dec_node_page_state); /* * Fold a differential into the global counters. - * Returns the number of counters updated. + * Returns whether counters were updated. */ static int fold_diff(int *zone_diff, int *node_diff) { int i; - int changes = 0; + bool changed = false; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; + changed = true; + } } - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + changed = true; + } } - return changes; + + return changed; } /* @@ -806,16 +809,16 @@ static int fold_diff(int *zone_diff, int *node_diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. * - * The function returns the number of global counters updated. + * The function returns whether global counters were updated. 
*/ -static int refresh_cpu_vm_stats(bool do_pagesets) +static bool refresh_cpu_vm_stats(bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; - int changes = 0; + bool changed = false; for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; @@ -839,7 +842,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (do_pagesets) { cond_resched(); - changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); + if (decay_pcp_high(zone, this_cpu_ptr(pcp))) + changed = true; #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this @@ -861,13 +865,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } if (__this_cpu_dec_return(pcp->expire)) { - changes++; + changed = true; continue; } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); - changes++; + changed = true; } #endif } @@ -887,8 +891,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } - changes += fold_diff(global_zone_diff, global_node_diff); - return changes; + if (fold_diff(global_zone_diff, global_node_diff)) + changed = true; + return changed; } /* @@ -1847,9 +1852,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n node_unreclaimable: %u" - "\n start_pfn: %lu", + "\n start_pfn: %lu" + "\n reserved_highatomic: %lu" + "\n free_highatomic: %lu", atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, - zone->zone_start_pfn); + zone->zone_start_pfn, + zone->nr_reserved_highatomic, + zone->nr_free_highatomic); seq_putc(m, '\n'); } diff --git a/mm/workingset.c b/mm/workingset.c index 68a76a91111f..e9f05634747a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -749,13 +749,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); - __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); + mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; diff --git a/mm/zswap.c b/mm/zswap.c index c1af782e54ec..5d0f8b13a958 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -175,7 +175,7 @@ static struct shrinker *zswap_shrinker; * This structure contains the metadata for tracking a single compressed * page within zswap. * - * swpentry - associated swap entry, the offset indexes into the red-black tree + * swpentry - associated swap entry, the offset indexes into the xarray * length - the length in bytes of the compressed page data. Needed during * decompression. * referenced - true if the entry recently entered the zswap pool. Unset by the @@ -879,7 +879,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * acomp instance, then get those requests done simultaneously. but in this * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done - * in one thread doing zwap. + * in one thread doing zswap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. 
*/ @@ -894,7 +894,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * to the active LRU list in the case. */ if (comp_ret || !dlen || dlen >= PAGE_SIZE) { - dlen = PAGE_SIZE; if (!mem_cgroup_zswap_writeback_enabled( folio_memcg(page_folio(page)))) { comp_ret = comp_ret ? comp_ret : -EINVAL; @@ -1129,7 +1128,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * * 1. We extract the swp_entry_t to the stack, allowing * zswap_writeback_entry() to pin the swap entry and - * then validate the zwap entry against that swap entry's + * then validate the zswap entry against that swap entry's * tree using pointer value comparison. Only when that * is successful can the entry be dereferenced. * |
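As an illustration of the mm/vmalloc.c changes above (not part of the diff itself): memalloc_apply_gfp_scope()/memalloc_restore_scope() map a caller's GFP mask onto a task-level allocation scope, so page-table allocations made underneath inherit the same restrictions. A minimal caller-side sketch, assuming it sits in mm/vmalloc.c where the scope helpers and the extended vmap_pages_range_noflush() are visible; the wrapper name map_pages_scoped() is hypothetical.

/*
 * Sketch only: bracket a page-table-populating call with the GFP scope
 * helpers so nested allocations honour the caller's gfp_mask.
 */
static int map_pages_scoped(unsigned long addr, unsigned long end,
			    pgprot_t prot, struct page **pages,
			    unsigned int page_shift, gfp_t gfp_mask)
{
	/* Cookie recording which scope, if any, was entered. */
	unsigned int flags = memalloc_apply_gfp_scope(gfp_mask);
	int err;

	/* Page-table allocations below now follow gfp_mask's limits. */
	err = vmap_pages_range_noflush(addr, end, prot, pages,
				       page_shift, gfp_mask);

	memalloc_restore_scope(flags);
	return err;
}

This mirrors the pairing __vmalloc_area_node() now does around __vmap_pages_range().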

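Similarly, with the GFP_VMALLOC_SUPPORTED check added to __vmalloc() and vmalloc_huge_node_noprof(), an unsupported modifier is warned about once and masked off rather than being passed down. A small illustrative caller, not taken from the diff; it assumes __GFP_COMP remains outside the supported set.

/*
 * Illustration only: __GFP_COMP is not in GFP_VMALLOC_SUPPORTED, so the
 * new check in __vmalloc() calls vmalloc_fix_flags(), which warns once,
 * strips __GFP_COMP and continues the allocation with plain GFP_KERNEL.
 */
static void *example_unsupported_gfp(void)
{
	/* Effectively __vmalloc(64 * PAGE_SIZE, GFP_KERNEL) after fix-up. */
	return __vmalloc(64 * PAGE_SIZE, GFP_KERNEL | __GFP_COMP);
}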