Diffstat (limited to 'include')
37 files changed, 1053 insertions, 544 deletions
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 4d679d2a206b..4aeac0c3d3f0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -213,7 +213,7 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) -#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +#ifndef CONFIG_HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; @@ -287,8 +287,7 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size); +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); @@ -510,7 +509,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, false, page_size)) + if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0217c1073735..c88fd4d37d1f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -46,7 +46,6 @@ enum wb_reason { WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done @@ -204,8 +203,6 @@ struct backing_dev_info { char dev_name[64]; struct device *owner; - struct timer_list laptop_mode_wb_timer; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif diff --git a/include/linux/balloon.h b/include/linux/balloon.h new file mode 100644 index 000000000000..ca5b15150f42 --- /dev/null +++ b/include/linux/balloon.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common interface for implementing a memory balloon, including support + * for migration of pages inflated in a memory balloon. + * + * Balloon page migration makes use of the general "movable_ops page migration" + * feature. + * + * page->private is used to reference the responsible balloon device. + * That these pages have movable_ops, and which movable_ops apply, + * is derived from the page type (PageOffline()) combined with the + * PG_movable_ops flag (PageMovableOps()). + * + * Once the page type and the PG_movable_ops are set, migration code + * can initiate page isolation by invoking the + * movable_operations()->isolate_page() callback + * + * As long as page->private is set, the page is either on the balloon list + * or isolated for migration. If page->private is not set, the page is + * either still getting inflated, or was deflated to be freed by the balloon + * driver soon. Isolation is impossible in both cases. + * + * As the page isolation scanning step a compaction thread does is a lockless + * procedure (from a page standpoint), it might bring some racy situations while + * performing balloon page migration. In order to sort out these racy scenarios + * and safely perform balloon's page migration we must, always, ensure following + * these simple rules: + * + * i. Inflation/deflation must set/clear page->private under the + * balloon_pages_lock + * + * ii. 
isolation or dequeueing procedure must remove the page from balloon + * device page list under balloon_pages_lock + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com> + */ +#ifndef _LINUX_BALLOON_H +#define _LINUX_BALLOON_H +#include <linux/pagemap.h> +#include <linux/page-flags.h> +#include <linux/migrate.h> +#include <linux/gfp.h> +#include <linux/err.h> +#include <linux/list.h> + +/* + * Balloon device information descriptor. + * This struct is used to allow the common balloon page migration interface + * procedures to find the proper balloon device holding memory pages they'll + * have to cope for page migration, as well as it serves the balloon driver as + * a page book-keeper for its registered balloon devices. + */ +struct balloon_dev_info { + unsigned long isolated_pages; /* # of isolated pages for migration */ + struct list_head pages; /* Pages enqueued & handled to Host */ + int (*migratepage)(struct balloon_dev_info *, struct page *newpage, + struct page *page, enum migrate_mode mode); + bool adjust_managed_page_count; +}; + +struct page *balloon_page_alloc(void); +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page); +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages); +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages); + +static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) +{ + balloon->isolated_pages = 0; + INIT_LIST_HEAD(&balloon->pages); + balloon->migratepage = NULL; + balloon->adjust_managed_page_count = false; +} +#endif /* _LINUX_BALLOON_H */ diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h deleted file mode 100644 index 7cfe48769239..000000000000 --- a/include/linux/balloon_compaction.h +++ /dev/null @@ -1,160 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * include/linux/balloon_compaction.h - * - * Common interface definitions for making balloon pages movable by compaction. - * - * Balloon page migration makes use of the general "movable_ops page migration" - * feature. - * - * page->private is used to reference the responsible balloon device. - * That these pages have movable_ops, and which movable_ops apply, - * is derived from the page type (PageOffline()) combined with the - * PG_movable_ops flag (PageMovableOps()). - * - * As the page isolation scanning step a compaction thread does is a lockless - * procedure (from a page standpoint), it might bring some racy situations while - * performing balloon page compaction. In order to sort out these racy scenarios - * and safely perform balloon's page compaction and migration we must, always, - * ensure following these simple rules: - * - * i. Setting the PG_movable_ops flag and page->private with the following - * lock order - * +-page_lock(page); - * +--spin_lock_irq(&b_dev_info->pages_lock); - * - * ii. isolation or dequeueing procedure must remove the page from balloon - * device page list under b_dev_info->pages_lock. - * - * The functions provided by this interface are placed to help on coping with - * the aforementioned balloon page corner case, as well as to ensure the simple - * set of exposed rules are satisfied while we are dealing with balloon pages - * compaction / migration. - * - * Copyright (C) 2012, Red Hat, Inc. 
Rafael Aquini <aquini@redhat.com> - */ -#ifndef _LINUX_BALLOON_COMPACTION_H -#define _LINUX_BALLOON_COMPACTION_H -#include <linux/pagemap.h> -#include <linux/page-flags.h> -#include <linux/migrate.h> -#include <linux/gfp.h> -#include <linux/err.h> -#include <linux/fs.h> -#include <linux/list.h> - -/* - * Balloon device information descriptor. - * This struct is used to allow the common balloon compaction interface - * procedures to find the proper balloon device holding memory pages they'll - * have to cope for page compaction / migration, as well as it serves the - * balloon driver as a page book-keeper for its registered balloon devices. - */ -struct balloon_dev_info { - unsigned long isolated_pages; /* # of isolated pages for migration */ - spinlock_t pages_lock; /* Protection to pages list */ - struct list_head pages; /* Pages enqueued & handled to Host */ - int (*migratepage)(struct balloon_dev_info *, struct page *newpage, - struct page *page, enum migrate_mode mode); -}; - -extern struct page *balloon_page_alloc(void); -extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, - struct page *page); -extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); -extern size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, - struct list_head *pages); -extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, - struct list_head *pages, size_t n_req_pages); - -static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) -{ - balloon->isolated_pages = 0; - spin_lock_init(&balloon->pages_lock); - INIT_LIST_HEAD(&balloon->pages); - balloon->migratepage = NULL; -} - -#ifdef CONFIG_BALLOON_COMPACTION -extern const struct movable_operations balloon_mops; -/* - * balloon_page_device - get the b_dev_info descriptor for the balloon device - * that enqueues the given page. - */ -static inline struct balloon_dev_info *balloon_page_device(struct page *page) -{ - return (struct balloon_dev_info *)page_private(page); -} -#endif /* CONFIG_BALLOON_COMPACTION */ - -/* - * balloon_page_insert - insert a page into the balloon's page list and make - * the page->private assignment accordingly. - * @balloon : pointer to balloon device - * @page : page to be assigned as a 'balloon page' - * - * Caller must ensure the page is locked and the spin_lock protecting balloon - * pages list is held before inserting a page into the balloon device. - */ -static inline void balloon_page_insert(struct balloon_dev_info *balloon, - struct page *page) -{ - __SetPageOffline(page); - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) { - SetPageMovableOps(page); - set_page_private(page, (unsigned long)balloon); - } - list_add(&page->lru, &balloon->pages); -} - -static inline gfp_t balloon_mapping_gfp_mask(void) -{ - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - return GFP_HIGHUSER_MOVABLE; - return GFP_HIGHUSER; -} - -/* - * balloon_page_finalize - prepare a balloon page that was removed from the - * balloon list for release to the page allocator - * @page: page to be released to the page allocator - * - * Caller must ensure that the page is locked. - */ -static inline void balloon_page_finalize(struct page *page) -{ - if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) - set_page_private(page, 0); - /* PageOffline is sticky until the page is freed to the buddy. */ -} - -/* - * balloon_page_push - insert a page into a page list. - * @head : pointer to list - * @page : page to be added - * - * Caller must ensure the page is private and protect the list. 
- */ -static inline void balloon_page_push(struct list_head *pages, struct page *page) -{ - list_add(&page->lru, pages); -} - -/* - * balloon_page_pop - remove a page from a page list. - * @head : pointer to list - * @page : page to be added - * - * Caller must ensure the page is private and protect the list. - */ -static inline struct page *balloon_page_pop(struct list_head *pages) -{ - struct page *page = list_first_entry_or_null(pages, struct page, lru); - - if (!page) - return NULL; - - list_del(&page->lru); - return page; -} -#endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/include/linux/cma.h b/include/linux/cma.h index 2e6931735880..d0793eaaadaa 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -49,9 +49,14 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); +struct page *cma_alloc_frozen(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn); +struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order); +bool cma_release_frozen(struct cma *cma, const struct page *pages, + unsigned long count); + extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); @@ -66,24 +71,4 @@ static inline bool cma_skip_dt_default_reserved_mem(void) } #endif -#ifdef CONFIG_CMA -struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); -bool cma_free_folio(struct cma *cma, const struct folio *folio); -bool cma_validate_zones(struct cma *cma); -#else -static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) -{ - return NULL; -} - -static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) -{ - return false; -} -static inline bool cma_validate_zones(struct cma *cma) -{ - return false; -} -#endif - #endif diff --git a/include/linux/damon.h b/include/linux/damon.h index 3813373a9200..a4fea23da857 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -15,7 +15,7 @@ #include <linux/random.h> /* Minimal region size. Every damon_region is aligned by this. */ -#define DAMON_MIN_REGION PAGE_SIZE +#define DAMON_MIN_REGION_SZ PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) @@ -155,6 +155,8 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. + * @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio. + * @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. 
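To make the new include/linux/balloon.h interface above a little more concrete, below is a minimal inflate/deflate sketch for a hypothetical driver. Only balloon_devinfo_init(), balloon_page_alloc(), balloon_page_enqueue(), balloon_page_dequeue() and the adjust_managed_page_count field come from the header added above; the driver name, the error handling and freeing the deflated page straight back to the buddy allocator are illustrative assumptions, not part of this diff.

#include <linux/balloon.h>
#include <linux/gfp.h>

/* Hypothetical per-device balloon state. */
static struct balloon_dev_info mydrv_balloon;

static void mydrv_balloon_init(void)
{
	balloon_devinfo_init(&mydrv_balloon);
	/* Opt in to adjusting the managed page count (new field above). */
	mydrv_balloon.adjust_managed_page_count = true;
}

/* Inflate by one page: allocate it and hand it over to the balloon core. */
static int mydrv_inflate_one(void)
{
	struct page *page = balloon_page_alloc();

	if (!page)
		return -ENOMEM;

	balloon_page_enqueue(&mydrv_balloon, page);
	return 0;
}

/* Deflate by one page: take a page back from the balloon and free it. */
static void mydrv_deflate_one(void)
{
	struct page *page = balloon_page_dequeue(&mydrv_balloon);

	if (page)
		__free_page(page);
}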
@@ -166,6 +168,8 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, DAMOS_QUOTA_NODE_MEMCG_FREE_BP, + DAMOS_QUOTA_ACTIVE_MEM_BP, + DAMOS_QUOTA_INACTIVE_MEM_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -203,7 +207,7 @@ struct damos_quota_goal { u64 last_psi_total; struct { int nid; - unsigned short memcg_id; + u64 memcg_id; }; }; struct list_head list; @@ -330,6 +334,8 @@ struct damos_watermarks { * @sz_ops_filter_passed: * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * @nr_snapshots: + * Total number of DAMON snapshots that the scheme has tried. * * "Tried an action to a region" in this context means the DAMOS core logic * determined the region as eligible to apply the action. The access pattern @@ -355,6 +361,7 @@ struct damos_stat { unsigned long sz_applied; unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; + unsigned long nr_snapshots; }; /** @@ -416,7 +423,7 @@ struct damos_filter { bool matching; bool allow; union { - unsigned short memcg_id; + u64 memcg_id; struct damon_addr_range addr_range; int target_idx; struct damon_size_range sz_range; @@ -496,6 +503,7 @@ struct damos_migrate_dests { * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. + * @max_nr_snapshots: Upper limit of nr_snapshots stat. * @list: List head for siblings. * * For each @apply_interval_us, DAMON finds regions which fit in the @@ -529,9 +537,10 @@ struct damos_migrate_dests { * unsets @last_applied when each regions walking for applying the scheme is * finished. * - * After applying the &action to each region, &stat_count and &stat_sz is - * updated to reflect the number of regions and total size of regions that the - * &action is applied. + * After applying the &action to each region, &stat is updated. + * + * If &max_nr_snapshots is set as non-zero and &stat.nr_snapshots be same to or + * greater than it, the scheme is deactivated. */ struct damos { struct damos_access_pattern pattern; @@ -566,6 +575,7 @@ struct damos { struct list_head ops_filters; void *last_applied; struct damos_stat stat; + unsigned long max_nr_snapshots; struct list_head list; }; @@ -597,7 +607,6 @@ enum damon_ops_id { * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup_target: Clean up each target before deallocation. - * @cleanup: Clean up the context. * * DAMON can be extended for various address spaces and usages. For this, * users should register the low level operations for their target address @@ -630,7 +639,6 @@ enum damon_ops_id { * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup_target is called before the target will be deallocated. - * @cleanup is called from @kdamond just before its termination. */ struct damon_operations { enum damon_ops_id id; @@ -646,7 +654,6 @@ struct damon_operations { struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup_target)(struct damon_target *t); - void (*cleanup)(struct damon_ctx *context); }; /* @@ -656,7 +663,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. - * @dealloc_on_cancel: De-allocate when canceled. 
+ * @dealloc_on_cancel: If @repeat is true, de-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. @@ -749,27 +756,24 @@ struct damon_attrs { * of the monitoring. * * @attrs: Monitoring attributes for accuracy/overhead control. - * @kdamond: Kernel thread who does the monitoring. - * @kdamond_lock: Mutex for the synchronizations with @kdamond. * - * For each monitoring context, one kernel thread for the monitoring is - * created. The pointer to the thread is stored in @kdamond. + * For each monitoring context, one kernel thread for the monitoring, namely + * kdamond, is created. The pid of kdamond can be retrieved using + * damon_kdamond_pid(). * - * Once started, the monitoring thread runs until explicitly required to be - * terminated or every monitoring target is invalid. The validity of the - * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by calling damon_stop(). - * The thread sets @kdamond to NULL when it terminates. Therefore, users can - * know whether the monitoring is ongoing or terminated by reading @kdamond. - * Reads and writes to @kdamond from outside of the monitoring thread must - * be protected by @kdamond_lock. + * Once started, kdamond runs until explicitly required to be terminated or + * every monitoring target is invalid. The validity of the targets is checked + * via the &damon_operations.target_valid of @ops. The termination can also be + * explicitly requested by calling damon_stop(). To know if a kdamond is + * running, damon_is_running() can be used. * - * Note that the monitoring thread protects only @kdamond via @kdamond_lock. - * Accesses to other fields must be protected by themselves. + * While the kdamond is running, all accesses to &struct damon_ctx from a + * thread other than the kdamond should be made using safe DAMON APIs, + * including damon_call() and damos_walk(). * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. - * @min_sz_region: Minimum region size. + * @min_region_sz: Minimum region size. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. 
*/ @@ -806,13 +810,15 @@ struct damon_ctx { struct damos_walk_control *walk_control; struct mutex walk_control_lock; -/* public: */ + /* Working thread of the given DAMON context */ struct task_struct *kdamond; + /* Protects @kdamond field access */ struct mutex kdamond_lock; +/* public: */ struct damon_operations ops; unsigned long addr_unit; - unsigned long min_sz_region; + unsigned long min_region_sz; struct list_head adaptive_targets; struct list_head schemes; @@ -901,7 +907,7 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges, unsigned long min_sz_region); + unsigned int nr_ranges, unsigned long min_region_sz); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); @@ -962,13 +968,14 @@ bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); +int damon_kdamond_pid(struct damon_ctx *ctx); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_sz_region); + unsigned long min_region_sz); #endif /* CONFIG_DAMON */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b155929af5b1..6ecf6dda93e0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -407,9 +407,15 @@ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); +/* A helper for checking if gfp includes all the specified flags */ +static inline bool gfp_has_flags(gfp_t gfp, gfp_t flags) +{ + return (gfp & flags) == flags; +} + static inline bool gfp_has_io_fs(gfp_t gfp) { - return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); + return gfp_has_flags(gfp, __GFP_IO | __GFP_FS); } /* @@ -430,39 +436,29 @@ typedef unsigned int __bitwise acr_flags_t; #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA /* The below functions must be run on a range from a single zone. */ -extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, - acr_flags_t alloc_flags, gfp_t gfp_mask); -#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) - -extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask); -#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) - -#endif +int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask); +#define alloc_contig_frozen_range(...) \ + alloc_hooks(alloc_contig_frozen_range_noprof(__VA_ARGS__)) + +int alloc_contig_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask); +#define alloc_contig_range(...) \ + alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) + +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, + gfp_t gfp_mask, int nid, nodemask_t *nodemask); +#define alloc_contig_frozen_pages(...) 
\ + alloc_hooks(alloc_contig_frozen_pages_noprof(__VA_ARGS__)) + +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); +#define alloc_contig_pages(...) \ + alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) + +void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages); void free_contig_range(unsigned long pfn, unsigned long nr_pages); - -#ifdef CONFIG_CONTIG_ALLOC -static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, - int nid, nodemask_t *node) -{ - struct page *page; - - if (WARN_ON(!order || !(gfp & __GFP_COMP))) - return NULL; - - page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); - - return page ? page_folio(page) : NULL; -} -#else -static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, - int nid, nodemask_t *node) -{ - return NULL; -} #endif -/* This should be paired with folio_put() rather than free_contig_range(). */ -#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 3de43b12209e..814bb2892f99 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -309,8 +309,10 @@ enum { * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower * watermark is applied to allow access to "atomic reserves". - * The current implementation doesn't support NMI and few other strict - * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. + * The current implementation doesn't support NMI, nor contexts that disable + * preemption under PREEMPT_RT. This includes raw_spin_lock() and plain + * preempt_disable() - see "Memory allocation" in + * Documentation/core-api/real-time/differences.rst for more info. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. @@ -321,6 +323,7 @@ enum { * %GFP_NOWAIT is for kernel allocations that should not stall for direct * reclaim, start physical IO or use any filesystem callback. It is very * likely to fail to allocate memory, even for very small allocations. + * The same restrictions on calling contexts apply as for %GFP_ATOMIC. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. diff --git a/include/linux/highmem.h b/include/linux/highmem.h index abc20f9810fd..af03db851a1d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -197,15 +197,111 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) } #endif -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage +#ifndef clear_user_page +/** + * clear_user_page() - clear a page to be mapped to user space + * @addr: the address of the page + * @vaddr: the address of the user mapping + * @page: the page + * + * We condition the definition of clear_user_page() on the architecture + * not having a custom clear_user_highpage(). That's because if there + * is some special flushing needed for clear_user_highpage() then it + * is likely that clear_user_page() also needs some magic. And, since + * our only caller is the generic clear_user_highpage(), not defining + * is not much of a loss. 
+ */ +static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) +{ + clear_page(addr); +} +#endif + +/** + * clear_user_pages() - clear a page range to be mapped to user space + * @addr: start address + * @vaddr: start address of the user mapping + * @page: start page + * @npages: number of pages + * + * Assumes that the region (@addr, +@npages) has been validated + * already so this does no exception handling. + * + * If the architecture provides a clear_user_page(), use that; + * otherwise, we can safely use clear_pages(). + */ +static inline void clear_user_pages(void *addr, unsigned long vaddr, + struct page *page, unsigned int npages) +{ + +#ifdef clear_user_page + do { + clear_user_page(addr, vaddr, page); + addr += PAGE_SIZE; + vaddr += PAGE_SIZE; + page++; + } while (--npages); +#else + /* + * Prefer clear_pages() to allow for architectural optimizations + * when operating on contiguous page ranges. + */ + clear_pages(addr, npages); +#endif +} + +/** + * clear_user_highpage() - clear a page to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * + * With !CONFIG_HIGHMEM this (and the copy_user_highpage() below) will + * be plain clear_user_page() (and copy_user_page()). + */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } +#endif /* clear_user_highpage */ + +/** + * clear_user_highpages() - clear a page range to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * @npages: number of pages + * + * Assumes that all the pages in the region (@page, +@npages) are valid + * so this does no exception handling. + */ +static inline void clear_user_highpages(struct page *page, unsigned long vaddr, + unsigned int npages) +{ + +#if defined(clear_user_highpage) || defined(CONFIG_HIGHMEM) + /* + * An architecture defined clear_user_highpage() implies special + * handling is needed. + * + * So we use that or, the generic variant if CONFIG_HIGHMEM is + * enabled. + */ + do { + clear_user_highpage(page, vaddr); + vaddr += PAGE_SIZE; + page++; + } while (--npages); +#else + + /* + * Prefer clear_user_pages() to allow for architectural optimizations + * when operating on contiguous page ranges. 
+ */ + clear_user_pages(page_address(page), vaddr, page, npages); #endif +} #ifndef vma_alloc_zeroed_movable_folio /** diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e51b8ef0cebd..94a03591990c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,11 +171,11 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); +extern int movable_gigantic_pages __read_mostly; extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); -bool hugetlb_bootmem_allocated(void); extern nodemask_t hugetlb_bootmem_nodes; void hugetlb_bootmem_set_nodes(void); @@ -280,6 +280,8 @@ void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +unsigned int arch_hugetlb_cma_order(void); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) @@ -929,7 +931,7 @@ static inline bool hugepage_movable_supported(struct hstate *h) if (!hugepage_migration_supported(h)) return false; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !movable_gigantic_pages) return false; return true; } @@ -1303,11 +1305,6 @@ static inline bool hugetlbfs_pagecache_present( static inline void hugetlb_bootmem_alloc(void) { } - -static inline bool hugetlb_bootmem_allocated(void) -{ - return false; -} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, @@ -1321,9 +1318,9 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h, } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) -extern void __init hugetlb_cma_reserve(int order); +extern void __init hugetlb_cma_reserve(void); #else -static inline __init void hugetlb_cma_reserve(int order) +static inline __init void hugetlb_cma_reserve(void) { } #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eb1946a70cff..d7a9053ff4fe 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -17,8 +17,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); -extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, - bool install_pmd); +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -42,10 +42,9 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { } -static inline int collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr, bool install_pmd) +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { - return 0; } static inline void khugepaged_min_free_kbytes_update(void) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 66f98a3da8d8..7b8aad47121e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -129,13 +129,6 @@ struct maple_arange_64 { struct maple_metadata meta; }; -struct maple_alloc { - unsigned long total; - unsigned char node_count; - unsigned int request_count; - struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; -}; - struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ 
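As a quick usage illustration of the batched clearing helpers added to highmem.h above: a caller with a physically contiguous block of pages destined for a user mapping can zero the whole range in one call and let the helper choose between the per-page clear_user_highpage() path (CONFIG_HIGHMEM, or an architecture providing its own clear_user_highpage()) and the contiguous clear_user_pages()/clear_pages() path. The helper below is hypothetical, a sketch rather than anything in this diff.

#include <linux/highmem.h>
#include <linux/mm_types.h>

/*
 * Hypothetical: zero an order-@order block starting at @page that will be
 * mapped at user address @vaddr.
 */
static void zero_user_block(struct page *page, unsigned long vaddr,
			    unsigned int order)
{
	clear_user_highpages(page, vaddr, 1U << order);
}

Note that the preemption-latency caveat documented further down for clear_pages() (PROCESS_PAGES_NON_PREEMPT_BATCH) still applies to very large ranges on cooperatively scheduled configurations.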
@@ -306,7 +299,6 @@ struct maple_node { }; struct maple_range_64 mr64; struct maple_arange_64 ma64; - struct maple_alloc alloc; }; }; @@ -536,7 +528,6 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); -int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 221118b5a16e..6ec5e9ac0699 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -598,9 +598,9 @@ extern void *alloc_large_system_hash(const char *tablename, */ #ifdef CONFIG_NUMA #define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) -extern int hashdist; /* Distribute hashes across NUMA nodes? */ +extern bool hashdist; /* Distribute hashes across NUMA nodes? */ #else -#define hashdist (0) +#define hashdist (false) #endif #ifdef CONFIG_MEMTEST diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f29d4969c0c3..1baee139999f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie { #define MEM_CGROUP_ID_SHIFT 16 -struct mem_cgroup_id { +struct mem_cgroup_private_id { int id; refcount_t ref; }; @@ -191,7 +191,7 @@ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ - struct mem_cgroup_id id; + struct mem_cgroup_private_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ @@ -557,13 +557,15 @@ static inline bool mem_cgroup_disabled(void) static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, - unsigned long *low) + unsigned long *low, + unsigned long *usage) { - *min = *low = 0; + *min = *low = *usage = 0; if (mem_cgroup_disabled()) return; + *usage = page_counter_read(&memcg->memory); /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because @@ -819,23 +821,21 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } -struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id); -#ifdef CONFIG_SHRINKER_DEBUG -static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { - return memcg ? cgroup_ino(memcg->css.cgroup) : 0; + return memcg ? 
cgroup_id(memcg->css.cgroup) : 0; } -struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif +struct mem_cgroup *mem_cgroup_get_from_id(u64 id); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -919,8 +919,6 @@ static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); -unsigned long mem_cgroup_size(struct mem_cgroup *memcg); - void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -1108,9 +1106,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, - unsigned long *low) + unsigned long *low, + unsigned long *usage) { - *min = *low = 0; + *min = *low = *usage = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, @@ -1283,29 +1282,27 @@ static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { } -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg) { return 0; } -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } -#ifdef CONFIG_SHRINKER_DEBUG -static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +static inline u64 mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } -static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +static inline struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { return NULL; } -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1334,11 +1331,6 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } -static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) -{ - return 0; -} - static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { @@ -1750,7 +1742,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); +struct mem_cgroup *mem_cgroup_from_virt(void *p); static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, @@ -1822,7 +1814,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return -1; } -static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +static inline struct mem_cgroup *mem_cgroup_from_virt(void *p) { return NULL; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c5d06e27230..2dbe1c2219ee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,6 +46,7 @@ struct pt_regs; struct folio_batch; void arch_mm_preinit(void); +void mm_core_init_early(void); void mm_core_init(void); void init_mm_internals(void); @@ -1008,10 +1009,7 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, { unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); - /* mmap read lock/VMA read lock must be held. 
*/ - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); if (__vma_flag_atomic_valid(vma, bit)) set_bit((__force int)bit, bitmap); } @@ -2906,6 +2904,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm) get_mm_counter(mm, MM_SHMEMPAGES); } +static inline unsigned long get_mm_rss_sum(struct mm_struct *mm) +{ + return get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_ANONPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); +} + static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); @@ -3518,7 +3523,7 @@ static inline unsigned long get_num_physpages(void) } /* - * Using memblock node mappings, an architecture may initialise its + * FIXME: Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * @@ -3533,7 +3538,7 @@ static inline unsigned long get_num_physpages(void) * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ -void free_area_init(unsigned long *max_zone_pfn); +void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); @@ -4180,6 +4185,61 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef clear_pages +/** + * clear_pages() - clear a page range for kernel-internal use. + * @addr: start address + * @npages: number of pages + * + * Use clear_user_pages() instead when clearing a page range to be + * mapped to user space. + * + * Does absolutely no exception handling. + * + * Note that even though the clearing operation is preemptible, clear_pages() + * does not (and on architectures where it reduces to a few long-running + * instructions, might not be able to) call cond_resched() to check if + * rescheduling is required. + * + * When running under preemptible models this is not a problem. Under + * cooperatively scheduled models, however, the caller is expected to + * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH. + */ +static inline void clear_pages(void *addr, unsigned int npages) +{ + do { + clear_page(addr); + addr += PAGE_SIZE; + } while (--npages); +} +#endif + +#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH +#ifdef clear_pages +/* + * The architecture defines clear_pages(), and we assume that it is + * generally "fast". So choose a batch size large enough to allow the processor + * headroom for optimizing the operation and yet small enough that we see + * reasonable preemption latency for when this optimization is not possible + * (ex. slow microarchitectures, memory bandwidth saturation.) + * + * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should + * result in worst case preemption latency of around 3ms when clearing pages. + * + * (See comment above clear_pages() for why preemption latency is a concern + * here.) + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT) +#else /* !clear_pages */ +/* + * The architecture does not provide a clear_pages() implementation. Assume + * that clear_page() -- which clear_pages() will fallback to -- is relatively + * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH. 
+ */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH 1 +#endif +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 78950eb8926d..8731606d8d36 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -752,8 +752,18 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -#define VMA_LOCK_OFFSET 0x40000000 -#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) +/* + * While __vma_enter_locked() is working to ensure there are no read-locks held on a + * VMA (either while acquiring a VMA write lock or marking a VMA detached) we + * set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indicate to + * vma_start_read() that the reference count should be left alone. + * + * See the comment describing vm_refcnt in vm_area_struct for details as to + * which values the VMA reference count can be. + */ +#define VM_REFCNT_EXCLUDE_READERS_BIT (30) +#define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT) +#define VM_REFCNT_LIMIT (VM_REFCNT_EXCLUDE_READERS_FLAG - 1) struct vma_numab_state { /* @@ -935,10 +945,10 @@ struct vm_area_struct { /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set + * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_FLAG is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 + * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -980,7 +990,44 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif #ifdef CONFIG_PER_VMA_LOCK - /* Unstable RCU readers are allowed to read this. */ + /* + * Used to keep track of firstly, whether the VMA is attached, secondly, + * if attached, how many read locks are taken, and thirdly, if the + * VM_REFCNT_EXCLUDE_READERS_FLAG is set, whether any read locks held + * are currently in the process of being excluded. + * + * This value can be equal to: + * + * 0 - Detached. IMPORTANT: when the refcnt is zero, readers cannot + * increment it. + * + * 1 - Attached and either unlocked or write-locked. Write locks are + * identified via __is_vma_write_locked() which checks for equality of + * vma->vm_lock_seq and mm->mm_lock_seq. + * + * >1, < VM_REFCNT_EXCLUDE_READERS_FLAG - Read-locked or (unlikely) + * write-locked with other threads having temporarily incremented the + * reference count prior to determining it is write-locked and + * decrementing it again. + * + * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending + * __vma_end_exclude_readers() completion which will decrement the + * reference count to zero. IMPORTANT - at this stage no further readers + * can increment the reference count. It can only be reduced. + * + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking + * an attached VMA and has yet to invoke __vma_end_exclude_readers(), + * OR a thread is detaching a VMA and is waiting on a single spurious + * reader in order to decrement the reference count. IMPORTANT - as + * above, no further readers can increment the reference count.
+ * + * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread that is either + * write-locking or detaching a VMA is waiting on readers to + * exit. IMPORTANT - as above, no further readers can increment the + * reference count. + * + * NOTE: Unstable RCU readers are allowed to read this. + */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map vmlock_dep_map; diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a82aa80c0ba4..11bf319d78ec 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -88,4 +88,9 @@ struct tlbflush_unmap_batch { #endif }; +struct lazy_mmu_state { + u8 enable_count; + u8 pause_count; +}; + #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d53f72dba7fe..93eca48bc443 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -78,6 +78,43 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) #ifdef CONFIG_PER_VMA_LOCK +#ifdef CONFIG_LOCKDEP +#define __vma_lockdep_map(vma) (&vma->vmlock_dep_map) +#else +#define __vma_lockdep_map(vma) NULL +#endif + +/* + * VMA locks do not behave like most ordinary locks found in the kernel, so we + * cannot quite have full lockdep tracking in the way we would ideally prefer. + * + * Read locks act as shared locks which exclude an exclusive lock being + * taken. We therefore mark these accordingly on read lock acquire/release. + * + * Write locks are acquired exclusively per-VMA, but released in a shared + * fashion, that is upon vma_end_write_all(), we update the mmap's seqcount such + * that the write lock is released. + * + * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this + * is the fact that, of course, we do lockdep-track the mmap lock rwsem which + * must be held when taking a VMA write lock. + * + * We do, however, want to indicate that during either acquisition of a VMA + * write lock or detachment of a VMA that we require the lock held be exclusive, + * so we utilise lockdep to do so. + */ +#define __vma_lockdep_acquire_read(vma) \ + lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_) +#define __vma_lockdep_release_read(vma) \ + lock_release(__vma_lockdep_map(vma), _RET_IP_) +#define __vma_lockdep_acquire_exclusive(vma) \ + lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_) +#define __vma_lockdep_release_exclusive(vma) \ + lock_release(__vma_lockdep_map(vma), _RET_IP_) +/* Only meaningful if CONFIG_LOCK_STAT is defined. */ +#define __vma_lockdep_stat_mark_acquired(vma) \ + lock_acquired(__vma_lockdep_map(vma), _RET_IP_) + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -115,36 +152,81 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_key; - lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); + lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0); #endif if (reset_refcnt) refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } -static inline bool is_vma_writer_only(int refcnt) +/* + * This function determines whether the input VMA reference count describes a + * VMA which has excluded all VMA read locks.
+ * + * In the case of a detached VMA, we may incorrectly indicate that readers are + * excluded when one remains, because in that scenario we target a refcount of + * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1. + * + * However, the race window for that is very small so it is unlikely. + * + * Returns: true if readers are excluded, false otherwise. + */ +static inline bool __vma_are_readers_excluded(int refcnt) { /* - * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma - * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on - * a detached vma happens only in vma_mark_detached() and is a rare - * case, therefore most of the time there will be no unnecessary wakeup. + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ - return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && + refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; +} + +/* + * Actually decrement the VMA reference count. + * + * The function returns the reference count as it was immediately after the + * decrement took place. If it returns zero, the VMA is now detached. + */ +static inline __must_check unsigned int +__vma_refcount_put_return(struct vm_area_struct *vma) +{ + int oldcnt; + + if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) + return 0; + + return oldcnt - 1; } +/** + * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a + * read-lock being dropped. + * @vma: The VMA whose reference count we wish to decrement. + * + * If we were the last reader, wake up threads waiting to obtain an exclusive + * lock. + */ static inline void vma_refcount_put(struct vm_area_struct *vma) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */ struct mm_struct *mm = vma->vm_mm; - int oldcnt; + int newcnt; - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + __vma_lockdep_release_read(vma); + newcnt = __vma_refcount_put_return(vma); - if (is_vma_writer_only(oldcnt - 1)) - rcuwait_wake_up(&mm->vma_writer_wait); - } + /* + * __vma_start_exclude_readers() may be sleeping waiting for readers to + * drop their reference count, so wake it up if we were the last reader + * blocking it from being acquired. + * + * We may be raced by other readers temporarily incrementing the + * reference count, though the race window is very small, this might + * cause spurious wakeups. + */ + if (newcnt && __vma_are_readers_excluded(newcnt)) + rcuwait_wake_up(&mm->vma_writer_wait); } /* @@ -159,10 +241,10 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int mmap_assert_locked(vma->vm_mm); if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) + VM_REFCNT_LIMIT))) return false; - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + __vma_lockdep_acquire_read(vma); return true; } @@ -182,21 +264,31 @@ static inline void vma_end_read(struct vm_area_struct *vma) vma_refcount_put(vma); } -/* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ -static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma) { + const struct mm_struct *mm = vma->vm_mm; + + /* We must hold an exclusive write lock for this access to be valid. */ mmap_assert_write_locked(vma->vm_mm); + return mm->mm_lock_seq.sequence; +} +/* + * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap + * write lock is held. + * + * Returns true if write-locked, otherwise false. + */ +static inline bool __is_vma_write_locked(struct vm_area_struct *vma) +{ /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; - return (vma->vm_lock_seq == *mm_lock_seq); + return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma); } -int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, - int state); +int __vma_start_write(struct vm_area_struct *vma, int state); /* * Begin writing to a VMA. @@ -205,12 +297,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, */ static inline void vma_start_write(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) + if (__is_vma_write_locked(vma)) return; - __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); + __vma_start_write(vma, TASK_UNINTERRUPTIBLE); } /** @@ -229,26 +319,110 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline __must_check int vma_start_write_killable(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) + if (__is_vma_write_locked(vma)) return 0; - return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); + + return __vma_start_write(vma, TASK_KILLABLE); } +/** + * vma_assert_write_locked() - assert that @vma holds a VMA write lock. + * @vma: The VMA to assert. + */ static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); + VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); } +/** + * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write + * lock and is not detached. + * @vma: The VMA to assert. + */ static inline void vma_assert_locked(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; + unsigned int refcnt; + + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (!lock_is_held(__vma_lockdep_map(vma))) + vma_assert_write_locked(vma); + return; + } + + /* + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ + refcnt = refcount_read(&vma->vm_refcnt); + + /* + * In this case we're either read-locked, write-locked with temporary + * readers, or in the midst of excluding readers, all of which means + * we're locked. + */ + if (refcnt > 1) + return; + + /* It is a bug for the VMA to be detached here. */ + VM_WARN_ON_ONCE_VMA(!refcnt, vma); + + /* + * OK, the VMA has a reference count of 1 which means it is either + * unlocked and attached or write-locked, so assert that it is + * write-locked. + */ + vma_assert_write_locked(vma); +} + +/** + * vma_assert_stabilised() - assert that this VMA cannot be changed from + * underneath us either by having a VMA or mmap lock held. + * @vma: The VMA whose stability we wish to assess. 
+ * + * If lockdep is enabled we can precisely ensure stability via either an mmap + * lock owned by us or a specific VMA lock. + * + * With lockdep disabled we may sometimes race with other threads acquiring the + * mmap read lock simultaneously with our VMA read lock. + */ +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* + * If another thread owns an mmap lock, it may go away at any time, and + * thus is no guarantee of stability. + * + * If lockdep is enabled we can accurately determine if an mmap lock is + * held and owned by us. Otherwise we must approximate. + * + * It doesn't necessarily mean we are not stabilised however, as we may + * hold a VMA read lock (not a write lock as this would require an owned + * mmap lock). + * + * If (assuming lockdep is not enabled) we were to assert a VMA read + * lock first we may also run into issues, as other threads can hold VMA + * read locks simultaneously with us. + * + * Therefore if lockdep is not enabled we risk a false negative (i.e. no + * assert fired). If accurate checking is required, enable lockdep. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (lockdep_is_held(&vma->vm_mm->mmap_lock)) + return; + } else { + if (rwsem_is_locked(&vma->vm_mm->mmap_lock)) + return; + } - VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma, &mm_lock_seq), vma); + /* + * We're not stabilised by the mmap lock, so assert that we're + * stabilised by a VMA lock. + */ + vma_assert_locked(vma); +} + +static inline bool vma_is_attached(struct vm_area_struct *vma) +{ + return refcount_read(&vma->vm_refcnt); } /* @@ -258,12 +432,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(!vma_is_attached(vma)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(vma_is_attached(vma)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { @@ -273,7 +447,28 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) refcount_set_release(&vma->vm_refcnt, 1); } -void vma_mark_detached(struct vm_area_struct *vma); +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma); + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * The VMA still being attached (refcnt > 0) - is unlikely, because the + * vma has been already write-locked and readers can increment vm_refcnt + * only temporarily before they check vm_lock_seq, realize the vma is + * locked and drop back the vm_refcnt. That is a narrow window for + * observing a raised vm_refcnt. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ + if (likely(!__vma_refcount_put_return(vma))) + return; + + __vma_exclude_readers_for_detach(vma); +} struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -327,6 +522,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); } +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* If no VMA locks, then either mmap lock suffices to stabilise.
*/ + mmap_assert_locked(vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 14a45979cccc..ab60ffba08f5 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -47,6 +47,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); BUG(); \ } \ } while (0) +#define VM_WARN_ON_PAGE(cond, page) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_page(page, "VM_WARN_ON_PAGE(" __stringify(cond)")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ @@ -122,6 +131,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc5d6c88d2f0..3e51190a55e4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1534,14 +1534,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); -void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); + +enum kswapd_clear_hopeless_reason { + KSWAPD_CLEAR_HOPELESS_OTHER = 0, + KSWAPD_CLEAR_HOPELESS_KSWAPD, + KSWAPD_CLEAR_HOPELESS_DIRECT, + KSWAPD_CLEAR_HOPELESS_PCP, +}; + +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, + enum zone_type highest_zoneidx); +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx); +void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason); +bool kswapd_test_hopeless(pg_data_t *pgdat); + /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. 
@@ -2286,9 +2299,7 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -void sparse_init(void); #else -#define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bd38648c998d..204c92462f3c 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -157,10 +157,10 @@ static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) -static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline bool __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { - bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ @@ -181,10 +181,10 @@ static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1 #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) -static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline bool __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { - bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3e2f960e166c..6f8638c9904f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -67,4 +67,6 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn); int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, enum pb_isolate_mode mode); +bool page_is_unmovable(struct zone *zone, struct page *page, + enum pb_isolate_mode mode, unsigned long *step); #endif diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 289620d4aad3..12268a32e8be 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,15 +14,18 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr); -void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, - unsigned int nr); -void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, - unsigned int nr); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +void 
__page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr); +void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud, unsigned int nr); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -43,55 +46,59 @@ static inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, pte); + __page_table_check_pte_clear(mm, addr, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, pmd); + __page_table_check_pmd_clear(mm, addr, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, pud); + __page_table_check_pud_clear(mm, addr, pud); } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_ptes_set(mm, ptep, pte, nr); + __page_table_check_ptes_set(mm, addr, ptep, pte, nr); } static inline void page_table_check_pmds_set(struct mm_struct *mm, - pmd_t *pmdp, pmd_t pmd, unsigned int nr) + unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmds_set(mm, pmdp, pmd, nr); + __page_table_check_pmds_set(mm, addr, pmdp, pmd, nr); } static inline void page_table_check_puds_set(struct mm_struct *mm, - pud_t *pudp, pud_t pud, unsigned int nr) + unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_puds_set(mm, pudp, pud, nr); + __page_table_check_puds_set(mm, addr, pudp, pud, nr); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -114,30 +121,34 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) { } static inline void page_table_check_ptes_set(struct mm_struct *mm, - pte_t *ptep, pte_t pte, unsigned int nr) + unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { } static inline void page_table_check_pmds_set(struct mm_struct *mm, - pmd_t *pmdp, pmd_t pmd, unsigned int nr) + unsigned 
long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { } static inline void page_table_check_puds_set(struct mm_struct *mm, - pud_t *pudp, pud_t pud, unsigned int nr) + unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr) { } @@ -149,7 +160,7 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm, #endif /* CONFIG_PAGE_TABLE_CHECK */ -#define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1) -#define page_table_check_pud_set(mm, pudp, pud) page_table_check_puds_set(mm, pudp, pud, 1) +#define page_table_check_pmd_set(mm, addr, pmdp, pmd) page_table_check_pmds_set(mm, addr, pmdp, pmd, 1) +#define page_table_check_pud_set(mm, addr, pudp, pud) page_table_check_puds_set(mm, addr, pudp, pud, 1) #endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 652f287c1ef6..827dca25c0bc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -225,16 +225,156 @@ static inline int pmd_dirty(pmd_t pmd) * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit - * of the lazy mode. So the implementation must assume preemption may be enabled - * and cpu migration is possible; it must take steps to be robust against this. - * (In practice, for user PTE updates, the appropriate page table lock(s) are - * held, but for kernel PTE updates, no lock is held). Nesting is not permitted - * and the mode cannot be used in interrupt context. + * of the lazy mode. (In practice, for user PTE updates, the appropriate page + * table lock(s) are held, but for kernel PTE updates, no lock is held). + * The implementation must therefore assume preemption may be enabled upon + * entry to the mode and cpu migration is possible; it must take steps to be + * robust against this. An implementation may handle this by disabling + * preemption; as a consequence, generic code may not sleep while the lazy MMU + * mode is active. + * + * The mode is disabled in interrupt context and calls to the lazy_mmu API have + * no effect. + * + * The lazy MMU mode is enabled for a given block of code using: + * + * lazy_mmu_mode_enable(); + * <code> + * lazy_mmu_mode_disable(); + * + * Nesting is permitted: <code> may itself use an enable()/disable() pair. + * A nested call to enable() has no functional effect; however disable() causes + * any batched architectural state to be flushed regardless of nesting. After a + * call to disable(), the caller can therefore rely on all previous page table + * modifications to have taken effect, but the lazy MMU mode may still be + * enabled. + * + * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. + * This can be done using: + * + * lazy_mmu_mode_pause(); + * <code> + * lazy_mmu_mode_resume(); + * + * pause() ensures that the mode is exited regardless of the nesting level; + * resume() re-enters the mode at the same nesting level. Any call to the + * lazy_mmu_mode_* API between those two calls has no effect. In particular, + * this means that pause()/resume() pairs may nest. + * + * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is + * currently enabled. */ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -static inline void arch_enter_lazy_mmu_mode(void) {} -static inline void arch_leave_lazy_mmu_mode(void) {} -static inline void arch_flush_lazy_mmu_mode(void) {} +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * lazy_mmu_mode_enable() - Enable the lazy MMU mode.
+ * + * Enters a new lazy MMU mode section; if the mode was not already enabled, + * enables it and calls arch_enter_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_disable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ +static inline void lazy_mmu_mode_enable(void) +{ + struct lazy_mmu_state *state = &current->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) + return; + + VM_WARN_ON_ONCE(state->enable_count == U8_MAX); + + if (state->enable_count++ == 0) + arch_enter_lazy_mmu_mode(); +} + +/** + * lazy_mmu_mode_disable() - Disable the lazy MMU mode. + * + * Exits the current lazy MMU mode section. If it is the outermost section, + * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested + * section), calls arch_flush_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_enable(). + * + * Has no effect if called: + * - While paused - see lazy_mmu_mode_pause() + * - In interrupt context + */ +static inline void lazy_mmu_mode_disable(void) +{ + struct lazy_mmu_state *state = &current->lazy_mmu_state; + + if (in_interrupt() || state->pause_count > 0) + return; + + VM_WARN_ON_ONCE(state->enable_count == 0); + + if (--state->enable_count == 0) + arch_leave_lazy_mmu_mode(); + else /* Exiting a nested section */ + arch_flush_lazy_mmu_mode(); + +} + +/** + * lazy_mmu_mode_pause() - Pause the lazy MMU mode. + * + * Pauses the lazy MMU mode; if it is currently active, disables it and calls + * arch_leave_lazy_mmu_mode(). + * + * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the + * lazy_mmu_mode_* API have no effect until the matching resume() call. + * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ +static inline void lazy_mmu_mode_pause(void) +{ + struct lazy_mmu_state *state = &current->lazy_mmu_state; + + if (in_interrupt()) + return; + + VM_WARN_ON_ONCE(state->pause_count == U8_MAX); + + if (state->pause_count++ == 0 && state->enable_count > 0) + arch_leave_lazy_mmu_mode(); +} + +/** + * lazy_mmu_mode_resume() - Resume the lazy MMU mode. + * + * Resumes the lazy MMU mode; if it was active at the point where the matching + * call to lazy_mmu_mode_pause() was made, re-enables it and calls + * arch_enter_lazy_mmu_mode(). + * + * Must match a call to lazy_mmu_mode_pause().
+ * + * Has no effect if called: + * - While paused (inside another pause()/resume() pair) + * - In interrupt context + */ +static inline void lazy_mmu_mode_resume(void) +{ + struct lazy_mmu_state *state = &current->lazy_mmu_state; + + if (in_interrupt()) + return; + + VM_WARN_ON_ONCE(state->pause_count == 0); + + if (--state->pause_count == 0 && state->enable_count > 0) + arch_enter_lazy_mmu_mode(); +} +#else +static inline void lazy_mmu_mode_enable(void) {} +static inline void lazy_mmu_mode_disable(void) {} +static inline void lazy_mmu_mode_pause(void) {} +static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint @@ -289,7 +429,7 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pte, nr); + page_table_check_ptes_set(mm, addr, ptep, pte, nr); for (;;) { set_pte(ptep, pte); @@ -494,7 +634,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, address, pte); return pte; } #endif @@ -553,7 +693,7 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ - page_table_check_pte_clear(mm, pte); + page_table_check_pte_clear(mm, addr, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH @@ -648,7 +788,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, pmd); + page_table_check_pmd_clear(mm, address, pmd); return pmd; } @@ -661,7 +801,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, pud); + page_table_check_pud_clear(mm, address, pud); return pud; } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..8dc0871e5f00 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -92,6 +92,7 @@ struct anon_vma_chain { }; enum ttu_flags { + TTU_USE_SHARED_ZEROPAGE = 0x2, /* for unused pages of large folios */ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ @@ -104,75 +105,8 @@ enum ttu_flags { }; #ifdef CONFIG_MMU -static inline void get_anon_vma(struct anon_vma *anon_vma) -{ - atomic_inc(&anon_vma->refcount); -} - -void __put_anon_vma(struct anon_vma *anon_vma); - -static inline void put_anon_vma(struct anon_vma *anon_vma) -{ - if (atomic_dec_and_test(&anon_vma->refcount)) - __put_anon_vma(anon_vma); -} - -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) -{ - down_write(&anon_vma->root->rwsem); -} - -static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) -{ - return down_write_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) -{ - up_write(&anon_vma->root->rwsem); -} - -static inline void anon_vma_lock_read(struct anon_vma *anon_vma) -{ - down_read(&anon_vma->root->rwsem); -} - -static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) -{ - return down_read_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) -{ - up_read(&anon_vma->root->rwsem); -} - -/* - * anon_vma helper functions.
- */ void anon_vma_init(void); /* create anon_vma_cachep */ -int __anon_vma_prepare(struct vm_area_struct *); -void unlink_anon_vmas(struct vm_area_struct *); -int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); -int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); - -static inline int anon_vma_prepare(struct vm_area_struct *vma) -{ - if (likely(vma->anon_vma)) - return 0; - - return __anon_vma_prepare(vma); -} - -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); - unlink_anon_vmas(next); -} - -struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID static __always_inline void folio_lock_large_mapcount(struct folio *folio) @@ -1000,12 +934,8 @@ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -enum rmp_flags { - RMP_LOCKED = 1 << 0, - RMP_USE_SHARED_ZEROPAGE = 1 << 1, -}; - -void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); +void remove_migration_ptes(struct folio *src, struct folio *dst, + enum ttu_flags flags); /* * rmap_walk_control: To control rmap traversing for specific needs diff --git a/include/linux/sched.h b/include/linux/sched.h index 36ae08ca0c62..873e400aafce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1410,6 +1410,10 @@ struct task_struct { struct page_frag task_frag; +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE + struct lazy_mmu_state lazy_mmu_state; +#endif + #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif @@ -1693,6 +1697,47 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +/** + * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task. + * @tsk: The task to check. + * + * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled + * and not paused). + * + * This function only considers the state saved in task_struct; to test whether + * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be + * used instead. + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. + */ +static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk) +{ + struct lazy_mmu_state *state = &tsk->lazy_mmu_state; + + return state->enable_count > 0 && state->pause_count == 0; +} + +/** + * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode. + * + * Test whether the current context is in lazy MMU mode. This is true if both: + * 1. We are not in interrupt context + * 2. Lazy MMU mode is active for the current task + * + * This function is intended for architectures that implement the lazy MMU + * mode; it must not be called from generic code. 
+ */ +static inline bool is_lazy_mmu_mode_active(void) +{ + if (in_interrupt()) + return false; + + return __task_lazy_mmu_mode_active(current); +} +#endif + extern struct pid *cad_pid; /* diff --git a/include/linux/swap.h b/include/linux/swap.h index 38ca3df68716..62fc7499b408 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -224,13 +224,11 @@ enum { #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ -#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ @@ -453,16 +451,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio); -bool folio_free_swap(struct folio *folio); -void put_swap_folio(struct folio *folio, swp_entry_t entry); -extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t, int); -extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t entry, int nr); -extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -474,6 +463,29 @@ struct backing_dev_info; extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); +/* + * If there is an existing swap slot reference (swap entry) and the caller + * guarantees that there is no concurrent modification of it (e.g., PTL + * protecting the swap entry in page table; shmem's cmpxchg protects + * the swap entry in shmem mapping), these two helpers below can be used + * to put/dup the entries directly. + * + * All entries must be allocated by folio_alloc_swap(), and they must have + * a swap count > 1. See comments of folio_*_swap helpers for more info. + */ +int swap_dup_entry_direct(swp_entry_t entry); +void swap_put_entries_direct(swp_entry_t entry, int nr); + +/* + * folio_free_swap tries to free the swap entries pinned by a swap cache + * folio; it has to be here so it can be called by other components.
+ */ +bool folio_free_swap(struct folio *folio); + +/* Allocate / free (hibernation) exclusive entries */ +swp_entry_t swap_alloc_hibernation_slot(int type); +void swap_free_hibernation_slot(swp_entry_t entry); + static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); @@ -501,10 +513,6 @@ static inline void put_swap_device(struct swap_info_struct *si) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) -{ -} - static inline void free_swap_cache(struct folio *folio) { } @@ -514,25 +522,12 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp, int nr) -{ -} - -static inline int swap_duplicate(swp_entry_t swp) -{ - return 0; -} - -static inline int swapcache_prepare(swp_entry_t swp, int nr) +static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } -static inline void swap_free_nr(swp_entry_t entry, int nr_pages) -{ -} - -static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) +static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } @@ -551,11 +546,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio) -{ - return -EINVAL; -} - static inline bool folio_free_swap(struct folio *folio) { return false; @@ -568,17 +558,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis, return -EINVAL; } #endif /* CONFIG_SWAP */ - -static inline void free_swap_and_cache(swp_entry_t entry) -{ - free_swap_and_cache_nr(entry, 1); -} - -static inline void swap_free(swp_entry_t entry) -{ - swap_free_nr(entry, 1); -} - #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 92f80b4d69a6..22a139f82d75 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -122,13 +122,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SWPOUT, THP_SWPOUT_FALLBACK, #endif -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_BALLOON BALLOON_INFLATE, BALLOON_DEFLATE, -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION BALLOON_MIGRATE, -#endif -#endif +#endif /* CONFIG_BALLOON_MIGRATION */ +#endif /* CONFIG_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1909b945b3ea..3c9c266cf782 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -286,10 +286,8 @@ void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); -extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); -extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); @@ -395,10 +393,6 @@ static inline void __dec_node_page_state(struct page *page, #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state -#define 
inc_zone_state __inc_zone_state -#define inc_node_state __inc_node_state -#define dec_zone_state __dec_zone_state - #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f48e8ccffe81..e530112c4b3a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -328,9 +328,6 @@ struct dirty_throttle_control { bool dirty_exceeded; }; -void laptop_io_completion(struct backing_dev_info *info); -void laptop_sync_completion(void); -void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK @@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index f3ccff2d966c..478410c880b1 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -22,6 +22,7 @@ struct zs_pool_stats { }; struct zs_pool; +struct scatterlist; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); @@ -40,9 +41,12 @@ unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, - void *local_copy); + size_t mem_len, void *local_copy); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, - void *handle_mem); + size_t mem_len, void *handle_mem); +void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, + struct scatterlist *sg, size_t mem_len); +void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle); void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len); diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 852d725afea2..24fc402ab3c8 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -9,6 +9,47 @@ #include <linux/types.h> #include <linux/tracepoint.h> +TRACE_EVENT(damos_stat_after_apply_interval, + + TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, + struct damos_stat *stat), + + TP_ARGS(context_idx, scheme_idx, stat), + + TP_STRUCT__entry( + __field(unsigned int, context_idx) + __field(unsigned int, scheme_idx) + __field(unsigned long, nr_tried) + __field(unsigned long, sz_tried) + __field(unsigned long, nr_applied) + __field(unsigned long, sz_applied) + __field(unsigned long, sz_ops_filter_passed) + __field(unsigned long, qt_exceeds) + __field(unsigned long, nr_snapshots) + ), + + TP_fast_assign( + __entry->context_idx = context_idx; + __entry->scheme_idx = scheme_idx; + __entry->nr_tried = stat->nr_tried; + __entry->sz_tried = stat->sz_tried; + __entry->nr_applied = stat->nr_applied; + __entry->sz_applied = stat->sz_applied; + __entry->sz_ops_filter_passed = stat->sz_ops_filter_passed; + __entry->qt_exceeds = stat->qt_exceeds; + __entry->nr_snapshots = stat->nr_snapshots; + ), + + TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " "nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu " "qt_exceeds=%lu nr_snapshots=%lu",
__entry->context_idx, __entry->scheme_idx, + __entry->nr_tried, __entry->sz_tried, + __entry->nr_applied, __entry->sz_applied, + __entry->sz_ops_filter_passed, __entry->qt_exceeds, + __entry->nr_snapshots) +); + TRACE_EVENT(damos_esz, TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4cde53b45a85..4e41bff31888 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -37,7 +37,8 @@ EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ EM( SCAN_STORE_FAILED, "store_failed") \ EM( SCAN_COPY_MC, "copy_poisoned_page") \ - EMe(SCAN_PAGE_FILLED, "page_filled") + EM( SCAN_PAGE_FILLED, "page_filled") \ + EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") #undef EM #undef EMe diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 490958fa10de..ea58e4656abf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -40,6 +40,16 @@ {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT); +TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP); + +#define kswapd_clear_hopeless_reason_ops \ + {KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \ + {KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \ + {KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \ + {KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"} #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ @@ -535,6 +545,47 @@ TRACE_EVENT(mm_vmscan_throttled, __entry->usec_delayed, show_throttle_flags(__entry->reason)) ); + +TRACE_EVENT(mm_vmscan_kswapd_reclaim_fail, + + TP_PROTO(int nid, int failures), + + TP_ARGS(nid, failures), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, failures) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->failures = failures; + ), + + TP_printk("nid=%d failures=%d", + __entry->nid, __entry->failures) +); + +TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless, + + TP_PROTO(int nid, int reason), + + TP_ARGS(nid, reason), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, reason) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->reason = reason; + ), + + TP_printk("nid=%d reason=%s", + __entry->nid, + __print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7162d03e69a5..4d3d8c8f3a1b 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -42,7 +42,6 @@ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ - EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8fbbe613611a..6c962d866e86 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -39,6 +39,9 @@ enum { #define MPOL_MODE_FLAGS \ (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING) +/* Whether the nodemask is specified by users */ +#define MPOL_USER_NODEMASK_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) + /* Flags for get_mempolicy */ #define MPOL_F_NODE (1<<0) /* return next IL mode 
instead of node mask */ #define MPOL_F_ADDR (1<<1) /* look up vma using address */ diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 1c7fe0f4dca4..bda516064174 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -182,7 +182,7 @@ enum VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ - VM_LAPTOP_MODE=23, /* vm laptop mode */ + VM_BLOCK_DUMP=24, /* block dump mode */ VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ |
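The mmap_lock changes above introduce vma_assert_stabilised() for helpers that may be reached with either the mmap lock or a per-VMA lock held. A minimal sketch of such a caller (illustrative only, not part of the patch; the function name is hypothetical) might look like:

#include <linux/mm.h>

/* Hypothetical reader: valid under either the mmap lock or a per-VMA lock. */
static unsigned long example_read_vma_flags(struct vm_area_struct *vma)
{
	/*
	 * Do not insist on one specific lock: any lock that stabilises the
	 * VMA (an mmap lock owned by us, or a VMA read/write lock) is enough.
	 */
	vma_assert_stabilised(vma);

	return vma->vm_flags;
}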
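mmdebug.h gains VM_WARN_ON_PAGE(), a non-once counterpart to VM_WARN_ON_ONCE_PAGE(). A hedged usage sketch (the check itself is hypothetical):

#include <linux/mmdebug.h>
#include <linux/page-flags.h>

/* Hypothetical sanity check: warns and dumps the page on every violation. */
static void example_expect_head_page(struct page *page)
{
	VM_WARN_ON_PAGE(PageTail(page), page);
}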
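nodemask.h changes __nodes_and()/__nodes_andnot() to propagate the bool result of the underlying bitmap helpers, so callers can tell whether the resulting mask is empty. For example (a sketch; the wrapper function is hypothetical):

#include <linux/nodemask.h>

/* Hypothetical helper: true if the two masks share at least one node. */
static bool example_nodes_intersect(const nodemask_t *a, const nodemask_t *b)
{
	nodemask_t tmp;

	return nodes_and(tmp, *a, *b);
}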
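The pgtable.h hunk documents the lazy MMU nesting and pause/resume rules in prose only. A minimal usage sketch, assuming a hypothetical caller example_set_range() that batches PTE updates with the existing set_ptes() helper:

#include <linux/mm_types.h>
#include <linux/pgtable.h>

/* Hypothetical caller: batch a range of PTE updates under the lazy MMU mode. */
static void example_set_range(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pte, unsigned int nr)
{
	lazy_mmu_mode_enable();			/* nested enable() is permitted */

	set_ptes(mm, addr, ptep, pte, nr);	/* may be batched by the arch */

	/*
	 * Work that must observe fully up-to-date page tables (or that may
	 * sleep) can temporarily step out of the mode:
	 */
	lazy_mmu_mode_pause();
	/* ... hypothetical non-batched work ... */
	lazy_mmu_mode_resume();

	lazy_mmu_mode_disable();		/* flushes any batched updates */
}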
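Finally, swap.h replaces swap_duplicate() and friends with swap_dup_entry_direct()/swap_put_entries_direct(), which require the caller to pin the entry against concurrent modification (for example via the PTE lock). A sketch under that assumption, with a hypothetical helper name:

#include <linux/swap.h>
#include <linux/swapops.h>

/* Hypothetical: duplicate a swap entry found in a PTE while its PTL is held. */
static int example_dup_swap_pte(pte_t orig_pte)
{
	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	/* The PTL pins the entry: it cannot be freed or changed concurrently. */
	return swap_dup_entry_direct(entry);
}

Releasing nr references taken this way would go through swap_put_entries_direct(entry, nr) under the same locking guarantee.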
