path: root/include/linux
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/backing-dev-defs.h    |   3
-rw-r--r--  include/linux/balloon.h             |  77
-rw-r--r--  include/linux/balloon_compaction.h  | 160
-rw-r--r--  include/linux/cma.h                 |  27
-rw-r--r--  include/linux/damon.h               |  65
-rw-r--r--  include/linux/gfp.h                 |  60
-rw-r--r--  include/linux/gfp_types.h           |   7
-rw-r--r--  include/linux/highmem.h             |  98
-rw-r--r--  include/linux/hugetlb.h             |  15
-rw-r--r--  include/linux/khugepaged.h          |   9
-rw-r--r--  include/linux/maple_tree.h          |   9
-rw-r--r--  include/linux/memblock.h            |   4
-rw-r--r--  include/linux/memcontrol.h          |  48
-rw-r--r--  include/linux/mm.h                  |  72
-rw-r--r--  include/linux/mm_types.h            |  57
-rw-r--r--  include/linux/mm_types_task.h       |   5
-rw-r--r--  include/linux/mmap_lock.h           | 279
-rw-r--r--  include/linux/mmdebug.h             |  10
-rw-r--r--  include/linux/mmzone.h              |  19
-rw-r--r--  include/linux/nodemask.h            |   8
-rw-r--r--  include/linux/page-isolation.h      |   2
-rw-r--r--  include/linux/page_table_check.h    |  69
-rw-r--r--  include/linux/pgtable.h             | 168
-rw-r--r--  include/linux/rmap.h                |  76
-rw-r--r--  include/linux/sched.h               |  45
-rw-r--r--  include/linux/swap.h                |  71
-rw-r--r--  include/linux/vm_event_item.h       |   8
-rw-r--r--  include/linux/vmstat.h              |   6
-rw-r--r--  include/linux/writeback.h           |   4
-rw-r--r--  include/linux/zsmalloc.h            |   8
30 files changed, 952 insertions, 537 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 0217c1073735..c88fd4d37d1f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -46,7 +46,6 @@ enum wb_reason {
WB_REASON_VMSCAN,
WB_REASON_SYNC,
WB_REASON_PERIODIC,
- WB_REASON_LAPTOP_TIMER,
WB_REASON_FS_FREE_SPACE,
/*
* There is no bdi forker thread any more and works are done
@@ -204,8 +203,6 @@ struct backing_dev_info {
char dev_name[64];
struct device *owner;
- struct timer_list laptop_mode_wb_timer;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
#endif
diff --git a/include/linux/balloon.h b/include/linux/balloon.h
new file mode 100644
index 000000000000..ca5b15150f42
--- /dev/null
+++ b/include/linux/balloon.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common interface for implementing a memory balloon, including support
+ * for migration of pages inflated in a memory balloon.
+ *
+ * Balloon page migration makes use of the general "movable_ops page migration"
+ * feature.
+ *
+ * page->private is used to reference the responsible balloon device.
+ * That these pages have movable_ops, and which movable_ops apply,
+ * is derived from the page type (PageOffline()) combined with the
+ * PG_movable_ops flag (PageMovableOps()).
+ *
+ * Once the page type and the PG_movable_ops flag are set, migration code
+ * can initiate page isolation by invoking the
+ * movable_operations()->isolate_page() callback.
+ *
+ * As long as page->private is set, the page is either on the balloon list
+ * or isolated for migration. If page->private is not set, the page is
+ * either still getting inflated, or was deflated to be freed by the balloon
+ * driver soon. Isolation is impossible in both cases.
+ *
+ * Because the page isolation scan performed by a compaction thread is a
+ * lockless procedure (from a page standpoint), it can race with balloon page
+ * migration. To sort out these racy scenarios and perform balloon page
+ * migration safely, the following simple rules must always be observed:
+ *
+ * i. Inflation/deflation must set/clear page->private under the
+ * balloon_pages_lock
+ *
+ * ii. The isolation or dequeueing procedure must remove the page from the
+ *     balloon device page list under balloon_pages_lock
+ *
+ * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
+ */
+#ifndef _LINUX_BALLOON_H
+#define _LINUX_BALLOON_H
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/gfp.h>
+#include <linux/err.h>
+#include <linux/list.h>
+
+/*
+ * Balloon device information descriptor.
+ * This struct is used to allow the common balloon page migration interface
+ * procedures to find the balloon device responsible for the memory pages they
+ * handle during page migration. It also serves the balloon driver as a page
+ * book-keeper for its registered balloon devices.
+ */
+struct balloon_dev_info {
+ unsigned long isolated_pages; /* # of isolated pages for migration */
+ struct list_head pages; /* Pages enqueued & handled to Host */
+ int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
+ struct page *page, enum migrate_mode mode);
+ bool adjust_managed_page_count;
+};
+
+struct page *balloon_page_alloc(void);
+void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
+ struct page *page);
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
+size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
+ struct list_head *pages);
+size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
+ struct list_head *pages, size_t n_req_pages);
+
+static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
+{
+ balloon->isolated_pages = 0;
+ INIT_LIST_HEAD(&balloon->pages);
+ balloon->migratepage = NULL;
+ balloon->adjust_managed_page_count = false;
+}
+#endif /* _LINUX_BALLOON_H */
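For orientation, a minimal driver-side sketch of the interface declared above follows; the names my_balloon and my_migratepage, and the success return convention, are assumptions rather than part of this patch.

static struct balloon_dev_info my_balloon;

static int my_migratepage(struct balloon_dev_info *dev, struct page *newpage,
			  struct page *page, enum migrate_mode mode)
{
	/* Driver-specific transfer of the ballooned entry to @newpage. */
	return 0;	/* assumed success value */
}

static void my_balloon_init(void)
{
	balloon_devinfo_init(&my_balloon);
	my_balloon.migratepage = my_migratepage;	/* optional migration hook */
}

static int my_balloon_inflate_one(void)
{
	struct page *page = balloon_page_alloc();

	if (!page)
		return -ENOMEM;
	/*
	 * Queues the page on my_balloon.pages; locking and the page->private
	 * bookkeeping described above are handled by the core helpers.
	 */
	balloon_page_enqueue(&my_balloon, page);
	return 0;
}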
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
deleted file mode 100644
index 7cfe48769239..000000000000
--- a/include/linux/balloon_compaction.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * include/linux/balloon_compaction.h
- *
- * Common interface definitions for making balloon pages movable by compaction.
- *
- * Balloon page migration makes use of the general "movable_ops page migration"
- * feature.
- *
- * page->private is used to reference the responsible balloon device.
- * That these pages have movable_ops, and which movable_ops apply,
- * is derived from the page type (PageOffline()) combined with the
- * PG_movable_ops flag (PageMovableOps()).
- *
- * As the page isolation scanning step a compaction thread does is a lockless
- * procedure (from a page standpoint), it might bring some racy situations while
- * performing balloon page compaction. In order to sort out these racy scenarios
- * and safely perform balloon's page compaction and migration we must, always,
- * ensure following these simple rules:
- *
- * i. Setting the PG_movable_ops flag and page->private with the following
- * lock order
- * +-page_lock(page);
- * +--spin_lock_irq(&b_dev_info->pages_lock);
- *
- * ii. isolation or dequeueing procedure must remove the page from balloon
- * device page list under b_dev_info->pages_lock.
- *
- * The functions provided by this interface are placed to help on coping with
- * the aforementioned balloon page corner case, as well as to ensure the simple
- * set of exposed rules are satisfied while we are dealing with balloon pages
- * compaction / migration.
- *
- * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
- */
-#ifndef _LINUX_BALLOON_COMPACTION_H
-#define _LINUX_BALLOON_COMPACTION_H
-#include <linux/pagemap.h>
-#include <linux/page-flags.h>
-#include <linux/migrate.h>
-#include <linux/gfp.h>
-#include <linux/err.h>
-#include <linux/fs.h>
-#include <linux/list.h>
-
-/*
- * Balloon device information descriptor.
- * This struct is used to allow the common balloon compaction interface
- * procedures to find the proper balloon device holding memory pages they'll
- * have to cope for page compaction / migration, as well as it serves the
- * balloon driver as a page book-keeper for its registered balloon devices.
- */
-struct balloon_dev_info {
- unsigned long isolated_pages; /* # of isolated pages for migration */
- spinlock_t pages_lock; /* Protection to pages list */
- struct list_head pages; /* Pages enqueued & handled to Host */
- int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
- struct page *page, enum migrate_mode mode);
-};
-
-extern struct page *balloon_page_alloc(void);
-extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
- struct page *page);
-extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
-extern size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
- struct list_head *pages);
-extern size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
- struct list_head *pages, size_t n_req_pages);
-
-static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
-{
- balloon->isolated_pages = 0;
- spin_lock_init(&balloon->pages_lock);
- INIT_LIST_HEAD(&balloon->pages);
- balloon->migratepage = NULL;
-}
-
-#ifdef CONFIG_BALLOON_COMPACTION
-extern const struct movable_operations balloon_mops;
-/*
- * balloon_page_device - get the b_dev_info descriptor for the balloon device
- * that enqueues the given page.
- */
-static inline struct balloon_dev_info *balloon_page_device(struct page *page)
-{
- return (struct balloon_dev_info *)page_private(page);
-}
-#endif /* CONFIG_BALLOON_COMPACTION */
-
-/*
- * balloon_page_insert - insert a page into the balloon's page list and make
- * the page->private assignment accordingly.
- * @balloon : pointer to balloon device
- * @page : page to be assigned as a 'balloon page'
- *
- * Caller must ensure the page is locked and the spin_lock protecting balloon
- * pages list is held before inserting a page into the balloon device.
- */
-static inline void balloon_page_insert(struct balloon_dev_info *balloon,
- struct page *page)
-{
- __SetPageOffline(page);
- if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) {
- SetPageMovableOps(page);
- set_page_private(page, (unsigned long)balloon);
- }
- list_add(&page->lru, &balloon->pages);
-}
-
-static inline gfp_t balloon_mapping_gfp_mask(void)
-{
- if (IS_ENABLED(CONFIG_BALLOON_COMPACTION))
- return GFP_HIGHUSER_MOVABLE;
- return GFP_HIGHUSER;
-}
-
-/*
- * balloon_page_finalize - prepare a balloon page that was removed from the
- * balloon list for release to the page allocator
- * @page: page to be released to the page allocator
- *
- * Caller must ensure that the page is locked.
- */
-static inline void balloon_page_finalize(struct page *page)
-{
- if (IS_ENABLED(CONFIG_BALLOON_COMPACTION))
- set_page_private(page, 0);
- /* PageOffline is sticky until the page is freed to the buddy. */
-}
-
-/*
- * balloon_page_push - insert a page into a page list.
- * @head : pointer to list
- * @page : page to be added
- *
- * Caller must ensure the page is private and protect the list.
- */
-static inline void balloon_page_push(struct list_head *pages, struct page *page)
-{
- list_add(&page->lru, pages);
-}
-
-/*
- * balloon_page_pop - remove a page from a page list.
- * @head : pointer to list
- * @page : page to be added
- *
- * Caller must ensure the page is private and protect the list.
- */
-static inline struct page *balloon_page_pop(struct list_head *pages)
-{
- struct page *page = list_first_entry_or_null(pages, struct page, lru);
-
- if (!page)
- return NULL;
-
- list_del(&page->lru);
- return page;
-}
-#endif /* _LINUX_BALLOON_COMPACTION_H */
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 2e6931735880..d0793eaaadaa 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -49,9 +49,14 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
struct cma **res_cma);
extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
bool no_warn);
-extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count);
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
+struct page *cma_alloc_frozen(struct cma *cma, unsigned long count,
+ unsigned int align, bool no_warn);
+struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order);
+bool cma_release_frozen(struct cma *cma, const struct page *pages,
+ unsigned long count);
+
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end);
@@ -66,24 +71,4 @@ static inline bool cma_skip_dt_default_reserved_mem(void)
}
#endif
-#ifdef CONFIG_CMA
-struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp);
-bool cma_free_folio(struct cma *cma, const struct folio *folio);
-bool cma_validate_zones(struct cma *cma);
-#else
-static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
-{
- return NULL;
-}
-
-static inline bool cma_free_folio(struct cma *cma, const struct folio *folio)
-{
- return false;
-}
-static inline bool cma_validate_zones(struct cma *cma)
-{
- return false;
-}
-#endif
-
#endif
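A hedged sketch of how a caller might pair the new frozen-page CMA helpers declared above; the surrounding variables (cma, nr_pages, align) and the release return-value check are assumptions.

	struct page *pages;

	pages = cma_alloc_frozen(cma, nr_pages, align, /* no_warn */ false);
	if (!pages)
		return -ENOMEM;
	/* ... use the refcount-frozen page range ... */
	if (!cma_release_frozen(cma, pages, nr_pages))
		pr_warn("range was not allocated from this CMA area\n");	/* assumed semantics */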
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 3813373a9200..a4fea23da857 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -15,7 +15,7 @@
#include <linux/random.h>
/* Minimal region size. Every damon_region is aligned by this. */
-#define DAMON_MIN_REGION PAGE_SIZE
+#define DAMON_MIN_REGION_SZ PAGE_SIZE
/* Max priority score for DAMON-based operation schemes */
#define DAMOS_MAX_SCORE (99)
@@ -155,6 +155,8 @@ enum damos_action {
* @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node.
* @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup.
* @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup.
+ * @DAMOS_QUOTA_ACTIVE_MEM_BP: Active to total LRU memory ratio.
+ * @DAMOS_QUOTA_INACTIVE_MEM_BP: Inactive to total LRU memory ratio.
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -166,6 +168,8 @@ enum damos_quota_goal_metric {
DAMOS_QUOTA_NODE_MEM_FREE_BP,
DAMOS_QUOTA_NODE_MEMCG_USED_BP,
DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
+ DAMOS_QUOTA_ACTIVE_MEM_BP,
+ DAMOS_QUOTA_INACTIVE_MEM_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
@@ -203,7 +207,7 @@ struct damos_quota_goal {
u64 last_psi_total;
struct {
int nid;
- unsigned short memcg_id;
+ u64 memcg_id;
};
};
struct list_head list;
@@ -330,6 +334,8 @@ struct damos_watermarks {
* @sz_ops_filter_passed:
* Total bytes that passed ops layer-handled DAMOS filters.
* @qt_exceeds: Total number of times the quota of the scheme has exceeded.
+ * @nr_snapshots:
+ * Total number of DAMON snapshots that the scheme has tried.
*
* "Tried an action to a region" in this context means the DAMOS core logic
* determined the region as eligible to apply the action. The access pattern
@@ -355,6 +361,7 @@ struct damos_stat {
unsigned long sz_applied;
unsigned long sz_ops_filter_passed;
unsigned long qt_exceeds;
+ unsigned long nr_snapshots;
};
/**
@@ -416,7 +423,7 @@ struct damos_filter {
bool matching;
bool allow;
union {
- unsigned short memcg_id;
+ u64 memcg_id;
struct damon_addr_range addr_range;
int target_idx;
struct damon_size_range sz_range;
@@ -496,6 +503,7 @@ struct damos_migrate_dests {
* @ops_filters: ops layer handling &struct damos_filter objects list.
* @last_applied: Last @action applied ops-managing entity.
* @stat: Statistics of this scheme.
+ * @max_nr_snapshots: Upper limit of nr_snapshots stat.
* @list: List head for siblings.
*
* For each @apply_interval_us, DAMON finds regions which fit in the
@@ -529,9 +537,10 @@ struct damos_migrate_dests {
* unsets @last_applied when each regions walking for applying the scheme is
* finished.
*
- * After applying the &action to each region, &stat_count and &stat_sz is
- * updated to reflect the number of regions and total size of regions that the
- * &action is applied.
+ * After applying the &action to each region, &stat is updated.
+ *
+ * If &max_nr_snapshots is set to a non-zero value and &stat.nr_snapshots
+ * becomes equal to or greater than it, the scheme is deactivated.
*/
struct damos {
struct damos_access_pattern pattern;
@@ -566,6 +575,7 @@ struct damos {
struct list_head ops_filters;
void *last_applied;
struct damos_stat stat;
+ unsigned long max_nr_snapshots;
struct list_head list;
};
@@ -597,7 +607,6 @@ enum damon_ops_id {
* @apply_scheme: Apply a DAMON-based operation scheme.
* @target_valid: Determine if the target is valid.
* @cleanup_target: Clean up each target before deallocation.
- * @cleanup: Clean up the context.
*
* DAMON can be extended for various address spaces and usages. For this,
* users should register the low level operations for their target address
@@ -630,7 +639,6 @@ enum damon_ops_id {
* @target_valid should check whether the target is still valid for the
* monitoring.
* @cleanup_target is called before the target will be deallocated.
- * @cleanup is called from @kdamond just before its termination.
*/
struct damon_operations {
enum damon_ops_id id;
@@ -646,7 +654,6 @@ struct damon_operations {
struct damos *scheme, unsigned long *sz_filter_passed);
bool (*target_valid)(struct damon_target *t);
void (*cleanup_target)(struct damon_target *t);
- void (*cleanup)(struct damon_ctx *context);
};
/*
@@ -656,7 +663,7 @@ struct damon_operations {
* @data: Data that will be passed to @fn.
* @repeat: Repeat invocations.
* @return_code: Return code from @fn invocation.
- * @dealloc_on_cancel: De-allocate when canceled.
+ * @dealloc_on_cancel: If @repeat is true, de-allocate when canceled.
*
* Control damon_call(), which requests specific kdamond to invoke a given
* function. Refer to damon_call() for more details.
@@ -749,27 +756,24 @@ struct damon_attrs {
* of the monitoring.
*
* @attrs: Monitoring attributes for accuracy/overhead control.
- * @kdamond: Kernel thread who does the monitoring.
- * @kdamond_lock: Mutex for the synchronizations with @kdamond.
*
- * For each monitoring context, one kernel thread for the monitoring is
- * created. The pointer to the thread is stored in @kdamond.
+ * For each monitoring context, one kernel thread for the monitoring, namely
+ * kdamond, is created. The pid of kdamond can be retrieved using
+ * damon_kdamond_pid().
*
- * Once started, the monitoring thread runs until explicitly required to be
- * terminated or every monitoring target is invalid. The validity of the
- * targets is checked via the &damon_operations.target_valid of @ops. The
- * termination can also be explicitly requested by calling damon_stop().
- * The thread sets @kdamond to NULL when it terminates. Therefore, users can
- * know whether the monitoring is ongoing or terminated by reading @kdamond.
- * Reads and writes to @kdamond from outside of the monitoring thread must
- * be protected by @kdamond_lock.
+ * Once started, kdamond runs until explicitly required to be terminated or
+ * every monitoring target is invalid. The validity of the targets is checked
+ * via the &damon_operations.target_valid of @ops. The termination can also be
+ * explicitly requested by calling damon_stop(). To know if a kdamond is
+ * running, damon_is_running() can be used.
*
- * Note that the monitoring thread protects only @kdamond via @kdamond_lock.
- * Accesses to other fields must be protected by themselves.
+ * While the kdamond is running, all accesses to &struct damon_ctx from a
+ * thread other than the kdamond should be made using safe DAMON APIs,
+ * including damon_call() and damos_walk().
*
* @ops: Set of monitoring operations for given use cases.
* @addr_unit: Scale factor for core to ops address conversion.
- * @min_sz_region: Minimum region size.
+ * @min_region_sz: Minimum region size.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
@@ -806,13 +810,15 @@ struct damon_ctx {
struct damos_walk_control *walk_control;
struct mutex walk_control_lock;
-/* public: */
+ /* Working thread of the given DAMON context */
struct task_struct *kdamond;
+ /* Protects @kdamond field access */
struct mutex kdamond_lock;
+/* public: */
struct damon_operations ops;
unsigned long addr_unit;
- unsigned long min_sz_region;
+ unsigned long min_region_sz;
struct list_head adaptive_targets;
struct list_head schemes;
@@ -901,7 +907,7 @@ static inline void damon_insert_region(struct damon_region *r,
void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges, unsigned long min_sz_region);
+ unsigned int nr_ranges, unsigned long min_region_sz);
void damon_update_region_access_rate(struct damon_region *r, bool accessed,
struct damon_attrs *attrs);
@@ -962,13 +968,14 @@ bool damon_initialized(void);
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
bool damon_is_running(struct damon_ctx *ctx);
+int damon_kdamond_pid(struct damon_ctx *ctx);
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
- unsigned long min_sz_region);
+ unsigned long min_region_sz);
#endif /* CONFIG_DAMON */
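As a hedged illustration of the "safe DAMON APIs" mentioned in the updated &struct damon_ctx documentation, a thread other than the kdamond might request work in kdamond context roughly as below; the callback signature and field usage are assumptions based on the kernel-doc in this header.

static int noop_in_kdamond(void *data)
{
	/* Runs in kdamond context, so plain damon_ctx accesses are safe here. */
	return 0;
}

static void example_call(struct damon_ctx *ctx)
{
	struct damon_call_control control = {
		.fn = noop_in_kdamond,	/* assumed signature: int (*)(void *) */
		.data = ctx,
		.repeat = false,
	};

	if (damon_is_running(ctx))
		damon_call(ctx, &control);
}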
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b155929af5b1..6ecf6dda93e0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -407,9 +407,15 @@ extern gfp_t gfp_allowed_mask;
/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+/* A helper for checking if gfp includes all the specified flags */
+static inline bool gfp_has_flags(gfp_t gfp, gfp_t flags)
+{
+ return (gfp & flags) == flags;
+}
+
static inline bool gfp_has_io_fs(gfp_t gfp)
{
- return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS);
+ return gfp_has_flags(gfp, __GFP_IO | __GFP_FS);
}
/*
@@ -430,39 +436,29 @@ typedef unsigned int __bitwise acr_flags_t;
#define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA
/* The below functions must be run on a range from a single zone. */
-extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
- acr_flags_t alloc_flags, gfp_t gfp_mask);
-#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))
-
-extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask);
-#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))
-
-#endif
+int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end,
+ acr_flags_t alloc_flags, gfp_t gfp_mask);
+#define alloc_contig_frozen_range(...) \
+ alloc_hooks(alloc_contig_frozen_range_noprof(__VA_ARGS__))
+
+int alloc_contig_range_noprof(unsigned long start, unsigned long end,
+ acr_flags_t alloc_flags, gfp_t gfp_mask);
+#define alloc_contig_range(...) \
+ alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))
+
+struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages,
+ gfp_t gfp_mask, int nid, nodemask_t *nodemask);
+#define alloc_contig_frozen_pages(...) \
+ alloc_hooks(alloc_contig_frozen_pages_noprof(__VA_ARGS__))
+
+struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
+#define alloc_contig_pages(...) \
+ alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))
+
+void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages);
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
-
-#ifdef CONFIG_CONTIG_ALLOC
-static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
- int nid, nodemask_t *node)
-{
- struct page *page;
-
- if (WARN_ON(!order || !(gfp & __GFP_COMP)))
- return NULL;
-
- page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
-
- return page ? page_folio(page) : NULL;
-}
-#else
-static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
- int nid, nodemask_t *node)
-{
- return NULL;
-}
#endif
-/* This should be paired with folio_put() rather than free_contig_range(). */
-#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))
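The new gfp_has_flags() helper checks that all of the given flags are set; as an illustration (an assumed caller, not part of this patch), a test analogous to the gfp_has_io_fs() conversion above might look like:

static inline bool can_enter_fs_reclaim(gfp_t gfp)
{
	return gfp_has_flags(gfp, __GFP_DIRECT_RECLAIM | __GFP_FS);
}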
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 3de43b12209e..814bb2892f99 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -309,8 +309,10 @@ enum {
*
* %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
* watermark is applied to allow access to "atomic reserves".
- * The current implementation doesn't support NMI and few other strict
- * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
+ * The current implementation doesn't support NMI, nor contexts that disable
+ * preemption under PREEMPT_RT. This includes raw_spin_lock() and plain
+ * preempt_disable() - see "Memory allocation" in
+ * Documentation/core-api/real-time/differences.rst for more info.
*
* %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
* %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
@@ -321,6 +323,7 @@ enum {
* %GFP_NOWAIT is for kernel allocations that should not stall for direct
* reclaim, start physical IO or use any filesystem callback. It is very
* likely to fail to allocate memory, even for very small allocations.
+ * The same restrictions on calling contexts apply as for %GFP_ATOMIC.
*
* %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
* that do not require the starting of any physical IO.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index abc20f9810fd..af03db851a1d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -197,15 +197,111 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
}
#endif
-/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
#ifndef clear_user_highpage
+#ifndef clear_user_page
+/**
+ * clear_user_page() - clear a page to be mapped to user space
+ * @addr: the address of the page
+ * @vaddr: the address of the user mapping
+ * @page: the page
+ *
+ * We condition the definition of clear_user_page() on the architecture
+ * not having a custom clear_user_highpage(). That's because if there
+ * is some special flushing needed for clear_user_highpage() then it
+ * is likely that clear_user_page() also needs some magic. And, since
+ * our only caller is the generic clear_user_highpage(), not defining
+ * it is not much of a loss.
+ */
+static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page)
+{
+ clear_page(addr);
+}
+#endif
+
+/**
+ * clear_user_pages() - clear a page range to be mapped to user space
+ * @addr: start address
+ * @vaddr: start address of the user mapping
+ * @page: start page
+ * @npages: number of pages
+ *
+ * Assumes that the region (@addr, +@npages) has been validated
+ * already so this does no exception handling.
+ *
+ * If the architecture provides a clear_user_page(), use that;
+ * otherwise, we can safely use clear_pages().
+ */
+static inline void clear_user_pages(void *addr, unsigned long vaddr,
+ struct page *page, unsigned int npages)
+{
+
+#ifdef clear_user_page
+ do {
+ clear_user_page(addr, vaddr, page);
+ addr += PAGE_SIZE;
+ vaddr += PAGE_SIZE;
+ page++;
+ } while (--npages);
+#else
+ /*
+ * Prefer clear_pages() to allow for architectural optimizations
+ * when operating on contiguous page ranges.
+ */
+ clear_pages(addr, npages);
+#endif
+}
+
+/**
+ * clear_user_highpage() - clear a page to be mapped to user space
+ * @page: start page
+ * @vaddr: start address of the user mapping
+ *
+ * With !CONFIG_HIGHMEM this (and the copy_user_highpage() below) will
+ * be plain clear_user_page() (and copy_user_page()).
+ */
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
void *addr = kmap_local_page(page);
clear_user_page(addr, vaddr, page);
kunmap_local(addr);
}
+#endif /* clear_user_highpage */
+
+/**
+ * clear_user_highpages() - clear a page range to be mapped to user space
+ * @page: start page
+ * @vaddr: start address of the user mapping
+ * @npages: number of pages
+ *
+ * Assumes that all the pages in the region (@page, +@npages) are valid
+ * so this does no exception handling.
+ */
+static inline void clear_user_highpages(struct page *page, unsigned long vaddr,
+ unsigned int npages)
+{
+
+#if defined(clear_user_highpage) || defined(CONFIG_HIGHMEM)
+ /*
+ * An architecture-defined clear_user_highpage() implies special
+ * handling is needed.
+ *
+ * So we use that, or the generic variant if CONFIG_HIGHMEM is
+ * enabled.
+ */
+ do {
+ clear_user_highpage(page, vaddr);
+ vaddr += PAGE_SIZE;
+ page++;
+ } while (--npages);
+#else
+
+ /*
+ * Prefer clear_user_pages() to allow for architectural optimizations
+ * when operating on contiguous page ranges.
+ */
+ clear_user_pages(page_address(page), vaddr, page, npages);
#endif
+}
#ifndef vma_alloc_zeroed_movable_folio
/**
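A hedged caller-side sketch of the new clear_user_highpages() helper; the zero_new_user_folio() wrapper and its use of folio_page() are assumptions for illustration only.

static void zero_new_user_folio(struct folio *folio, unsigned long vaddr,
				unsigned int order)
{
	/* Clears all 2^order pages backing a user mapping at @vaddr. */
	clear_user_highpages(folio_page(folio, 0), vaddr, 1U << order);
}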
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e51b8ef0cebd..94a03591990c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -171,11 +171,11 @@ bool hugetlbfs_pagecache_present(struct hstate *h,
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
+extern int movable_gigantic_pages __read_mostly;
extern int sysctl_hugetlb_shm_group __read_mostly;
extern struct list_head huge_boot_pages[MAX_NUMNODES];
void hugetlb_bootmem_alloc(void);
-bool hugetlb_bootmem_allocated(void);
extern nodemask_t hugetlb_bootmem_nodes;
void hugetlb_bootmem_set_nodes(void);
@@ -280,6 +280,8 @@ void fixup_hugetlb_reservations(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+unsigned int arch_hugetlb_cma_order(void);
+
#else /* !CONFIG_HUGETLB_PAGE */
static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
@@ -929,7 +931,7 @@ static inline bool hugepage_movable_supported(struct hstate *h)
if (!hugepage_migration_supported(h))
return false;
- if (hstate_is_gigantic(h))
+ if (hstate_is_gigantic(h) && !movable_gigantic_pages)
return false;
return true;
}
@@ -1303,11 +1305,6 @@ static inline bool hugetlbfs_pagecache_present(
static inline void hugetlb_bootmem_alloc(void)
{
}
-
-static inline bool hugetlb_bootmem_allocated(void)
-{
- return false;
-}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
@@ -1321,9 +1318,9 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
}
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
-extern void __init hugetlb_cma_reserve(int order);
+extern void __init hugetlb_cma_reserve(void);
#else
-static inline __init void hugetlb_cma_reserve(int order)
+static inline __init void hugetlb_cma_reserve(void)
{
}
#endif
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eb1946a70cff..d7a9053ff4fe 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -17,8 +17,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
-extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
- bool install_pmd);
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd);
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
@@ -42,10 +42,9 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
}
-static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr, bool install_pmd)
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, bool install_pmd)
{
- return 0;
}
static inline void khugepaged_min_free_kbytes_update(void)
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 66f98a3da8d8..7b8aad47121e 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -129,13 +129,6 @@ struct maple_arange_64 {
struct maple_metadata meta;
};
-struct maple_alloc {
- unsigned long total;
- unsigned char node_count;
- unsigned int request_count;
- struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];
-};
-
struct maple_topiary {
struct maple_pnode *parent;
struct maple_enode *next; /* Overlaps the pivot */
@@ -306,7 +299,6 @@ struct maple_node {
};
struct maple_range_64 mr64;
struct maple_arange_64 ma64;
- struct maple_alloc alloc;
};
};
@@ -536,7 +528,6 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp);
void mas_pause(struct ma_state *mas);
void maple_tree_init(void);
void mas_destroy(struct ma_state *mas);
-int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);
void *mas_prev(struct ma_state *mas, unsigned long min);
void *mas_prev_range(struct ma_state *mas, unsigned long max);
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 221118b5a16e..6ec5e9ac0699 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -598,9 +598,9 @@ extern void *alloc_large_system_hash(const char *tablename,
*/
#ifdef CONFIG_NUMA
#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
-extern int hashdist; /* Distribute hashes across NUMA nodes? */
+extern bool hashdist; /* Distribute hashes across NUMA nodes? */
#else
-#define hashdist (0)
+#define hashdist (false)
#endif
#ifdef CONFIG_MEMTEST
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f29d4969c0c3..1baee139999f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie {
#define MEM_CGROUP_ID_SHIFT 16
-struct mem_cgroup_id {
+struct mem_cgroup_private_id {
int id;
refcount_t ref;
};
@@ -191,7 +191,7 @@ struct mem_cgroup {
struct cgroup_subsys_state css;
/* Private memcg ID. Used to ID objects that outlive the cgroup */
- struct mem_cgroup_id id;
+ struct mem_cgroup_private_id id;
/* Accounted resources */
struct page_counter memory; /* Both v1 & v2 */
@@ -557,13 +557,15 @@ static inline bool mem_cgroup_disabled(void)
static inline void mem_cgroup_protection(struct mem_cgroup *root,
struct mem_cgroup *memcg,
unsigned long *min,
- unsigned long *low)
+ unsigned long *low,
+ unsigned long *usage)
{
- *min = *low = 0;
+ *min = *low = *usage = 0;
if (mem_cgroup_disabled())
return;
+ *usage = page_counter_read(&memcg->memory);
/*
* There is no reclaim protection applied to a targeted reclaim.
* We are special casing this specific case here because
@@ -819,23 +821,21 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
int (*)(struct task_struct *, void *), void *arg);
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return 0;
return memcg->id.id;
}
-struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
+struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id);
-#ifdef CONFIG_SHRINKER_DEBUG
-static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+static inline u64 mem_cgroup_id(struct mem_cgroup *memcg)
{
- return memcg ? cgroup_ino(memcg->css.cgroup) : 0;
+ return memcg ? cgroup_id(memcg->css.cgroup) : 0;
}
-struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
-#endif
+struct mem_cgroup *mem_cgroup_get_from_id(u64 id);
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
@@ -919,8 +919,6 @@ static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
-
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -1108,9 +1106,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
static inline void mem_cgroup_protection(struct mem_cgroup *root,
struct mem_cgroup *memcg,
unsigned long *min,
- unsigned long *low)
+ unsigned long *low,
+ unsigned long *usage)
{
- *min = *low = 0;
+ *min = *low = *usage = 0;
}
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
@@ -1283,29 +1282,27 @@ static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
}
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+static inline unsigned short mem_cgroup_private_id(struct mem_cgroup *memcg)
{
return 0;
}
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+static inline struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id)
{
WARN_ON_ONCE(id);
/* XXX: This should always return root_mem_cgroup */
return NULL;
}
-#ifdef CONFIG_SHRINKER_DEBUG
-static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+static inline u64 mem_cgroup_id(struct mem_cgroup *memcg)
{
return 0;
}
-static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+static inline struct mem_cgroup *mem_cgroup_get_from_id(u64 id)
{
return NULL;
}
-#endif
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
@@ -1334,11 +1331,6 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return 0;
}
-static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
-{
- return 0;
-}
-
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
@@ -1750,7 +1742,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
+struct mem_cgroup *mem_cgroup_from_virt(void *p);
static inline void count_objcg_events(struct obj_cgroup *objcg,
enum vm_event_item idx,
@@ -1822,7 +1814,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
return -1;
}
-static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+static inline struct mem_cgroup *mem_cgroup_from_virt(void *p)
{
return NULL;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6c5d06e27230..2dbe1c2219ee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -46,6 +46,7 @@ struct pt_regs;
struct folio_batch;
void arch_mm_preinit(void);
+void mm_core_init_early(void);
void mm_core_init(void);
void init_mm_internals(void);
@@ -1008,10 +1009,7 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
{
unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
- /* mmap read lock/VMA read lock must be held. */
- if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
- vma_assert_locked(vma);
-
+ vma_assert_stabilised(vma);
if (__vma_flag_atomic_valid(vma, bit))
set_bit((__force int)bit, bitmap);
}
@@ -2906,6 +2904,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm)
get_mm_counter(mm, MM_SHMEMPAGES);
}
+static inline unsigned long get_mm_rss_sum(struct mm_struct *mm)
+{
+ return get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_ANONPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
+}
+
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
return max(mm->hiwater_rss, get_mm_rss(mm));
@@ -3518,7 +3523,7 @@ static inline unsigned long get_num_physpages(void)
}
/*
- * Using memblock node mappings, an architecture may initialise its
+ * FIXME: Using memblock node mappings, an architecture may initialise its
* zones, allocate the backing mem_map and account for memory holes in an
* architecture independent manner.
*
@@ -3533,7 +3538,7 @@ static inline unsigned long get_num_physpages(void)
* memblock_add_node(base, size, nid, MEMBLOCK_NONE)
* free_area_init(max_zone_pfns);
*/
-void free_area_init(unsigned long *max_zone_pfn);
+void arch_zone_limits_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
@@ -4180,6 +4185,61 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
+#ifndef clear_pages
+/**
+ * clear_pages() - clear a page range for kernel-internal use.
+ * @addr: start address
+ * @npages: number of pages
+ *
+ * Use clear_user_pages() instead when clearing a page range to be
+ * mapped to user space.
+ *
+ * Does absolutely no exception handling.
+ *
+ * Note that even though the clearing operation is preemptible, clear_pages()
+ * does not (and on architectures where it reduces to a few long-running
+ * instructions, might not be able to) call cond_resched() to check if
+ * rescheduling is required.
+ *
+ * When running under preemptible models this is not a problem. Under
+ * cooperatively scheduled models, however, the caller is expected to
+ * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH.
+ */
+static inline void clear_pages(void *addr, unsigned int npages)
+{
+ do {
+ clear_page(addr);
+ addr += PAGE_SIZE;
+ } while (--npages);
+}
+#endif
+
+#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH
+#ifdef clear_pages
+/*
+ * The architecture defines clear_pages(), and we assume that it is
+ * generally "fast". So choose a batch size large enough to allow the processor
+ * headroom for optimizing the operation and yet small enough that we see
+ * reasonable preemption latency for when this optimization is not possible
+ * (ex. slow microarchitectures, memory bandwidth saturation.)
+ *
+ * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should
+ * result in worst case preemption latency of around 3ms when clearing pages.
+ *
+ * (See comment above clear_pages() for why preemption latency is a concern
+ * here.)
+ */
+#define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT)
+#else /* !clear_pages */
+/*
+ * The architecture does not provide a clear_pages() implementation. Assume
+ * that clear_page() -- which clear_pages() will fallback to -- is relatively
+ * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH.
+ */
+#define PROCESS_PAGES_NON_PREEMPT_BATCH 1
+#endif
+#endif
+
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
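A hedged sketch of the batching pattern the clear_pages() comment above asks cooperative-preemption callers to follow; the helper name is an assumption, not part of this patch.

static void clear_kernel_range(void *addr, unsigned int npages)
{
	while (npages) {
		unsigned int batch = min_t(unsigned int, npages,
					   PROCESS_PAGES_NON_PREEMPT_BATCH);

		clear_pages(addr, batch);
		addr += (unsigned long)batch * PAGE_SIZE;	/* advance past the cleared batch */
		npages -= batch;
		cond_resched();	/* bound preemption latency between batches */
	}
}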
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 78950eb8926d..8731606d8d36 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -752,8 +752,18 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
}
#endif
-#define VMA_LOCK_OFFSET 0x40000000
-#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1)
+/*
+ * While __vma_enter_locked() is working to ensure there are no read locks held
+ * on a VMA (either while acquiring a VMA write lock or marking a VMA detached)
+ * we set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indicate to
+ * vma_start_read() that the reference count should be left alone.
+ *
+ * See the comment describing vm_refcnt in vm_area_struct for details of the
+ * values the VMA reference count can take.
+ */
+#define VM_REFCNT_EXCLUDE_READERS_BIT (30)
+#define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT)
+#define VM_REFCNT_LIMIT (VM_REFCNT_EXCLUDE_READERS_FLAG - 1)
struct vma_numab_state {
/*
@@ -935,10 +945,10 @@ struct vm_area_struct {
/*
* Can only be written (using WRITE_ONCE()) while holding both:
* - mmap_lock (in write mode)
- * - vm_refcnt bit at VMA_LOCK_OFFSET is set
+ * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set
* Can be read reliably while holding one of:
* - mmap_lock (in read or write mode)
- * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
+ * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set or vm_refcnt > 1
* Can be read unreliably (using READ_ONCE()) for pessimistic bailout
* while holding nothing (except RCU to keep the VMA struct allocated).
*
@@ -980,7 +990,44 @@ struct vm_area_struct {
struct vma_numab_state *numab_state; /* NUMA Balancing state */
#endif
#ifdef CONFIG_PER_VMA_LOCK
- /* Unstable RCU readers are allowed to read this. */
+ /*
+ * Used to keep track of firstly, whether the VMA is attached, secondly,
+ * if attached, how many read locks are taken, and thirdly, if the
+ * VM_REFCNT_EXCLUDE_READERS_FLAG is set, whether any read locks held
+ * are currently in the process of being excluded.
+ *
+ * This value can be equal to:
+ *
+ * 0 - Detached. IMPORTANT: when the refcnt is zero, readers cannot
+ * increment it.
+ *
+ * 1 - Attached and either unlocked or write-locked. Write locks are
+ * identified via __is_vma_write_locked() which checks for equality of
+ * vma->vm_lock_seq and mm->mm_lock_seq.
+ *
+ * >1, < VM_REFCNT_EXCLUDE_READERS_FLAG - Read-locked or (unlikely)
+ * write-locked with other threads having temporarily incremented the
+ * reference count prior to determining it is write-locked and
+ * decrementing it again.
+ *
+ * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending
+ * __vma_end_exclude_readers() completion which will decrement the
+ * reference count to zero. IMPORTANT - at this stage no further readers
+ * can increment the reference count. It can only be reduced.
+ *
+ * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking
+ * an attached VMA and has yet to invoke __vma_end_exclude_readers(),
+ * OR a thread is detaching a VMA and is waiting on a single spurious
+ * reader in order to decrement the reference count. IMPORTANT - as
+ * above, no further readers can increment the reference count.
+ *
+ * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either
+ * write-locking or detaching a VMA and is waiting on readers to
+ * exit. IMPORTANT - as above, no further readers can increment the
+ * reference count.
+ *
+ * NOTE: Unstable RCU readers are allowed to read this.
+ */
refcount_t vm_refcnt ____cacheline_aligned_in_smp;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map vmlock_dep_map;
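Purely for illustration (not part of this patch), the vm_refcnt states enumerated in the comment above can be summarised by a classifier such as:

static const char *vma_refcnt_state(unsigned int refcnt)
{
	if (refcnt == 0)
		return "detached";
	if (refcnt == 1)
		return "attached, unlocked or write-locked";
	if (refcnt < VM_REFCNT_EXCLUDE_READERS_FLAG)
		return "read-locked (possibly with transient reader increments)";
	if (refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1)
		return "readers excluded: detach pending or write-lock in progress";
	return "excluding readers: waiting for remaining readers to exit";
}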
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index a82aa80c0ba4..11bf319d78ec 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -88,4 +88,9 @@ struct tlbflush_unmap_batch {
#endif
};
+struct lazy_mmu_state {
+ u8 enable_count;
+ u8 pause_count;
+};
+
#endif /* _LINUX_MM_TYPES_TASK_H */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index d53f72dba7fe..93eca48bc443 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -78,6 +78,43 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
#ifdef CONFIG_PER_VMA_LOCK
+#ifdef CONFIG_LOCKDEP
+#define __vma_lockdep_map(vma) (&vma->vmlock_dep_map)
+#else
+#define __vma_lockdep_map(vma) NULL
+#endif
+
+/*
+ * VMA locks do not behave like most ordinary locks found in the kernel, so we
+ * cannot quite have full lockdep tracking in the way we would ideally prefer.
+ *
+ * Read locks act as shared locks which exclude an exclusive lock being
+ * taken. We therefore mark these accordingly on read lock acquire/release.
+ *
+ * Write locks are acquired exclusively per-VMA, but released in a shared
+ * fashion; that is, upon vma_end_write_all() we update the mmap's seqcount such
+ * that the write lock is released.
+ *
+ * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this
+ * is the fact that, of course, we do lockdep-track the mmap lock rwsem which
+ * must be held when taking a VMA write lock.
+ *
+ * We do, however, want to indicate that during either acquisition of a VMA
+ * write lock or detachment of a VMA that we require the lock held be exclusive,
+ * so we utilise lockdep to do so.
+ */
+#define __vma_lockdep_acquire_read(vma) \
+ lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_)
+#define __vma_lockdep_release_read(vma) \
+ lock_release(__vma_lockdep_map(vma), _RET_IP_)
+#define __vma_lockdep_acquire_exclusive(vma) \
+ lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_)
+#define __vma_lockdep_release_exclusive(vma) \
+ lock_release(__vma_lockdep_map(vma), _RET_IP_)
+/* Only meaningful if CONFIG_LOCK_STAT is defined. */
+#define __vma_lockdep_stat_mark_acquired(vma) \
+ lock_acquired(__vma_lockdep_map(vma), _RET_IP_)
+
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
seqcount_init(&mm->mm_lock_seq);
@@ -115,36 +152,81 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key lockdep_key;
- lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+ lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0);
#endif
if (reset_refcnt)
refcount_set(&vma->vm_refcnt, 0);
vma->vm_lock_seq = UINT_MAX;
}
-static inline bool is_vma_writer_only(int refcnt)
+/*
+ * This function determines whether the input VMA reference count describes a
+ * VMA which has excluded all VMA read locks.
+ *
+ * In the case of a detached VMA, we may incorrectly indicate that readers are
+ * excluded when one remains, because in that scenario we target a refcount of
+ * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of
+ * VM_REFCNT_EXCLUDE_READERS_FLAG + 1.
+ *
+ * However, the race window for that is very small so it is unlikely.
+ *
+ * Returns: true if readers are excluded, false otherwise.
+ */
+static inline bool __vma_are_readers_excluded(int refcnt)
{
/*
- * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
- * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
- * a detached vma happens only in vma_mark_detached() and is a rare
- * case, therefore most of the time there will be no unnecessary wakeup.
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
*/
- return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1;
+ return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) &&
+ refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1;
+}
+
+/*
+ * Actually decrement the VMA reference count.
+ *
+ * The function returns the reference count as it was immediately after the
+ * decrement took place. If it returns zero, the VMA is now detached.
+ */
+static inline __must_check unsigned int
+__vma_refcount_put_return(struct vm_area_struct *vma)
+{
+ int oldcnt;
+
+ if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt))
+ return 0;
+
+ return oldcnt - 1;
}
+/**
+ * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a
+ * read-lock being dropped.
+ * @vma: The VMA whose reference count we wish to decrement.
+ *
+ * If we were the last reader, wake up threads waiting to obtain an exclusive
+ * lock.
+ */
static inline void vma_refcount_put(struct vm_area_struct *vma)
{
- /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */
struct mm_struct *mm = vma->vm_mm;
- int oldcnt;
+ int newcnt;
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
- if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+ __vma_lockdep_release_read(vma);
+ newcnt = __vma_refcount_put_return(vma);
- if (is_vma_writer_only(oldcnt - 1))
- rcuwait_wake_up(&mm->vma_writer_wait);
- }
+ /*
+ * __vma_start_exclude_readers() may be sleeping waiting for readers to
+ * drop their reference count, so wake it up if we were the last reader
+ * blocking it from being acquired.
+ *
+ * We may be raced by other readers temporarily incrementing the
+ * reference count; though the race window is very small, this might
+ * cause spurious wakeups.
+ */
+ if (newcnt && __vma_are_readers_excluded(newcnt))
+ rcuwait_wake_up(&mm->vma_writer_wait);
}
/*
@@ -159,10 +241,10 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int
mmap_assert_locked(vma->vm_mm);
if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT)))
+ VM_REFCNT_LIMIT)))
return false;
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+ __vma_lockdep_acquire_read(vma);
return true;
}
@@ -182,21 +264,31 @@ static inline void vma_end_read(struct vm_area_struct *vma)
vma_refcount_put(vma);
}
-/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
+static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma)
{
+ const struct mm_struct *mm = vma->vm_mm;
+
+ /* We must hold an exclusive write lock for this access to be valid. */
mmap_assert_write_locked(vma->vm_mm);
+ return mm->mm_lock_seq.sequence;
+}
+/*
+ * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap
+ * write lock is held.
+ *
+ * Returns true if write-locked, otherwise false.
+ */
+static inline bool __is_vma_write_locked(struct vm_area_struct *vma)
+{
/*
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
- *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
- return (vma->vm_lock_seq == *mm_lock_seq);
+ return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma);
}
-int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
- int state);
+int __vma_start_write(struct vm_area_struct *vma, int state);
/*
* Begin writing to a VMA.
@@ -205,12 +297,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
+ if (__is_vma_write_locked(vma))
return;
- __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
+ __vma_start_write(vma, TASK_UNINTERRUPTIBLE);
}
/**
@@ -229,26 +319,110 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline __must_check
int vma_start_write_killable(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
+ if (__is_vma_write_locked(vma))
return 0;
- return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
+
+ return __vma_start_write(vma, TASK_KILLABLE);
}
+/**
+ * vma_assert_write_locked() - assert that @vma holds a VMA write lock.
+ * @vma: The VMA to assert.
+ */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+ VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma);
}
+/**
+ * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write
+ * lock and is not detached.
+ * @vma: The VMA to assert.
+ */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
+ unsigned int refcnt;
+
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ if (!lock_is_held(__vma_lockdep_map(vma)))
+ vma_assert_write_locked(vma);
+ return;
+ }
+
+ /*
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
+ */
+ refcnt = refcount_read(&vma->vm_refcnt);
+
+ /*
+ * In this case we're either read-locked, write-locked with temporary
+ * readers, or in the midst of excluding readers, all of which means
+ * we're locked.
+ */
+ if (refcnt > 1)
+ return;
+
+ /* It is a bug for the VMA to be detached here. */
+ VM_WARN_ON_ONCE_VMA(!refcnt, vma);
+
+ /*
+ * OK, the VMA has a reference count of 1 which means it is either
+ * unlocked and attached or write-locked, so assert that it is
+ * write-locked.
+ */
+ vma_assert_write_locked(vma);
+}
+
+/**
+ * vma_assert_stabilised() - assert that this VMA cannot be changed from
+ * underneath us either by having a VMA or mmap lock held.
+ * @vma: The VMA whose stability we wish to assess.
+ *
+ * If lockdep is enabled we can precisely verify stability via either an mmap
+ * lock owned by us or a specific VMA lock.
+ *
+ * With lockdep disabled we may sometimes race with other threads acquiring the
+ * mmap read lock simultaneously with our VMA read lock.
+ */
+static inline void vma_assert_stabilised(struct vm_area_struct *vma)
+{
+ /*
+ * If another thread owns an mmap lock, it may go away at any time, and
+ * thus is no guarantee of stability.
+ *
+ * If lockdep is enabled we can accurately determine if an mmap lock is
+ * held and owned by us. Otherwise we must approximate.
+ *
+ * It doesn't necessarily mean we are not stabilised however, as we may
+ * hold a VMA read lock (not a write lock as this would require an owned
+ * mmap lock).
+ *
+ * If (assuming lockdep is not enabled) we were to assert a VMA read
+ * lock first we may also run into issues, as other threads can hold VMA
+ * read locks simultaneously with us.
+ *
+ * Therefore if lockdep is not enabled we risk a false negative (i.e. no
+ * assert fired). If accurate checking is required, enable lockdep.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ if (lockdep_is_held(&vma->vm_mm->mmap_lock))
+ return;
+ } else {
+ if (rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ return;
+ }
- VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
- !__is_vma_write_locked(vma, &mm_lock_seq), vma);
+ /*
+ * We're not stabilised by the mmap lock, so assert that we're
+ * stabilised by a VMA lock.
+ */
+ vma_assert_locked(vma);
+}
+
+static inline bool vma_is_attached(struct vm_area_struct *vma)
+{
+ return refcount_read(&vma->vm_refcnt);
}
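
For illustration only (this helper is hypothetical and not part of the patch): a read-only caller can document its locking expectations with the assertions added above.

	static bool vma_usable_for_read(struct vm_area_struct *vma)
	{
		/* Caller must hold the mmap lock or this VMA's lock. */
		vma_assert_stabilised(vma);

		return vma_is_attached(vma);
	}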
/*
@@ -258,12 +432,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
*/
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+ WARN_ON_ONCE(!vma_is_attached(vma));
}
static inline void vma_assert_detached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+ WARN_ON_ONCE(vma_is_attached(vma));
}
static inline void vma_mark_attached(struct vm_area_struct *vma)
@@ -273,7 +447,28 @@ static inline void vma_mark_attached(struct vm_area_struct *vma)
refcount_set_release(&vma->vm_refcnt, 1);
}
-void vma_mark_detached(struct vm_area_struct *vma);
+void __vma_exclude_readers_for_detach(struct vm_area_struct *vma);
+
+static inline void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+	 * The VMA still being attached here (refcnt > 0) is unlikely, because
+	 * the VMA has already been write-locked and readers can only increment
+	 * vm_refcnt temporarily before they check vm_lock_seq, realize the VMA
+	 * is locked and drop the vm_refcnt again. That is a narrow window for
+	 * observing a raised vm_refcnt.
+ *
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
+ */
+ if (likely(!__vma_refcount_put_return(vma)))
+ return;
+
+ __vma_exclude_readers_for_detach(vma);
+}
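
A minimal sketch of the call order the assertions above require (illustrative only; the real detach paths live elsewhere in mm and are not shown here):

	vma_start_write(vma);	/* write-lock first: excludes new readers */
	vma_mark_detached(vma);	/* drops the attach reference; falls back to
				 * __vma_exclude_readers_for_detach() if a
				 * transient reader still holds a count */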
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address);
@@ -327,6 +522,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
mmap_assert_locked(vma->vm_mm);
}
+static inline void vma_assert_stabilised(struct vm_area_struct *vma)
+{
+	/* With no VMA locks, the mmap lock (in either mode) suffices to stabilise. */
+ mmap_assert_locked(vma->vm_mm);
+}
+
#endif /* CONFIG_PER_VMA_LOCK */
static inline void mmap_write_lock(struct mm_struct *mm)
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 14a45979cccc..ab60ffba08f5 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -47,6 +47,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
BUG(); \
} \
} while (0)
+#define VM_WARN_ON_PAGE(cond, page) ({ \
+ int __ret_warn = !!(cond); \
+ \
+ if (unlikely(__ret_warn)) { \
+ dump_page(page, "VM_WARN_ON_PAGE(" __stringify(cond)")");\
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn); \
+})
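
Like VM_WARN_ON_ONCE_PAGE(), the new macro evaluates to the (unlikely-annotated) condition, so a caller can dump the page and bail out in one step; a hypothetical example:

	if (VM_WARN_ON_PAGE(!page_count(page), page))
		return;		/* refcount unexpectedly zero: dump the page and give up */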
#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \
static bool __section(".data..once") __warned; \
int __ret_warn_once = !!(cond); \
@@ -122,6 +131,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc5d6c88d2f0..3e51190a55e4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1534,14 +1534,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
- enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags,
long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
+
+enum kswapd_clear_hopeless_reason {
+ KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+ KSWAPD_CLEAR_HOPELESS_KSWAPD,
+ KSWAPD_CLEAR_HOPELESS_DIRECT,
+ KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+ enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+ unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
/*
* Memory initialization context, use to differentiate memory added by
* the platform statically or via memory hotplug interface.
@@ -2286,9 +2299,7 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
#define pfn_to_nid(pfn) (0)
#endif
-void sparse_init(void);
#else
-#define sparse_init() do {} while (0)
#define sparse_index_init(_sec, _nid) do {} while (0)
#define sparse_vmemmap_init_nid_early(_nid) do {} while (0)
#define sparse_vmemmap_init_nid_late(_nid) do {} while (0)
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index bd38648c998d..204c92462f3c 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -157,10 +157,10 @@ static __always_inline bool __node_test_and_set(int node, nodemask_t *addr)
#define nodes_and(dst, src1, src2) \
__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
-static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
+static __always_inline bool __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
const nodemask_t *src2p, unsigned int nbits)
{
- bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+ return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
}
#define nodes_or(dst, src1, src2) \
@@ -181,10 +181,10 @@ static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1
#define nodes_andnot(dst, src1, src2) \
__nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
-static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
+static __always_inline bool __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
const nodemask_t *src2p, unsigned int nbits)
{
- bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+ return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}
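
With the bool return, a caller can build the intersection and test it for emptiness in one step; a hedged sketch ("requested" is a placeholder mask, not part of this patch):

	nodemask_t allowed;

	/* nodes_and() now returns true iff the resulting mask is non-empty. */
	if (!nodes_and(allowed, node_possible_map, requested))
		return -EINVAL;	/* the two masks do not overlap */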
#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 3e2f960e166c..6f8638c9904f 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -67,4 +67,6 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn);
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
enum pb_isolate_mode mode);
+bool page_is_unmovable(struct zone *zone, struct page *page,
+ enum pb_isolate_mode mode, unsigned long *step);
#endif
diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
index 289620d4aad3..12268a32e8be 100644
--- a/include/linux/page_table_check.h
+++ b/include/linux/page_table_check.h
@@ -14,15 +14,18 @@ extern struct static_key_true page_table_check_disabled;
extern struct page_ext_operations page_table_check_ops;
void __page_table_check_zero(struct page *page, unsigned int order);
-void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
-void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
-void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud);
-void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
- unsigned int nr);
-void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
- unsigned int nr);
-void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
- unsigned int nr);
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t pte);
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t pmd);
+void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
+ pud_t pud);
+void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr);
+void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd, unsigned int nr);
+void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud, unsigned int nr);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
pmd_t pmd);
@@ -43,55 +46,59 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
__page_table_check_zero(page, order);
}
-static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t pte)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pte_clear(mm, pte);
+ __page_table_check_pte_clear(mm, addr, pte);
}
-static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t pmd)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pmd_clear(mm, pmd);
+ __page_table_check_pmd_clear(mm, addr, pmd);
}
-static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
+static inline void page_table_check_pud_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t pud)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pud_clear(mm, pud);
+ __page_table_check_pud_clear(mm, addr, pud);
}
static inline void page_table_check_ptes_set(struct mm_struct *mm,
- pte_t *ptep, pte_t pte, unsigned int nr)
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned int nr)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_ptes_set(mm, ptep, pte, nr);
+ __page_table_check_ptes_set(mm, addr, ptep, pte, nr);
}
static inline void page_table_check_pmds_set(struct mm_struct *mm,
- pmd_t *pmdp, pmd_t pmd, unsigned int nr)
+ unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pmds_set(mm, pmdp, pmd, nr);
+ __page_table_check_pmds_set(mm, addr, pmdp, pmd, nr);
}
static inline void page_table_check_puds_set(struct mm_struct *mm,
- pud_t *pudp, pud_t pud, unsigned int nr)
+ unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_puds_set(mm, pudp, pud, nr);
+ __page_table_check_puds_set(mm, addr, pudp, pud, nr);
}
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
@@ -114,30 +121,34 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
{
}
-static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t pte)
{
}
-static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t pmd)
{
}
-static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
+static inline void page_table_check_pud_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t pud)
{
}
static inline void page_table_check_ptes_set(struct mm_struct *mm,
- pte_t *ptep, pte_t pte, unsigned int nr)
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned int nr)
{
}
static inline void page_table_check_pmds_set(struct mm_struct *mm,
- pmd_t *pmdp, pmd_t pmd, unsigned int nr)
+ unsigned long addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr)
{
}
static inline void page_table_check_puds_set(struct mm_struct *mm,
- pud_t *pudp, pud_t pud, unsigned int nr)
+ unsigned long addr, pud_t *pudp, pud_t pud, unsigned int nr)
{
}
@@ -149,7 +160,7 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
#endif /* CONFIG_PAGE_TABLE_CHECK */
-#define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1)
-#define page_table_check_pud_set(mm, pudp, pud) page_table_check_puds_set(mm, pudp, pud, 1)
+#define page_table_check_pmd_set(mm, addr, pmdp, pmd) page_table_check_pmds_set(mm, addr, pmdp, pmd, 1)
+#define page_table_check_pud_set(mm, addr, pudp, pud) page_table_check_puds_set(mm, addr, pudp, pud, 1)
#endif /* __LINUX_PAGE_TABLE_CHECK_H */
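
Roughly how an architecture helper would now thread the mapped address through to the checker (a sketch, not copied from any particular architecture):

	static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
				      pmd_t *pmdp, pmd_t pmd)
	{
		page_table_check_pmd_set(mm, addr, pmdp, pmd);
		set_pmd(pmdp, pmd);	/* arch-specific store */
	}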
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 652f287c1ef6..827dca25c0bc 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -225,16 +225,156 @@ static inline int pmd_dirty(pmd_t pmd)
* up to date.
*
* In the general case, no lock is guaranteed to be held between entry and exit
- * of the lazy mode. So the implementation must assume preemption may be enabled
- * and cpu migration is possible; it must take steps to be robust against this.
- * (In practice, for user PTE updates, the appropriate page table lock(s) are
- * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
- * and the mode cannot be used in interrupt context.
+ * of the lazy mode. (In practice, for user PTE updates, the appropriate page
+ * table lock(s) are held, but for kernel PTE updates, no lock is held).
+ * The implementation must therefore assume preemption may be enabled upon
+ * entry to the mode and cpu migration is possible; it must take steps to be
+ * robust against this. An implementation may handle this by disabling
+ * preemption; as a consequence, generic code may not sleep while the lazy MMU
+ * mode is active.
+ *
+ * The mode is disabled in interrupt context, where calls to the lazy_mmu API
+ * have no effect.
+ *
+ * The lazy MMU mode is enabled for a given block of code using:
+ *
+ * lazy_mmu_mode_enable();
+ * <code>
+ * lazy_mmu_mode_disable();
+ *
+ * Nesting is permitted: <code> may itself use an enable()/disable() pair.
+ * A nested call to enable() has no functional effect; however, disable() causes
+ * any batched architectural state to be flushed regardless of nesting. After a
+ * call to disable(), the caller can therefore rely on all previous page table
+ * modifications to have taken effect, but the lazy MMU mode may still be
+ * enabled.
+ *
+ * In certain cases, it may be desirable to temporarily pause the lazy MMU mode.
+ * This can be done using:
+ *
+ * lazy_mmu_mode_pause();
+ * <code>
+ * lazy_mmu_mode_resume();
+ *
+ * pause() ensures that the mode is exited regardless of the nesting level;
+ * resume() re-enters the mode at the same nesting level. Any call to the
+ * lazy_mmu_mode_* API between those two calls has no effect. In particular,
+ * this means that pause()/resume() pairs may nest.
+ *
+ * is_lazy_mmu_mode_active() can be used to check whether the lazy MMU mode is
+ * currently active (i.e. enabled and not paused).
*/
-#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-static inline void arch_enter_lazy_mmu_mode(void) {}
-static inline void arch_leave_lazy_mmu_mode(void) {}
-static inline void arch_flush_lazy_mmu_mode(void) {}
+#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE
+/**
+ * lazy_mmu_mode_enable() - Enable the lazy MMU mode.
+ *
+ * Enters a new lazy MMU mode section; if the mode was not already enabled,
+ * enables it and calls arch_enter_lazy_mmu_mode().
+ *
+ * Must be paired with a call to lazy_mmu_mode_disable().
+ *
+ * Has no effect if called:
+ * - While paused - see lazy_mmu_mode_pause()
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_enable(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt() || state->pause_count > 0)
+ return;
+
+ VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
+
+ if (state->enable_count++ == 0)
+ arch_enter_lazy_mmu_mode();
+}
+
+/**
+ * lazy_mmu_mode_disable() - Disable the lazy MMU mode.
+ *
+ * Exits the current lazy MMU mode section. If it is the outermost section,
+ * disables the mode and calls arch_leave_lazy_mmu_mode(). Otherwise (nested
+ * section), calls arch_flush_lazy_mmu_mode().
+ *
+ * Must match a call to lazy_mmu_mode_enable().
+ *
+ * Has no effect if called:
+ * - While paused - see lazy_mmu_mode_pause()
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_disable(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt() || state->pause_count > 0)
+ return;
+
+ VM_WARN_ON_ONCE(state->enable_count == 0);
+
+ if (--state->enable_count == 0)
+ arch_leave_lazy_mmu_mode();
+ else /* Exiting a nested section */
+ arch_flush_lazy_mmu_mode();
+}
+
+/**
+ * lazy_mmu_mode_pause() - Pause the lazy MMU mode.
+ *
+ * Pauses the lazy MMU mode; if it is currently active, disables it and calls
+ * arch_leave_lazy_mmu_mode().
+ *
+ * Must be paired with a call to lazy_mmu_mode_resume(). Calls to the
+ * lazy_mmu_mode_* API have no effect until the matching resume() call.
+ *
+ * Has no effect if called:
+ * - While paused (inside another pause()/resume() pair)
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_pause(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt())
+ return;
+
+ VM_WARN_ON_ONCE(state->pause_count == U8_MAX);
+
+ if (state->pause_count++ == 0 && state->enable_count > 0)
+ arch_leave_lazy_mmu_mode();
+}
+
+/**
+ * lazy_mmu_mode_resume() - Resume the lazy MMU mode.
+ *
+ * Resumes the lazy MMU mode; if it was active at the point where the matching
+ * call to lazy_mmu_mode_pause() was made, re-enables it and calls
+ * arch_enter_lazy_mmu_mode().
+ *
+ * Must match a call to lazy_mmu_mode_pause().
+ *
+ * Has no effect if called:
+ * - While paused (inside another pause()/resume() pair)
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_resume(void)
+{
+ struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+ if (in_interrupt())
+ return;
+
+ VM_WARN_ON_ONCE(state->pause_count == 0);
+
+ if (--state->pause_count == 0 && state->enable_count > 0)
+ arch_enter_lazy_mmu_mode();
+}
+#else
+static inline void lazy_mmu_mode_enable(void) {}
+static inline void lazy_mmu_mode_disable(void) {}
+static inline void lazy_mmu_mode_pause(void) {}
+static inline void lazy_mmu_mode_resume(void) {}
#endif
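
A hedged sketch of the documented usage, combining the nesting and pause rules above (the PTE update is only an example of work the arch may batch; the usual mm/addr/ptep/pte/nr variables are assumed to be in scope):

	lazy_mmu_mode_enable();
	set_ptes(mm, addr, ptep, pte, nr);	/* may be batched by the arch */

	lazy_mmu_mode_pause();			/* leave the mode, whatever the nesting */
	/* ... code that must see the updates, or that might sleep ... */
	lazy_mmu_mode_resume();			/* re-enter at the same nesting level */

	lazy_mmu_mode_disable();		/* flushes batched state on the way out */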
#ifndef pte_batch_hint
@@ -289,7 +429,7 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr)
{
- page_table_check_ptes_set(mm, ptep, pte, nr);
+ page_table_check_ptes_set(mm, addr, ptep, pte, nr);
for (;;) {
set_pte(ptep, pte);
@@ -494,7 +634,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t pte = ptep_get(ptep);
pte_clear(mm, address, ptep);
- page_table_check_pte_clear(mm, pte);
+ page_table_check_pte_clear(mm, address, pte);
return pte;
}
#endif
@@ -553,7 +693,7 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
* No need for ptep_get_and_clear(): page table check doesn't care about
* any bits that could have been set by HW concurrently.
*/
- page_table_check_pte_clear(mm, pte);
+ page_table_check_pte_clear(mm, addr, pte);
}
#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
@@ -648,7 +788,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
pmd_t pmd = *pmdp;
pmd_clear(pmdp);
- page_table_check_pmd_clear(mm, pmd);
+ page_table_check_pmd_clear(mm, address, pmd);
return pmd;
}
@@ -661,7 +801,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
pud_t pud = *pudp;
pud_clear(pudp);
- page_table_check_pud_clear(mm, pud);
+ page_table_check_pud_clear(mm, address, pud);
return pud;
}
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index daa92a58585d..8dc0871e5f00 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -92,6 +92,7 @@ struct anon_vma_chain {
};
enum ttu_flags {
+ TTU_USE_SHARED_ZEROPAGE = 0x2, /* for unused pages of large folios */
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */
@@ -104,75 +105,8 @@ enum ttu_flags {
};
#ifdef CONFIG_MMU
-static inline void get_anon_vma(struct anon_vma *anon_vma)
-{
- atomic_inc(&anon_vma->refcount);
-}
-
-void __put_anon_vma(struct anon_vma *anon_vma);
-
-static inline void put_anon_vma(struct anon_vma *anon_vma)
-{
- if (atomic_dec_and_test(&anon_vma->refcount))
- __put_anon_vma(anon_vma);
-}
-
-static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
-{
- down_write(&anon_vma->root->rwsem);
-}
-
-static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
-{
- return down_write_trylock(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
-{
- up_write(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
-{
- down_read(&anon_vma->root->rwsem);
-}
-
-static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
-{
- return down_read_trylock(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
-{
- up_read(&anon_vma->root->rwsem);
-}
-
-/*
- * anon_vma helper functions.
- */
void anon_vma_init(void); /* create anon_vma_cachep */
-int __anon_vma_prepare(struct vm_area_struct *);
-void unlink_anon_vmas(struct vm_area_struct *);
-int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
-int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
-
-static inline int anon_vma_prepare(struct vm_area_struct *vma)
-{
- if (likely(vma->anon_vma))
- return 0;
-
- return __anon_vma_prepare(vma);
-}
-
-static inline void anon_vma_merge(struct vm_area_struct *vma,
- struct vm_area_struct *next)
-{
- VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
- unlink_anon_vmas(next);
-}
-
-struct anon_vma *folio_get_anon_vma(const struct folio *folio);
#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
@@ -1000,12 +934,8 @@ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-enum rmp_flags {
- RMP_LOCKED = 1 << 0,
- RMP_USE_SHARED_ZEROPAGE = 1 << 1,
-};
-
-void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
+void remove_migration_ptes(struct folio *src, struct folio *dst,
+ enum ttu_flags flags);
/*
* rmap_walk_control: To control rmap traversing for specific needs
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 36ae08ca0c62..873e400aafce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1410,6 +1410,10 @@ struct task_struct {
struct page_frag task_frag;
+#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE
+ struct lazy_mmu_state lazy_mmu_state;
+#endif
+
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
@@ -1693,6 +1697,47 @@ static inline char task_state_to_char(struct task_struct *tsk)
return task_index_to_char(task_state_index(tsk));
}
+#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE
+/**
+ * __task_lazy_mmu_mode_active() - Test the lazy MMU mode state for a task.
+ * @tsk: The task to check.
+ *
+ * Test whether @tsk has its lazy MMU mode state set to active (i.e. enabled
+ * and not paused).
+ *
+ * This function only considers the state saved in task_struct; to test whether
+ * current actually is in lazy MMU mode, is_lazy_mmu_mode_active() should be
+ * used instead.
+ *
+ * This function is intended for architectures that implement the lazy MMU
+ * mode; it must not be called from generic code.
+ */
+static inline bool __task_lazy_mmu_mode_active(struct task_struct *tsk)
+{
+ struct lazy_mmu_state *state = &tsk->lazy_mmu_state;
+
+ return state->enable_count > 0 && state->pause_count == 0;
+}
+
+/**
+ * is_lazy_mmu_mode_active() - Test whether we are currently in lazy MMU mode.
+ *
+ * Test whether the current context is in lazy MMU mode. This is true if both:
+ * 1. We are not in interrupt context
+ * 2. Lazy MMU mode is active for the current task
+ *
+ * This function is intended for architectures that implement the lazy MMU
+ * mode; it must not be called from generic code.
+ */
+static inline bool is_lazy_mmu_mode_active(void)
+{
+ if (in_interrupt())
+ return false;
+
+ return __task_lazy_mmu_mode_active(current);
+}
+#endif
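
An architecture implementing the mode would typically consult this predicate in its PTE helpers; an illustrative sketch (queue_pte_update() and native_set_pte() are hypothetical stand-ins):

	static inline void set_pte(pte_t *ptep, pte_t pte)
	{
		if (is_lazy_mmu_mode_active())
			queue_pte_update(ptep, pte);	/* defer/batch the update */
		else
			native_set_pte(ptep, pte);	/* write it out immediately */
	}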
+
extern struct pid *cad_pid;
/*
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38ca3df68716..62fc7499b408 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -224,13 +224,11 @@ enum {
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/* Bit flag in swap_map */
-#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */
/* Special value in first swap_map */
#define SWAP_MAP_MAX 0x3e /* Max count */
#define SWAP_MAP_BAD 0x3f /* Note page is bad */
-#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */
/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX 0x7f /* Max count */
@@ -453,16 +451,7 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
-int folio_alloc_swap(struct folio *folio);
-bool folio_free_swap(struct folio *folio);
-void put_swap_folio(struct folio *folio, swp_entry_t entry);
-extern swp_entry_t get_swap_page_of_type(int);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
-extern void swap_shmem_alloc(swp_entry_t, int);
-extern int swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t entry, int nr);
-extern void swap_free_nr(swp_entry_t entry, int nr_pages);
-extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
@@ -474,6 +463,29 @@ struct backing_dev_info;
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);
+/*
+ * If there is an existing swap slot reference (swap entry) and the caller
+ * guarantees that it cannot be modified concurrently (e.g., the PTL
+ * protecting the swap entry in a page table, or shmem's cmpxchg protecting
+ * the swap entry in the shmem mapping), the two helpers below can be used
+ * to put/dup the entries directly.
+ *
+ * All entries must be allocated by folio_alloc_swap() and must have
+ * a swap count > 1. See the comments of the folio_*_swap helpers for more info.
+ */
+int swap_dup_entry_direct(swp_entry_t entry);
+void swap_put_entries_direct(swp_entry_t entry, int nr);
+
+/*
+ * folio_free_swap() tries to free the swap entries pinned by a swap cache
+ * folio; it has to be here so it can be called by other components.
+ */
+bool folio_free_swap(struct folio *folio);
+
+/* Allocate / free exclusive (hibernation) swap entries */
+swp_entry_t swap_alloc_hibernation_slot(int type);
+void swap_free_hibernation_slot(swp_entry_t entry);
+
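
A hedged sketch of the documented usage, with the page table lock held over a swap PTE (ptep assumed in scope) and assuming, as with the !CONFIG_SWAP stub below, that a zero return means success:

	swp_entry_t entry = pte_to_swp_entry(ptep_get(ptep));

	if (!swap_dup_entry_direct(entry)) {
		/* ... install another reference to @entry, e.g. a second swap PTE ... */
	}
	/* later, still under a lock that pins the entry: */
	swap_put_entries_direct(entry, 1);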
static inline void put_swap_device(struct swap_info_struct *si)
{
percpu_ref_put(&si->users);
@@ -501,10 +513,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
#define free_pages_and_swap_cache(pages, nr) \
release_pages((pages), (nr));
-static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
-{
-}
-
static inline void free_swap_cache(struct folio *folio)
{
}
@@ -514,25 +522,12 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
return 0;
}
-static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
-{
-}
-
-static inline int swap_duplicate(swp_entry_t swp)
-{
- return 0;
-}
-
-static inline int swapcache_prepare(swp_entry_t swp, int nr)
+static inline int swap_dup_entry_direct(swp_entry_t ent)
{
return 0;
}
-static inline void swap_free_nr(swp_entry_t entry, int nr_pages)
-{
-}
-
-static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
+static inline void swap_put_entries_direct(swp_entry_t ent, int nr)
{
}
@@ -551,11 +546,6 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline int folio_alloc_swap(struct folio *folio)
-{
- return -EINVAL;
-}
-
static inline bool folio_free_swap(struct folio *folio)
{
return false;
@@ -568,17 +558,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
return -EINVAL;
}
#endif /* CONFIG_SWAP */
-
-static inline void free_swap_and_cache(swp_entry_t entry)
-{
- free_swap_and_cache_nr(entry, 1);
-}
-
-static inline void swap_free(swp_entry_t entry)
-{
- swap_free_nr(entry, 1);
-}
-
#ifdef CONFIG_MEMCG
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 92f80b4d69a6..22a139f82d75 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -122,13 +122,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
#endif
-#ifdef CONFIG_MEMORY_BALLOON
+#ifdef CONFIG_BALLOON
BALLOON_INFLATE,
BALLOON_DEFLATE,
-#ifdef CONFIG_BALLOON_COMPACTION
+#ifdef CONFIG_BALLOON_MIGRATION
BALLOON_MIGRATE,
-#endif
-#endif
+#endif /* CONFIG_BALLOON_MIGRATION */
+#endif /* CONFIG_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1909b945b3ea..3c9c266cf782 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -286,10 +286,8 @@ void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);
-extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
-extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
@@ -395,10 +393,6 @@ static inline void __dec_node_page_state(struct page *page,
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state
-#define inc_zone_state __inc_zone_state
-#define inc_node_state __inc_node_state
-#define dec_zone_state __dec_zone_state
-
#define set_pgdat_percpu_threshold(pgdat, callback) { }
static inline void refresh_zone_stat_thresholds(void) { }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f48e8ccffe81..e530112c4b3a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -328,9 +328,6 @@ struct dirty_throttle_control {
bool dirty_exceeded;
};
-void laptop_io_completion(struct backing_dev_info *info);
-void laptop_sync_completion(void);
-void laptop_mode_timer_fn(struct timer_list *t);
bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain;
/* These are exported to sysctl. */
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
-extern int laptop_mode;
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index f3ccff2d966c..478410c880b1 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -22,6 +22,7 @@ struct zs_pool_stats {
};
struct zs_pool;
+struct scatterlist;
struct zs_pool *zs_create_pool(const char *name);
void zs_destroy_pool(struct zs_pool *pool);
@@ -40,9 +41,12 @@ unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size);
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
- void *local_copy);
+ size_t mem_len, void *local_copy);
void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
- void *handle_mem);
+ size_t mem_len, void *handle_mem);
+void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle,
+ struct scatterlist *sg, size_t mem_len);
+void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle);
void zs_obj_write(struct zs_pool *pool, unsigned long handle,
void *handle_mem, size_t mem_len);