summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/memory_model.h2
-rw-r--r--include/crypto/scatterwalk.h4
-rw-r--r--include/linux/alloc_tag.h12
-rw-r--r--include/linux/blkdev.h2
-rw-r--r--include/linux/bpfptr.h2
-rw-r--r--include/linux/bvec.h7
-rw-r--r--include/linux/codetag.h5
-rw-r--r--include/linux/damon.h18
-rw-r--r--include/linux/freezer.h2
-rw-r--r--include/linux/fs.h8
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/highmem-internal.h36
-rw-r--r--include/linux/highmem.h8
-rw-r--r--include/linux/huge_mm.h112
-rw-r--r--include/linux/hugetlb.h7
-rw-r--r--include/linux/idr.h8
-rw-r--r--include/linux/kasan-enabled.h32
-rw-r--r--include/linux/kasan.h19
-rw-r--r--include/linux/kernel.h21
-rw-r--r--include/linux/kexec.h5
-rw-r--r--include/linux/kexec_handover.h6
-rw-r--r--include/linux/khugepaged.h6
-rw-r--r--include/linux/ksm.h12
-rw-r--r--include/linux/list.h8
-rw-r--r--include/linux/local_lock.h2
-rw-r--r--include/linux/local_lock_internal.h16
-rw-r--r--include/linux/maple_tree.h33
-rw-r--r--include/linux/memcontrol.h22
-rw-r--r--include/linux/mempool.h2
-rw-r--r--include/linux/memremap.h45
-rw-r--r--include/linux/migrate.h11
-rw-r--r--include/linux/mm.h273
-rw-r--r--include/linux/mm_inline.h37
-rw-r--r--include/linux/mm_types.h136
-rw-r--r--include/linux/mman.h2
-rw-r--r--include/linux/mmap_lock.h85
-rw-r--r--include/linux/mmzone.h91
-rw-r--r--include/linux/moduleparam.h13
-rw-r--r--include/linux/netfs.h2
-rw-r--r--include/linux/nvmem-provider.h2
-rw-r--r--include/linux/oom.h2
-rw-r--r--include/linux/page-flags.h42
-rw-r--r--include/linux/pageblock-flags.h12
-rw-r--r--include/linux/pagemap.h65
-rw-r--r--include/linux/pagevec.h4
-rw-r--r--include/linux/panic.h6
-rw-r--r--include/linux/pgalloc_tag.h7
-rw-r--r--include/linux/pgtable.h26
-rw-r--r--include/linux/printk.h2
-rw-r--r--include/linux/rmap.h67
-rw-r--r--include/linux/rtmutex.h10
-rw-r--r--include/linux/scatterlist.h3
-rw-r--r--include/linux/sched/coredump.h18
-rw-r--r--include/linux/sched/mm.h4
-rw-r--r--include/linux/sched/task.h5
-rw-r--r--include/linux/shmem_fs.h4
-rw-r--r--include/linux/slab.h90
-rw-r--r--include/linux/swap.h50
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/linux/vmalloc.h12
-rw-r--r--include/linux/writeback.h6
-rw-r--r--include/linux/zpool.h86
-rw-r--r--include/trace/events/cma.h19
-rw-r--r--include/trace/events/huge_memory.h19
-rw-r--r--include/trace/events/kmem.h5
-rw-r--r--include/trace/events/page_ref.h4
-rw-r--r--include/trace/events/readahead.h132
-rw-r--r--include/uapi/linux/kexec.h4
-rw-r--r--include/uapi/linux/mempolicy.h12
-rw-r--r--include/uapi/linux/prctl.h10
70 files changed, 1132 insertions, 712 deletions
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 74d0077cc5fa..efa6610acbc7 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -53,7 +53,7 @@ static inline int pfn_valid(unsigned long pfn)
*/
#define __page_to_pfn(pg) \
({ const struct page *__pg = (pg); \
- int __sec = page_to_section(__pg); \
+ int __sec = memdesc_section(__pg->flags); \
(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \
})
diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h
index 15ab743f68c8..83d14376ff2b 100644
--- a/include/crypto/scatterwalk.h
+++ b/include/crypto/scatterwalk.h
@@ -159,7 +159,7 @@ static inline void scatterwalk_map(struct scatter_walk *walk)
if (IS_ENABLED(CONFIG_HIGHMEM)) {
struct page *page;
- page = nth_page(base_page, offset >> PAGE_SHIFT);
+ page = base_page + (offset >> PAGE_SHIFT);
offset = offset_in_page(offset);
addr = kmap_local_page(page) + offset;
} else {
@@ -259,7 +259,7 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk,
end += (offset_in_page(offset) + offset_in_page(nbytes) +
PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = start; i < end; i++)
- flush_dcache_page(nth_page(base_page, i));
+ flush_dcache_page(base_page + i);
}
scatterwalk_advance(walk, nbytes);
}
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 9ef2633e2c08..d40ac39bfbe8 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -221,6 +221,16 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
ref->ct = NULL;
}
+static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag)
+{
+ tag->ct.flags |= CODETAG_FLAG_INACCURATE;
+}
+
+static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag)
+{
+ return !!(tag->ct.flags & CODETAG_FLAG_INACCURATE);
+}
+
#define alloc_tag_record(p) ((p) = current->alloc_tag)
#else /* CONFIG_MEM_ALLOC_PROFILING */
@@ -230,6 +240,8 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; }
static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
size_t bytes) {}
static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) {}
+static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) { return false; }
#define alloc_tag_record(p) do {} while (0)
#endif /* CONFIG_MEM_ALLOC_PROFILING */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 066e5309bd45..dad5cb5b3812 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -199,7 +199,7 @@ struct gendisk {
unsigned int zone_wplugs_hash_bits;
atomic_t nr_zone_wplugs;
spinlock_t zone_wplugs_lock;
- struct mempool_s *zone_wplugs_pool;
+ struct mempool *zone_wplugs_pool;
struct hlist_head *zone_wplugs_hash;
struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */
diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 1af241525a17..f6e0795db484 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
{
- void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN);
+ void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE);
if (!p)
return ERR_PTR(-ENOMEM);
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 0a80e1f9aa20..3fc0efa0825b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -22,11 +22,8 @@ struct page;
* @bv_len: Number of bytes in the address range.
* @bv_offset: Start of the address range relative to the start of @bv_page.
*
- * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
- *
- * nth_page(@bv_page, n) == @bv_page + n
- *
- * This holds because page_is_mergeable() checks the above property.
+ * All pages within a bio_vec starting from @bv_page are contiguous and
+ * can simply be iterated (see bvec_advance()).
*/
struct bio_vec {
struct page *bv_page;
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 457ed8fd3214..8ea2a5f7c98a 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -16,13 +16,16 @@ struct module;
#define CODETAG_SECTION_START_PREFIX "__start_"
#define CODETAG_SECTION_STOP_PREFIX "__stop_"
+/* codetag flags */
+#define CODETAG_FLAG_INACCURATE (1 << 0)
+
/*
* An instance of this structure is created in a special ELF section at every
* code location being tagged. At runtime, the special section is treated as
* an array of these.
*/
struct codetag {
- unsigned int flags; /* used in later patches */
+ unsigned int flags;
unsigned int lineno;
const char *modname;
const char *function;
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9e62b2a85538..cae8c613c5fc 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -110,7 +110,7 @@ struct damon_target {
*
* @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED.
* @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD.
- * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT.
+ * @DAMOS_PAGEOUT: Reclaim the region.
* @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
* @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
* @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists.
@@ -121,10 +121,10 @@ struct damon_target {
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions
*
* The support of each action is up to running &struct damon_operations.
- * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except
- * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR
- * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum
- * DAMOS_LRU_DEPRIO, and &DAMOS_STAT.
+ * Refer to 'Operation Action' section of Documentation/mm/damon/design.rst for
+ * status of the supports.
+ *
+ * Note that DAMOS_PAGEOUT doesn't trigger demotions.
*/
enum damos_action {
DAMOS_WILLNEED,
@@ -748,7 +748,8 @@ struct damon_attrs {
* Accesses to other fields must be protected by themselves.
*
* @ops: Set of monitoring operations for given use cases.
- *
+ * @addr_unit: Scale factor for core to ops address conversion.
+ * @min_sz_region: Minimum region size.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
@@ -790,6 +791,8 @@ struct damon_ctx {
struct mutex kdamond_lock;
struct damon_operations ops;
+ unsigned long addr_unit;
+ unsigned long min_sz_region;
struct list_head adaptive_targets;
struct list_head schemes;
@@ -878,7 +881,7 @@ static inline void damon_insert_region(struct damon_region *r,
void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges);
+ unsigned int nr_ranges, unsigned long min_sz_region);
void damon_update_region_access_rate(struct damon_region *r, bool accessed,
struct damon_attrs *attrs);
@@ -935,6 +938,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs
}
+bool damon_initialized(void);
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
bool damon_is_running(struct damon_ctx *ctx);
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index b303472255be..32884c9721e5 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -47,6 +47,7 @@ extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);
+extern void thaw_process(struct task_struct *p);
static inline bool try_to_freeze(void)
{
@@ -80,6 +81,7 @@ static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}
+static inline void thaw_process(struct task_struct *p) {}
static inline bool try_to_freeze(void) { return false; }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e9d7c757efe..75fb216b0f7a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -537,7 +537,7 @@ struct address_space {
/*
* Returns true if any of the pages in the mapping are marked with the tag.
*/
-static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
+static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag)
{
return xa_marked(&mapping->i_pages, tag);
}
@@ -585,7 +585,7 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping)
/*
* Might pages of this file be mapped into userspace?
*/
-static inline int mapping_mapped(struct address_space *mapping)
+static inline int mapping_mapped(const struct address_space *mapping)
{
return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
}
@@ -599,7 +599,7 @@ static inline int mapping_mapped(struct address_space *mapping)
* If i_mmap_writable is negative, no new writable mappings are allowed. You
* can only deny writable mappings, if none exists right now.
*/
-static inline int mapping_writably_mapped(struct address_space *mapping)
+static inline int mapping_writably_mapped(const struct address_space *mapping)
{
return atomic_read(&mapping->i_mmap_writable) > 0;
}
@@ -2385,6 +2385,8 @@ static inline bool can_mmap_file(struct file *file)
return true;
}
+int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+ struct file *file, struct vm_area_struct *vma);
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5ebf26fcdcfa..0ceb4e09306c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order);
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
#define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__))
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 36053c3d6d64..0574c21ca45d 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -7,7 +7,7 @@
*/
#ifdef CONFIG_KMAP_LOCAL
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
-void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
+void *__kmap_local_page_prot(const struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
@@ -33,7 +33,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { }
#endif
void *kmap_high(struct page *page);
-void kunmap_high(struct page *page);
+void kunmap_high(const struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);
@@ -50,7 +50,7 @@ static inline void *kmap(struct page *page)
return addr;
}
-static inline void kunmap(struct page *page)
+static inline void kunmap(const struct page *page)
{
might_sleep();
if (!PageHighMem(page))
@@ -68,12 +68,12 @@ static inline void kmap_flush_unused(void)
__kmap_flush_unused();
}
-static inline void *kmap_local_page(struct page *page)
+static inline void *kmap_local_page(const struct page *page)
{
return __kmap_local_page_prot(page, kmap_prot);
}
-static inline void *kmap_local_page_try_from_panic(struct page *page)
+static inline void *kmap_local_page_try_from_panic(const struct page *page)
{
if (!PageHighMem(page))
return page_address(page);
@@ -81,13 +81,13 @@ static inline void *kmap_local_page_try_from_panic(struct page *page)
return NULL;
}
-static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+static inline void *kmap_local_folio(const struct folio *folio, size_t offset)
{
- struct page *page = folio_page(folio, offset / PAGE_SIZE);
+ const struct page *page = folio_page(folio, offset / PAGE_SIZE);
return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
}
-static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
return __kmap_local_page_prot(page, prot);
}
@@ -102,7 +102,7 @@ static inline void __kunmap_local(const void *vaddr)
kunmap_local_indexed(vaddr);
}
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
migrate_disable();
@@ -113,7 +113,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
return __kmap_local_page_prot(page, prot);
}
-static inline void *kmap_atomic(struct page *page)
+static inline void *kmap_atomic(const struct page *page)
{
return kmap_atomic_prot(page, kmap_prot);
}
@@ -173,32 +173,32 @@ static inline void *kmap(struct page *page)
return page_address(page);
}
-static inline void kunmap_high(struct page *page) { }
+static inline void kunmap_high(const struct page *page) { }
static inline void kmap_flush_unused(void) { }
-static inline void kunmap(struct page *page)
+static inline void kunmap(const struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(page_address(page));
#endif
}
-static inline void *kmap_local_page(struct page *page)
+static inline void *kmap_local_page(const struct page *page)
{
return page_address(page);
}
-static inline void *kmap_local_page_try_from_panic(struct page *page)
+static inline void *kmap_local_page_try_from_panic(const struct page *page)
{
return page_address(page);
}
-static inline void *kmap_local_folio(struct folio *folio, size_t offset)
+static inline void *kmap_local_folio(const struct folio *folio, size_t offset)
{
return folio_address(folio) + offset;
}
-static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
return kmap_local_page(page);
}
@@ -215,7 +215,7 @@ static inline void __kunmap_local(const void *addr)
#endif
}
-static inline void *kmap_atomic(struct page *page)
+static inline void *kmap_atomic(const struct page *page)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
migrate_disable();
@@ -225,7 +225,7 @@ static inline void *kmap_atomic(struct page *page)
return page_address(page);
}
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot)
{
return kmap_atomic(page);
}
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 6234f316468c..105cc4c00cc3 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -43,7 +43,7 @@ static inline void *kmap(struct page *page);
* Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of
* pages in the low memory area.
*/
-static inline void kunmap(struct page *page);
+static inline void kunmap(const struct page *page);
/**
* kmap_to_page - Get the page for a kmap'ed address
@@ -93,7 +93,7 @@ static inline void kmap_flush_unused(void);
* disabling migration in order to keep the virtual address stable across
* preemption. No caller of kmap_local_page() can rely on this side effect.
*/
-static inline void *kmap_local_page(struct page *page);
+static inline void *kmap_local_page(const struct page *page);
/**
* kmap_local_folio - Map a page in this folio for temporary usage
@@ -129,7 +129,7 @@ static inline void *kmap_local_page(struct page *page);
* Context: Can be invoked from any context.
* Return: The virtual address of @offset.
*/
-static inline void *kmap_local_folio(struct folio *folio, size_t offset);
+static inline void *kmap_local_folio(const struct folio *folio, size_t offset);
/**
* kmap_atomic - Atomically map a page for temporary usage - Deprecated!
@@ -176,7 +176,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset);
* kunmap_atomic(vaddr2);
* kunmap_atomic(vaddr1);
*/
-static inline void *kmap_atomic(struct page *page);
+static inline void *kmap_atomic(const struct page *page);
/* Highmem related interfaces for management code */
static inline unsigned long nr_free_highpages(void);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b..f327d62fc985 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define THP_ORDERS_ALL \
(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
-#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
-#define TVA_IN_PF (1 << 1) /* Page fault handler */
-#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
+enum tva_type {
+ TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */
+ TVA_PAGEFAULT, /* Serving a page fault. */
+ TVA_KHUGEPAGED, /* Khugepaged collapse. */
+ TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */
+};
-#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
- (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
+#define thp_vma_allowable_order(vma, vm_flags, type, order) \
+ (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order)))
#define split_folio(f) split_folio_to_list(f, NULL)
@@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders);
/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
* @vm_flags: use these vm_flags instead of vma->vm_flags
- * @tva_flags: Which TVA flags to honour
+ * @type: TVA type
* @orders: bitfield of all orders to consider
*
* Calculates the intersection of the requested hugepage orders and the allowed
@@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- /* Optimization to check if required orders are enabled early. */
- if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
+ /*
+ * Optimization to check if required orders are enabled early. Only
+ * forced collapse ignores sysfs configs.
+ */
+ if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
if (vm_flags & VM_HUGEPAGE)
@@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}
- return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
+ return __thp_vma_allowable_orders(vma, vm_flags, type, orders);
}
struct thpsize {
@@ -318,16 +324,32 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+/*
+ * Check whether THPs are explicitly disabled for this VMA, for example,
+ * through madvise or prctl.
+ */
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
-{
+ vm_flags_t vm_flags, bool forced_collapse)
+{
+ /* Are THPs disabled for this VMA? */
+ if (vm_flags & VM_NOHUGEPAGE)
+ return true;
+ /* Are THPs disabled for all VMAs in the whole process? */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm))
+ return true;
/*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
+ * Are THPs disabled only for VMAs where we didn't get an explicit
+ * advise to use them?
*/
- return (vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
+ if (vm_flags & VM_HUGEPAGE)
+ return false;
+ /*
+ * Forcing a collapse (e.g., madv_collapse), is a clear advice to
+ * use THPs.
+ */
+ if (forced_collapse)
+ return false;
+ return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm);
}
static inline bool thp_disabled_by_hw(void)
@@ -479,6 +501,8 @@ extern unsigned long huge_zero_pfn;
static inline bool is_huge_zero_folio(const struct folio *folio)
{
+ VM_WARN_ON_ONCE(!folio);
+
return READ_ONCE(huge_zero_folio) == folio;
}
@@ -495,6 +519,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return NULL;
+
+ if (unlikely(!huge_zero_folio))
+ return NULL;
+
+ return huge_zero_folio;
+}
+
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
@@ -526,7 +561,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
return 0;
@@ -553,22 +588,26 @@ static inline int
split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
{
- return 0;
+ VM_WARN_ON_ONCE_PAGE(1, page);
+ return -EINVAL;
}
static inline int split_huge_page(struct page *page)
{
- return 0;
+ VM_WARN_ON_ONCE_PAGE(1, page);
+ return -EINVAL;
}
static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
{
- return 0;
+ VM_WARN_ON_ONCE_FOLIO(1, folio);
+ return -EINVAL;
}
static inline int try_folio_split(struct folio *folio, struct page *page,
struct list_head *list)
{
- return 0;
+ VM_WARN_ON_ONCE_FOLIO(1, folio);
+ return -EINVAL;
}
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
@@ -685,6 +724,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
{
return 0;
}
+
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ return NULL;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,
@@ -698,4 +742,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
return split_folio_to_list_to_order(folio, NULL, new_order);
}
+/**
+ * largest_zero_folio - Get the largest zero size folio available
+ *
+ * This function shall be used when mm_get_huge_zero_folio() cannot be
+ * used as there is no appropriate mm lifetime to tie the huge zero folio
+ * from the caller.
+ *
+ * Deduce the size of the folio with folio_size instead of assuming the
+ * folio size.
+ *
+ * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO
+ * is enabled or a single page sized zero folio
+ */
+static inline struct folio *largest_zero_folio(void)
+{
+ struct folio *folio = get_persistent_huge_zero_folio();
+
+ if (folio)
+ return folio;
+
+ return page_folio(ZERO_PAGE(0));
+}
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 526d27e88b3b..8e63e46b8e1f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -788,9 +788,14 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
}
+static inline bool order_is_gigantic(unsigned int order)
+{
+ return order > MAX_PAGE_ORDER;
+}
+
static inline bool hstate_is_gigantic(struct hstate *h)
{
- return huge_page_order(h) > MAX_PAGE_ORDER;
+ return order_is_gigantic(huge_page_order(h));
}
static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 2267902d29a7..789e23e67444 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida)
xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}
-/*
- * ida_simple_get() and ida_simple_remove() are deprecated. Use
- * ida_alloc() and ida_free() instead respectively.
- */
-#define ida_simple_get(ida, start, end, gfp) \
- ida_alloc_range(ida, start, (end) - 1, gfp)
-#define ida_simple_remove(ida, id) ida_free(ida, id)
-
static inline bool ida_is_empty(const struct ida *ida)
{
return xa_empty(&ida->xa);
diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h
index 6f612d69ea0c..9eca967d8526 100644
--- a/include/linux/kasan-enabled.h
+++ b/include/linux/kasan-enabled.h
@@ -4,32 +4,46 @@
#include <linux/static_key.h>
-#ifdef CONFIG_KASAN_HW_TAGS
-
+#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS)
+/*
+ * Global runtime flag for KASAN modes that need runtime control.
+ * Used by ARCH_DEFER_KASAN architectures and HW_TAGS mode.
+ */
DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);
+/*
+ * Runtime control for shadow memory initialization or HW_TAGS mode.
+ * Uses static key for architectures that need deferred KASAN or HW_TAGS.
+ */
static __always_inline bool kasan_enabled(void)
{
return static_branch_likely(&kasan_flag_enabled);
}
-static inline bool kasan_hw_tags_enabled(void)
+static inline void kasan_enable(void)
{
- return kasan_enabled();
+ static_branch_enable(&kasan_flag_enabled);
}
-
-#else /* CONFIG_KASAN_HW_TAGS */
-
-static inline bool kasan_enabled(void)
+#else
+/* For architectures that can enable KASAN early, use compile-time check. */
+static __always_inline bool kasan_enabled(void)
{
return IS_ENABLED(CONFIG_KASAN);
}
+static inline void kasan_enable(void) {}
+#endif /* CONFIG_ARCH_DEFER_KASAN || CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_HW_TAGS
+static inline bool kasan_hw_tags_enabled(void)
+{
+ return kasan_enabled();
+}
+#else
static inline bool kasan_hw_tags_enabled(void)
{
return false;
}
-
#endif /* CONFIG_KASAN_HW_TAGS */
#endif /* LINUX_KASAN_ENABLED_H */
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index fe5ce9215821..d12e1a5f5a9a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
}
bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
- bool still_accessible);
+ bool still_accessible, bool no_quarantine);
/**
* kasan_slab_free - Poison, initialize, and quarantine a slab object.
* @object: Object to be freed.
@@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
* @Return true if KASAN took ownership of the object; false otherwise.
*/
static __always_inline bool kasan_slab_free(struct kmem_cache *s,
- void *object, bool init,
- bool still_accessible)
+ void *object, bool init,
+ bool still_accessible,
+ bool no_quarantine)
{
if (kasan_enabled())
- return __kasan_slab_free(s, object, init, still_accessible);
+ return __kasan_slab_free(s, object, init, still_accessible,
+ no_quarantine);
return false;
}
@@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
}
static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
- bool init, bool still_accessible)
+ bool init, bool still_accessible,
+ bool no_quarantine)
{
return false;
}
@@ -543,6 +546,12 @@ void kasan_report_async(void);
#endif /* CONFIG_KASAN_HW_TAGS */
+#ifdef CONFIG_KASAN_GENERIC
+void __init kasan_init_generic(void);
+#else
+static inline void kasan_init_generic(void) { }
+#endif
+
#ifdef CONFIG_KASAN_SW_TAGS
void __init kasan_init_sw_tags(void);
#else
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 989315dabb86..5b46924fdff5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -164,11 +164,23 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/*
- * Values used for system_state. Ordering of the states must not be changed
+/**
+ * enum system_states - Values used for system_state.
+ *
+ * @SYSTEM_BOOTING: %0, no init needed
+ * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU
+ * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running
+ * @SYSTEM_RUNNING: system is up and running
+ * @SYSTEM_HALT: system entered clean system halt state
+ * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state
+ * @SYSTEM_RESTART: system entered emergency power off or normal restart
+ * @SYSTEM_SUSPEND: system entered suspend or hibernate state
+ *
+ * Note:
+ * Ordering of the states must not be changed
* as code checks for <, <=, >, >= STATE.
*/
-extern enum system_states {
+enum system_states {
SYSTEM_BOOTING,
SYSTEM_SCHEDULING,
SYSTEM_FREEING_INITMEM,
@@ -177,7 +189,8 @@ extern enum system_states {
SYSTEM_POWER_OFF,
SYSTEM_RESTART,
SYSTEM_SUSPEND,
-} system_state;
+};
+extern enum system_states system_state;
/*
* General tracing related utility functions - trace_printk(),
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 39fe3e6cd282..ff7e231b0485 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -395,6 +395,9 @@ struct kimage {
/* Information for loading purgatory */
struct purgatory_info purgatory_info;
+
+ /* Force carrying over the DTB from the current boot */
+ bool force_dtb;
#endif
#ifdef CONFIG_CRASH_HOTPLUG
@@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type);
/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \
- KEXEC_FILE_NO_CMA)
+ KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB)
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..559d13a3bc44 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -40,6 +40,7 @@ struct kho_serialization;
#ifdef CONFIG_KEXEC_HANDOVER
bool kho_is_enabled(void);
+bool is_kho_boot(void);
int kho_preserve_folio(struct folio *folio);
int kho_preserve_phys(phys_addr_t phys, size_t size);
@@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void)
return false;
}
+static inline bool is_kho_boot(void)
+{
+ return false;
+}
+
static inline int kho_preserve_folio(struct folio *folio)
{
return -EOPNOTSUPP;
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index ff6120463745..eb1946a70cff 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
+#include <linux/mm.h>
+
extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
@@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
- if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+ if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm))
__khugepaged_enter(mm);
}
static inline void khugepaged_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
+ if (mm_flags_test(MMF_VM_HUGEPAGE, mm))
__khugepaged_exit(mm);
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index c17b955e7b0b..067538fc4d58 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -56,13 +56,19 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
/* Adding mm to ksm is best effort on fork. */
- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+ if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) {
+ long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages);
+
+ mm->ksm_merging_pages = 0;
+ mm->ksm_rmap_items = 0;
+ atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages);
__ksm_enter(mm);
+ }
}
static inline int ksm_execve(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return __ksm_enter(mm);
return 0;
@@ -70,7 +76,7 @@ static inline int ksm_execve(struct mm_struct *mm)
static inline void ksm_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGEABLE, mm))
__ksm_exit(mm);
}
diff --git a/include/linux/list.h b/include/linux/list.h
index 7f7657e41620..5bfda2f91fca 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -20,8 +20,16 @@
* using the generic single-entry routines.
*/
+/**
+ * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself
+ * @name: name of the list_head
+ */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
+/**
+ * LIST_HEAD - definition of a &struct list_head with initialization values
+ * @name: name of the list_head
+ */
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 2ba846419524..0d91d060e3e9 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -66,6 +66,8 @@
*/
#define local_trylock(lock) __local_trylock(this_cpu_ptr(lock))
+#define local_lock_is_locked(lock) __local_lock_is_locked(lock)
+
/**
* local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
* interrupts if acquired
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index d80b5306a2c0..a4dc479157b5 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -17,7 +17,10 @@ typedef struct {
/* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */
typedef struct {
- local_lock_t llock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+ struct task_struct *owner;
+#endif
u8 acquired;
} local_trylock_t;
@@ -31,7 +34,7 @@ typedef struct {
.owner = NULL,
# define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \
- .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) },
+ LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l)
{
@@ -81,7 +84,7 @@ do { \
local_lock_debug_init(lock); \
} while (0)
-#define __local_trylock_init(lock) __local_lock_init(lock.llock)
+#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock)
#define __spinlock_nested_bh_init(lock) \
do { \
@@ -162,6 +165,9 @@ do { \
!!tl; \
})
+/* preemption or migration must be disabled before calling __local_lock_is_locked */
+#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired)
+
#define __local_lock_release(lock) \
do { \
local_trylock_t *tl; \
@@ -282,4 +288,8 @@ do { \
__local_trylock(lock); \
})
+/* migration must be disabled before calling __local_lock_is_locked */
+#define __local_lock_is_locked(__lock) \
+ (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current)
+
#endif /* CONFIG_PREEMPT_RT */
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index bafe143b1f78..66f98a3da8d8 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -57,17 +57,17 @@
* MT_FLAGS_ALLOC_RANGE flag.
*
* Node types:
- * 0x??1 = Root
- * 0x?00 = 16 bit nodes
- * 0x010 = 32 bit nodes
- * 0x110 = 64 bit nodes
+ * 0b??1 = Root
+ * 0b?00 = 16 bit nodes
+ * 0b010 = 32 bit nodes
+ * 0b110 = 64 bit nodes
*
* Slot size and location in the parent pointer:
* type : slot location
- * 0x??1 : Root
- * 0x?00 : 16 bit values, type in 0-1, slot in 2-6
- * 0x010 : 32 bit values, type in 0-2, slot in 3-6
- * 0x110 : 64 bit values, type in 0-2, slot in 3-6
+ * 0b??1 : Root
+ * 0b?00 : 16 bit values, type in 0-1, slot in 2-6
+ * 0b010 : 32 bit values, type in 0-2, slot in 3-6
+ * 0b110 : 64 bit values, type in 0-2, slot in 3-6
*/
/*
@@ -194,7 +194,6 @@ enum store_type {
#define MAPLE_RESERVED_RANGE 4096
#ifdef CONFIG_LOCKDEP
-typedef struct lockdep_map *lockdep_map_p;
#define mt_lock_is_held(mt) \
(!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))
@@ -207,7 +206,6 @@ typedef struct lockdep_map *lockdep_map_p;
#define mt_on_stack(mt) (mt).ma_external_lock = NULL
#else
-typedef struct { /* nothing */ } lockdep_map_p;
#define mt_lock_is_held(mt) 1
#define mt_write_lock_is_held(mt) 1
#define mt_set_external_lock(mt, lock) do { } while (0)
@@ -230,8 +228,10 @@ typedef struct { /* nothing */ } lockdep_map_p;
*/
struct maple_tree {
union {
- spinlock_t ma_lock;
- lockdep_map_p ma_external_lock;
+ spinlock_t ma_lock;
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map *ma_external_lock;
+#endif
};
unsigned int ma_flags;
void __rcu *ma_root;
@@ -442,7 +442,9 @@ struct ma_state {
struct maple_enode *node; /* The node containing this entry */
unsigned long min; /* The minimum index of this node - implied pivot min */
unsigned long max; /* The maximum index of this node - implied pivot max */
- struct maple_alloc *alloc; /* Allocated nodes for this operation */
+ struct slab_sheaf *sheaf; /* Allocated nodes for this operation */
+ struct maple_node *alloc; /* A single allocated node for fast path writes */
+ unsigned long node_request; /* The number of nodes to allocate for this operation */
enum maple_status status; /* The status of the state (active, start, none, etc) */
unsigned char depth; /* depth of tree descent during write */
unsigned char offset;
@@ -481,6 +483,9 @@ struct ma_wr_state {
#define MA_ERROR(err) \
((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
+/*
+ * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs
+ */
#define MA_STATE(name, mt, first, end) \
struct ma_state name = { \
.tree = mt, \
@@ -490,7 +495,9 @@ struct ma_wr_state {
.status = ma_start, \
.min = 0, \
.max = ULONG_MAX, \
+ .sheaf = NULL, \
.alloc = NULL, \
+ .node_request = 0, \
.mas_flags = 0, \
.store_type = wr_invalid, \
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fb27e3d2fdac..16fe0306e50e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -341,17 +341,25 @@ enum page_memcg_data_flags {
__NR_MEMCG_DATA_FLAGS = (1UL << 2),
};
+#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS
#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS
#else /* CONFIG_MEMCG */
+#define __OBJEXTS_ALLOC_FAIL (1UL << 0)
#define __FIRST_OBJEXT_FLAG (1UL << 0)
#endif /* CONFIG_MEMCG */
enum objext_flags {
- /* slabobj_ext vector failed to allocate */
- OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
+ /*
+ * Use bit 0 with zero other bits to signal that slabobj_ext vector
+ * failed to allocate. The same bit 0 with valid upper bits means
+ * MEMCG_DATA_OBJEXTS.
+ */
+ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
+ /* slabobj_ext vector allocated with kmalloc_nolock() */
+ OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG,
/* the next bit after the last actual flag */
__NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
};
@@ -900,7 +908,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
}
-void mem_cgroup_handle_over_high(gfp_t gfp_mask);
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask);
+
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+{
+ if (unlikely(current->memcg_nr_pages_over_high))
+ __mem_cgroup_handle_over_high(gfp_mask);
+}
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
@@ -1053,6 +1067,8 @@ extern int mem_cgroup_init(void);
#define MEM_CGROUP_ID_SHIFT 0
+#define root_mem_cgroup (NULL)
+
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
return NULL;
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 7b151441341b..34941a4b9026 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -15,7 +15,7 @@ struct kmem_cache;
typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);
-typedef struct mempool_s {
+typedef struct mempool {
spinlock_t lock;
int min_nr; /* nr of elements at *elements */
int curr_nr; /* Current nr of elements at *elements */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 4aa151914eab..e5951ba12a28 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -157,45 +157,52 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
return 1 << pgmap->vmemmap_shift;
}
-static inline bool is_device_private_page(const struct page *page)
+static inline bool folio_is_device_private(const struct folio *folio)
{
return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
- is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE;
+ folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
-static inline bool folio_is_device_private(const struct folio *folio)
+static inline bool is_device_private_page(const struct page *page)
{
- return is_device_private_page(&folio->page);
+ return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ folio_is_device_private(page_folio(page));
}
-static inline bool is_pci_p2pdma_page(const struct page *page)
+static inline bool folio_is_pci_p2pdma(const struct folio *folio)
{
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
- is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA;
+ folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}
-static inline bool is_device_coherent_page(const struct page *page)
+static inline bool is_pci_p2pdma_page(const struct page *page)
{
- return is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_COHERENT;
+ return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+ folio_is_pci_p2pdma(page_folio(page));
}
static inline bool folio_is_device_coherent(const struct folio *folio)
{
- return is_device_coherent_page(&folio->page);
+ return folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_COHERENT;
}
-static inline bool is_fsdax_page(const struct page *page)
+static inline bool is_device_coherent_page(const struct page *page)
{
- return is_zone_device_page(page) &&
- page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX;
+ return folio_is_device_coherent(page_folio(page));
}
static inline bool folio_is_fsdax(const struct folio *folio)
{
- return is_fsdax_page(&folio->page);
+ return folio_is_zone_device(folio) &&
+ folio->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
+static inline bool is_fsdax_page(const struct page *page)
+{
+ return folio_is_fsdax(page_folio(page));
}
#ifdef CONFIG_ZONE_DEVICE
@@ -204,8 +211,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap);
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
unsigned long memremap_compat_align(void);
@@ -227,8 +233,7 @@ static inline void devm_memunmap_pages(struct device *dev,
{
}
-static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap)
+static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn)
{
return NULL;
}
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9009e27b5f44..1f0ac122c3bf 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -12,14 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private);
struct migration_target_control;
-/*
- * Return values from addresss_space_operations.migratepage():
- * - negative errno on page migration failure;
- * - zero on page migration success;
- */
-#define MIGRATEPAGE_SUCCESS 0
-#define MIGRATEPAGE_UNMAP 1
-
/**
* struct movable_operations - Driver page migration
* @isolate_page:
@@ -35,8 +27,7 @@ struct migration_target_control;
* @src page. The driver should copy the contents of the
* @src page to the @dst page and set up the fields of @dst page.
* Both pages are locked.
- * If page migration is successful, the driver should
- * return MIGRATEPAGE_SUCCESS.
+ * If page migration is successful, the driver should return 0.
* If the driver cannot migrate the page at the moment, it can return
* -EAGAIN. The VM interprets this as a temporary migration failure and
* will retry it later. Any other error value is a permanent migration
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ae97a0b8ec7..06978b4dbeb8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -34,6 +34,8 @@
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/rcuwait.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
struct mempolicy;
struct anon_vma;
@@ -69,6 +71,15 @@ static inline void totalram_pages_add(long count)
extern void * high_memory;
+/*
+ * Convert between pages and MB
+ * 20 is the shift for 1MB (2^20 = 1MB)
+ * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages)
+ * So (20 - PAGE_SHIFT) converts between pages and MB
+ */
+#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT))
+#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
@@ -198,11 +209,13 @@ extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
-#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
-#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio))
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages);
#else
-#define nth_page(page,n) ((page) + (n))
-#define folio_page_idx(folio, p) ((p) - &(folio)->page)
+static inline bool page_range_contiguous(const struct page *page,
+ unsigned long nr_pages)
+{
+ return true;
+}
#endif
/* to align the pointer to the (next) page boundary */
@@ -214,6 +227,20 @@ extern unsigned long sysctl_admin_reserve_kbytes;
/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
+/**
+ * folio_page_idx - Return the number of a page in a folio.
+ * @folio: The folio.
+ * @page: The folio page.
+ *
+ * This function expects that the page is actually part of the folio.
+ * The returned number is relative to the start of the folio.
+ */
+static inline unsigned long folio_page_idx(const struct folio *folio,
+ const struct page *page)
+{
+ return page - &folio->page;
+}
+
static inline struct folio *lru_to_folio(struct list_head *head)
{
return list_entry((head)->prev, struct folio, lru);
@@ -648,13 +675,21 @@ struct vm_operations_struct {
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *ilx);
#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
/*
- * Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
- * (using pte_page()) would not find the correct page.
+ * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+ * allows for returning a "normal" page from vm_normal_page() even
+ * though the PTE indicates that the "struct page" either does not exist
+ * or should not be touched: "special".
+ *
+ * Do not add new users: this really only works when a "normal" page
+ * was mapped, but then the PTE got changed to something weird (+
+ * marked special) that would not make pte_pfn() identify the originally
+ * inserted page.
*/
- struct page *(*find_special_page)(struct vm_area_struct *vma,
- unsigned long addr);
+ struct page *(*find_normal_page)(struct vm_area_struct *vma,
+ unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
};
#ifdef CONFIG_NUMA_BALANCING
@@ -684,7 +719,7 @@ static inline void release_fault_lock(struct vm_fault *vmf)
mmap_read_unlock(vmf->vma->vm_mm);
}
-static inline void assert_fault_locked(struct vm_fault *vmf)
+static inline void assert_fault_locked(const struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
vma_assert_locked(vmf->vma);
@@ -697,12 +732,42 @@ static inline void release_fault_lock(struct vm_fault *vmf)
mmap_read_unlock(vmf->vma->vm_mm);
}
-static inline void assert_fault_locked(struct vm_fault *vmf)
+static inline void assert_fault_locked(const struct vm_fault *vmf)
{
mmap_assert_locked(vmf->vma->vm_mm);
}
#endif /* CONFIG_PER_VMA_LOCK */
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+ return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
+{
+ return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm)
+{
+ return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_set(int flag, struct mm_struct *mm)
+{
+ set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_clear(int flag, struct mm_struct *mm)
+{
+ clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+static inline void mm_flags_clear_all(struct mm_struct *mm)
+{
+ bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS);
+}
+
extern const struct vm_operations_struct vma_dummy_vm_ops;
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
@@ -810,7 +875,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
vma->vm_end >= vma->vm_mm->start_stack;
}
-static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -824,7 +889,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
return false;
}
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
+static inline bool vma_is_foreign(const struct vm_area_struct *vma)
{
if (!current->mm)
return true;
@@ -835,7 +900,7 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma)
return false;
}
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
+static inline bool vma_is_accessible(const struct vm_area_struct *vma)
{
return vma->vm_flags & VM_ACCESS_FLAGS;
}
@@ -846,7 +911,7 @@ static inline bool is_shared_maywrite(vm_flags_t vm_flags)
(VM_SHARED | VM_MAYWRITE);
}
-static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
{
return is_shared_maywrite(vma->vm_flags);
}
@@ -930,14 +995,14 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
* The vma_is_shmem is not inline because it is used only by slow
* paths in userfault.
*/
-bool vma_is_shmem(struct vm_area_struct *vma);
-bool vma_is_anon_shmem(struct vm_area_struct *vma);
+bool vma_is_shmem(const struct vm_area_struct *vma);
+bool vma_is_anon_shmem(const struct vm_area_struct *vma);
#else
-static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
-static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; }
#endif
-int vma_is_stack_for_current(struct vm_area_struct *vma);
+int vma_is_stack_for_current(const struct vm_area_struct *vma);
/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
@@ -953,12 +1018,12 @@ static inline unsigned int folio_large_order(const struct folio *folio)
}
#ifdef NR_PAGES_IN_LARGE_FOLIO
-static inline long folio_large_nr_pages(const struct folio *folio)
+static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
return folio->_nr_pages;
}
#else
-static inline long folio_large_nr_pages(const struct folio *folio)
+static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
return 1L << folio_large_order(folio);
}
@@ -971,11 +1036,11 @@ static inline long folio_large_nr_pages(const struct folio *folio)
* set before the order is initialised, or this may be a tail page.
* See compaction.c for some good examples.
*/
-static inline unsigned int compound_order(struct page *page)
+static inline unsigned int compound_order(const struct page *page)
{
- struct folio *folio = (struct folio *)page;
+ const struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 0;
return folio_large_order(folio);
}
@@ -1191,7 +1256,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
/* Returns the number of bytes in this potentially compound page. */
-static inline unsigned long page_size(struct page *page)
+static inline unsigned long page_size(const struct page *page)
{
return PAGE_SIZE << compound_order(page);
}
@@ -1505,21 +1570,26 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags)
*/
static inline int page_zone_id(struct page *page)
{
- return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
+ return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
-int page_to_nid(const struct page *page);
+int memdesc_nid(memdesc_flags_t mdf);
#else
-static inline int page_to_nid(const struct page *page)
+static inline int memdesc_nid(memdesc_flags_t mdf)
{
- return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
+ return (mdf.f >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
+static inline int page_to_nid(const struct page *page)
+{
+ return memdesc_nid(PF_POISONED_CHECK(page)->flags);
+}
+
static inline int folio_nid(const struct folio *folio)
{
- return page_to_nid(&folio->page);
+ return memdesc_nid(folio->flags);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -1588,14 +1658,14 @@ static inline void page_cpupid_reset_last(struct page *page)
#else
static inline int folio_last_cpupid(struct folio *folio)
{
- return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
- page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
+ page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
@@ -1691,7 +1761,7 @@ static inline u8 page_kasan_tag(const struct page *page)
u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
- tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+ tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
tag ^= 0xff;
}
@@ -1706,12 +1776,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
return;
tag ^= 0xff;
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
do {
flags = old_flags;
flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
@@ -1742,26 +1812,26 @@ static inline pg_data_t *page_pgdat(const struct page *page)
return NODE_DATA(page_to_nid(page));
}
-static inline struct zone *folio_zone(const struct folio *folio)
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
- return page_zone(&folio->page);
+ return NODE_DATA(folio_nid(folio));
}
-static inline pg_data_t *folio_pgdat(const struct folio *folio)
+static inline struct zone *folio_zone(const struct folio *folio)
{
- return page_pgdat(&folio->page);
+ return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)];
}
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
- page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
- page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+ page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+ page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}
-static inline unsigned long page_to_section(const struct page *page)
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
- return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+ return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif
@@ -1785,7 +1855,7 @@ static inline struct folio *pfn_folio(unsigned long pfn)
}
#ifdef CONFIG_MMU
-static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
+static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot)
{
return pfn_pte(page_to_pfn(page), pgprot);
}
@@ -1800,7 +1870,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
*
* Return: A page table entry suitable for mapping this folio.
*/
-static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
+static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot)
{
return pfn_pte(folio_pfn(folio), pgprot);
}
@@ -1816,7 +1886,7 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
*
* Return: A page table entry suitable for mapping this folio.
*/
-static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
+static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot)
{
return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
}
@@ -1832,7 +1902,7 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
*
* Return: A page table entry suitable for mapping this folio.
*/
-static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot)
+static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot)
{
return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
}
@@ -1900,7 +1970,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
{
VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
- if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))
return false;
return folio_maybe_dma_pinned(folio);
@@ -1966,14 +2036,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
- page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
- page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+ page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT);
+ page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
- page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
- page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
+ page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT);
+ page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void set_page_links(struct page *page, enum zone_type zone,
@@ -1992,30 +2062,46 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*
* Return: A positive power of two.
*/
-static inline long folio_nr_pages(const struct folio *folio)
+static inline unsigned long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
return folio_large_nr_pages(folio);
}
-/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER)
+#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE)
+/*
+ * We don't expect any folios that exceed buddy sizes (and consequently
+ * memory sections).
+ */
+#define MAX_FOLIO_ORDER MAX_PAGE_ORDER
+#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/*
+ * Only pages within a single memory section are guaranteed to be
+ * contiguous. By limiting folios to a single memory section, all folio
+ * pages are guaranteed to be contiguous.
+ */
+#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
#else
-#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES
+/*
+ * There is no real limit on the folio size. We limit them to the maximum we
+ * currently expect (e.g., hugetlb, dax).
+ */
+#define MAX_FOLIO_ORDER PUD_ORDER
#endif
+#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER)
+
/*
* compound_nr() returns the number of pages in this potentially compound
* page. compound_nr() can be called on a tail page, and is defined to
* return 1 in that case.
*/
-static inline long compound_nr(struct page *page)
+static inline unsigned long compound_nr(const struct page *page)
{
- struct folio *folio = (struct folio *)page;
+ const struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 1;
return folio_large_nr_pages(folio);
}
@@ -2351,6 +2437,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
+struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t pud);
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
@@ -2529,7 +2617,7 @@ void folio_add_pin(struct folio *folio);
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
- struct task_struct *task, bool bypass_rlim);
+ const struct task_struct *task, bool bypass_rlim);
struct kvec;
struct page *get_dump_page(unsigned long addr, int *locked);
@@ -2654,7 +2742,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm)
unsigned long _rss = get_mm_rss(mm);
if (data_race(mm->hiwater_rss) < _rss)
- (mm)->hiwater_rss = _rss;
+ data_race(mm->hiwater_rss = _rss);
}
static inline void update_hiwater_vm(struct mm_struct *mm)
@@ -2846,16 +2934,22 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU */
+enum pt_flags {
+ PT_reserved = PG_reserved,
+ /* High bits are used for zone/node/section */
+};
+
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
return page_ptdesc(virt_to_page(x));
}
-static inline void *ptdesc_to_virt(const struct ptdesc *pt)
-{
- return page_to_virt(ptdesc_page(pt));
-}
-
+/**
+ * ptdesc_address - Virtual address of page table.
+ * @pt: Page table descriptor.
+ *
+ * Return: The first byte of the page table described by @pt.
+ */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
return folio_address(ptdesc_folio(pt));
@@ -2863,7 +2957,7 @@ static inline void *ptdesc_address(const struct ptdesc *pt)
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
- return folio_test_reserved(ptdesc_folio(pt));
+ return test_bit(PT_reserved, &pt->pt_flags.f);
}
/**
@@ -2973,21 +3067,26 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
+static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc)
+{
+ return compound_nr(ptdesc_page(ptdesc));
+}
+
static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __SetPageTable(ptdesc_page(ptdesc));
+ mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc));
}
static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));
ptlock_free(ptdesc);
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+ __ClearPageTable(ptdesc_page(ptdesc));
+ mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc));
}
static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
@@ -3292,7 +3391,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
/* mmap.c */
-extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
+extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3432,7 +3531,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
return mtree_load(&mm->mm_mt, addr);
}
-static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_GROWSDOWN)
return stack_guard_gap;
@@ -3444,7 +3543,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
return 0;
}
-static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+static inline unsigned long vm_start_gap(const struct vm_area_struct *vma)
{
unsigned long gap = stack_guard_start_gap(vma);
unsigned long vm_start = vma->vm_start;
@@ -3455,7 +3554,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
return vm_start;
}
-static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+static inline unsigned long vm_end_gap(const struct vm_area_struct *vma)
{
unsigned long vm_end = vma->vm_end;
@@ -3467,7 +3566,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
return vm_end;
}
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
+static inline unsigned long vma_pages(const struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
@@ -3484,7 +3583,7 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
return vma;
}
-static inline bool range_in_vma(struct vm_area_struct *vma,
+static inline bool range_in_vma(const struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
return (vma && vma->vm_start <= start && end <= vma->vm_end);
@@ -3600,7 +3699,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* Indicates whether GUP can follow a PROT_NONE mapped page, or whether
* a (NUMA hinting) fault is required.
*/
-static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
+static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma,
unsigned int flags)
{
/*
@@ -3730,7 +3829,7 @@ static inline bool debug_guardpage_enabled(void)
return static_branch_unlikely(&_debug_guardpage_enabled);
}
-static inline bool page_is_guard(struct page *page)
+static inline bool page_is_guard(const struct page *page)
{
if (!debug_guardpage_enabled())
return false;
@@ -3761,7 +3860,7 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
-static inline bool page_is_guard(struct page *page) { return false; }
+static inline bool page_is_guard(const struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -3784,7 +3883,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
-extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
+bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm);
void drop_slab(void);
@@ -3843,7 +3942,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
if (altmap)
@@ -3857,7 +3956,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
altmap->alloc -= nr_pfns;
}
#else
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
return 0;
}
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 89b518ff097e..d6c1011b38f2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -25,7 +25,7 @@
* 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
* ram or swap backed folio.
*/
-static inline int folio_is_file_lru(struct folio *folio)
+static inline int folio_is_file_lru(const struct folio *folio)
{
return !folio_test_swapbacked(folio);
}
@@ -84,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio)
* Return: The LRU list a folio should be on, as an index
* into the array of LRU lists.
*/
-static __always_inline enum lru_list folio_lru_list(struct folio *folio)
+static __always_inline enum lru_list folio_lru_list(const struct folio *folio)
{
enum lru_list lru;
@@ -141,9 +141,9 @@ static inline int lru_tier_from_refs(int refs, bool workingset)
return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
}
-static inline int folio_lru_refs(struct folio *folio)
+static inline int folio_lru_refs(const struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
if (!(flags & BIT(PG_referenced)))
return 0;
@@ -154,14 +154,14 @@ static inline int folio_lru_refs(struct folio *folio)
return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
}
-static inline int folio_lru_gen(struct folio *folio)
+static inline int folio_lru_gen(const struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
-static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen)
{
unsigned long max_seq = lruvec->lrugen.max_seq;
@@ -217,12 +217,13 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
-static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
+static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec,
+ const struct folio *folio,
bool reclaiming)
{
int gen;
int type = folio_is_file_lru(folio);
- struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ const struct lru_gen_folio *lrugen = &lruvec->lrugen;
/*
* +-----------------------------------+-----------------------------------+
@@ -268,7 +269,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+ set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
@@ -293,7 +294,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
/* for folio_migrate_flags() */
flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
- flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags);
gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
lru_gen_update_size(lruvec, folio, gen, -1);
@@ -302,11 +303,11 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return true;
}
-static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+static inline void folio_migrate_refs(struct folio *new, const struct folio *old)
{
- unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;
+ unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK;
- set_mask_bits(&new->flags, LRU_REFS_MASK, refs);
+ set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs);
}
#else /* !CONFIG_LRU_GEN */
@@ -330,7 +331,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return false;
}
-static inline void folio_migrate_refs(struct folio *new, struct folio *old)
+static inline void folio_migrate_refs(struct folio *new, const struct folio *old)
{
}
@@ -508,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm)
atomic_dec(&mm->tlb_flush_pending);
}
-static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+static inline bool mm_tlb_flush_pending(const struct mm_struct *mm)
{
/*
* Must be called after having acquired the PTL; orders against that
@@ -521,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending);
}
-static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+static inline bool mm_tlb_flush_nested(const struct mm_struct *mm)
{
/*
* Similar to mm_tlb_flush_pending(), we must have acquired the PTL
@@ -605,7 +606,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static inline bool vma_has_recency(struct vm_area_struct *vma)
+static inline bool vma_has_recency(const struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
return false;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f625c35128b..90e5790c318f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
+#include <linux/bitmap.h>
#include <asm/mmu.h>
@@ -33,6 +34,10 @@ struct address_space;
struct futex_private_hash;
struct mem_cgroup;
+typedef struct {
+ unsigned long f;
+} memdesc_flags_t;
+
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -71,7 +76,7 @@ struct mem_cgroup;
#endif
struct page {
- unsigned long flags; /* Atomic flags, some possibly
+ memdesc_flags_t flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
@@ -89,21 +94,10 @@ struct page {
union {
struct list_head lru;
- /* Or, for the Unevictable "LRU list" slot */
- struct {
- /* Always even, to negate PageTail */
- void *__filler;
- /* Count page's or folio's mlocks */
- unsigned int mlock_count;
- };
-
/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
- struct {
- struct llist_node pcp_llist;
- unsigned int order;
- };
+ struct llist_node pcp_llist;
};
struct address_space *mapping;
union {
@@ -114,7 +108,8 @@ struct page {
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if swapcache flag set.
- * Indicates order in the buddy system if PageBuddy.
+ * Indicates order in the buddy system if PageBuddy
+ * or on pcp_llist.
*/
unsigned long private;
};
@@ -382,11 +377,13 @@ struct folio {
union {
struct {
/* public: */
- unsigned long flags;
+ memdesc_flags_t flags;
union {
struct list_head lru;
/* private: avoid cluttering the output */
+ /* For the Unevictable "LRU list" slot */
struct {
+ /* Avoid compound_head */
void *__filler;
/* public: */
unsigned int mlock_count;
@@ -525,7 +522,7 @@ FOLIO_MATCH(compound_head, _head_3);
/**
* struct ptdesc - Memory descriptor for page tables.
- * @__page_flags: Same as page flags. Powerpc only.
+ * @pt_flags: enum pt_flags plus zone/node/section.
* @pt_rcu_head: For freeing page table pages.
* @pt_list: List of used page tables. Used for s390 gmap shadow pages
* (which are not linked into the user page tables) and x86
@@ -547,7 +544,7 @@ FOLIO_MATCH(compound_head, _head_3);
* understanding of the issues.
*/
struct ptdesc {
- unsigned long __page_flags;
+ memdesc_flags_t pt_flags;
union {
struct rcu_head pt_rcu_head;
@@ -585,7 +582,7 @@ struct ptdesc {
#define TABLE_MATCH(pg, pt) \
static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
-TABLE_MATCH(flags, __page_flags);
+TABLE_MATCH(flags, pt_flags);
TABLE_MATCH(compound_head, pt_list);
TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(mapping, __page_mapping);
@@ -627,7 +624,7 @@ static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
atomic_dec(&ptdesc->pt_share_count);
}
-static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc)
{
return atomic_read(&ptdesc->pt_share_count);
}
@@ -660,7 +657,7 @@ static inline void set_page_private(struct page *page, unsigned long private)
page->private = private;
}
-static inline void *folio_get_private(struct folio *folio)
+static inline void *folio_get_private(const struct folio *folio)
{
return folio->private;
}
@@ -785,13 +782,14 @@ struct pfnmap_track_ctx {
*/
struct vm_area_desc {
/* Immutable state. */
- struct mm_struct *mm;
+ const struct mm_struct *const mm;
+ struct file *const file; /* May vary from vm_file in stacked callers. */
unsigned long start;
unsigned long end;
/* Mutable fields. Populated with initial state. */
pgoff_t pgoff;
- struct file *file;
+ struct file *vm_file;
vm_flags_t vm_flags;
pgprot_t page_prot;
@@ -932,6 +930,15 @@ struct mm_cid {
};
#endif
+/*
+ * Opaque type representing current mm_struct flag state. Must be accessed via
+ * mm_flags_xxx() helper functions.
+ */
+#define NUM_MM_FLAG_BITS (64)
+typedef struct {
+ DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
+} __private mm_flags_t;
+
struct kioctx_table;
struct iommu_mm_data;
struct mm_struct {
@@ -1031,10 +1038,10 @@ struct mm_struct {
* counters
*/
/*
- * With some kernel config, the current mmap_lock's offset
- * inside 'mm_struct' is at 0x120, which is very optimal, as
+ * Typically the current mmap_lock's offset is 56 bytes from
+ * the last cacheline boundary, which is very optimal, as
* its two hot fields 'count' and 'owner' sit in 2 different
- * cachelines, and when mmap_lock is highly contended, both
+ * cachelines, and when mmap_lock is highly contended, both
* of the 2 fields will be accessed frequently, current layout
* will help to reduce cache bouncing.
*
@@ -1119,7 +1126,7 @@ struct mm_struct {
/* Architecture-specific MM context */
mm_context_t context;
- unsigned long flags; /* Must use atomic bitops to access */
+ mm_flags_t flags; /* Must use mm_flags_* hlpers to access */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
@@ -1229,6 +1236,40 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
+/* Set the first system word of mm flags, non-atomically. */
+static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
+
+ bitmap_copy(bitmap, &value, BITS_PER_LONG);
+}
+
+/* Obtain a read-only view of the bitmap. */
+static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm)
+{
+ return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags);
+}
+
+/* Read the first system word of mm flags, non-atomically. */
+static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm)
+{
+ const unsigned long *bitmap = __mm_flags_get_bitmap(mm);
+
+ return bitmap_read(bitmap, 0, BITS_PER_LONG);
+}
+
+/*
+ * Update the first system word of mm flags ONLY, applying the specified mask to
+ * it, then setting all flags specified by bits.
+ */
+static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm,
+ unsigned long mask, unsigned long bits)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
+
+ set_mask_bits(bitmap, mask, bits);
+}
+
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;
@@ -1729,7 +1770,7 @@ enum {
* the modes are SUID_DUMP_* defined in linux/sched/coredump.h
*/
#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -1744,13 +1785,13 @@ enum {
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS 9
#define MMF_DUMP_FILTER_MASK \
- (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+ ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
- ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+ (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \
+ BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
-# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
+# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF 0
#endif
@@ -1758,19 +1799,16 @@ enum {
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
+#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
-#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
+#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */
+#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \
+ BIT(MMF_DISABLE_THP_EXCEPT_ADVISED))
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
@@ -1783,27 +1821,33 @@ enum {
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_HAS_MDWE 28
-#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
-
+#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE)
#define MMF_HAS_MDWE_NO_INHERIT 29
#define MMF_VM_MERGE_ANY 30
-#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
+#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY)
#define MMF_TOPDOWN 31 /* mm searches top down by default */
-#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN)
+#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
-#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
-static inline unsigned long mmf_init_flags(unsigned long flags)
+/* Legacy flags must fit within 32 bits. */
+static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX);
+
+/*
+ * Initialise legacy flags according to masks, propagating selected flags on
+ * fork. Further flag manipulation can be performed by the caller.
+ */
+static inline unsigned long mmf_init_legacy_flags(unsigned long flags)
{
if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
flags &= ~((1UL << MMF_HAS_MDWE) |
(1UL << MMF_HAS_MDWE_NO_INHERIT));
- return flags & MMF_INIT_MASK;
+ return flags & MMF_INIT_LEGACY_MASK;
}
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index de9e8e6229a4..0ba8a7e8b90a 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
/* If MDWE is disabled, we have nothing to deny. */
- if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
return false;
/* If the new VMA is not executable, we have nothing to deny. */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 11a078de9150..2c9fffa58714 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -148,91 +148,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
}
/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- * False locked result is possible if mm_lock_seq overflows or if vma gets
- * reused and attached to a different mm before we lock it.
- * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
- * detached.
- *
- * WARNING! The vma passed to this function cannot be used if the function
- * fails to lock it because in certain cases RCU lock is dropped and then
- * reacquired. Once RCU lock is dropped the vma can be concurently freed.
- */
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
- int oldcnt;
-
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
- return NULL;
-
- /*
- * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
- * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
- * Acquire fence is required here to avoid reordering against later
- * vm_lock_seq check and checks inside lock_vma_under_rcu().
- */
- if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT))) {
- /* return EAGAIN if vma got detached from under us */
- return oldcnt ? NULL : ERR_PTR(-EAGAIN);
- }
-
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
-
- /*
- * If vma got attached to another mm from under us, that mm is not
- * stable and can be freed in the narrow window after vma->vm_refcnt
- * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
- * releasing vma->vm_refcnt.
- */
- if (unlikely(vma->vm_mm != mm)) {
- /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
- struct mm_struct *other_mm = vma->vm_mm;
-
- /*
- * __mmdrop() is a heavy operation and we don't need RCU
- * protection here. Release RCU lock during these operations.
- * We reinstate the RCU read lock as the caller expects it to
- * be held when this function returns even on error.
- */
- rcu_read_unlock();
- mmgrab(other_mm);
- vma_refcount_put(vma);
- mmdrop(other_mm);
- rcu_read_lock();
- return NULL;
- }
-
- /*
- * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
- vma_refcount_put(vma);
- return NULL;
- }
-
- return vma;
-}
-
-/*
* Use only while holding mmap read lock which guarantees that locking will not
* fail (nobody can concurrently write-lock the vma). vma_start_read() should
* not be used in such cases because it might fail due to mm_lock_seq overflow.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c5da9141983..7fb7331c5725 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -234,7 +234,21 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
- PGPROMOTE_CANDIDATE, /* candidate pages to promote */
+ /**
+ * Candidate pages for promotion based on hint fault latency. This
+ * counter is used to control the promotion rate and adjust the hot
+ * threshold.
+ */
+ PGPROMOTE_CANDIDATE,
+ /**
+ * Not rate-limited (NRL) candidate pages for those can be promoted
+ * without considering hot threshold because of enough free pages in
+ * fast-tier node. These promotions bypass the regular hotness checks
+ * and do NOT influence the promotion rate-limiter or
+ * threshold-adjustment logic.
+ * This is for statistics/monitoring purposes.
+ */
+ PGPROMOTE_CANDIDATE_NRL,
#endif
/* PGDEMOTE_*: pages demoted */
PGDEMOTE_KSWAPD,
@@ -245,6 +259,7 @@ enum node_stat_item {
NR_HUGETLB,
#endif
NR_BALLOON_PAGES,
+ NR_KERNEL_FILE_PAGES,
NR_VM_NODE_STAT_ITEMS
};
@@ -1089,7 +1104,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z)
return wmark_pages(z, WMARK_PROMO);
}
-static inline unsigned long zone_managed_pages(struct zone *zone)
+static inline unsigned long zone_managed_pages(const struct zone *zone)
{
return (unsigned long)atomic_long_read(&zone->managed_pages);
}
@@ -1113,12 +1128,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
}
-static inline bool zone_is_initialized(struct zone *zone)
+static inline bool zone_is_initialized(const struct zone *zone)
{
return zone->initialized;
}
-static inline bool zone_is_empty(struct zone *zone)
+static inline bool zone_is_empty(const struct zone *zone)
{
return zone->spanned_pages == 0;
}
@@ -1169,26 +1184,31 @@ static inline bool zone_is_empty(struct zone *zone)
#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
+static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags)
+{
+ ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT);
+ return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK;
+}
+
static inline enum zone_type page_zonenum(const struct page *page)
{
- ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
- return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+ return memdesc_zonenum(page->flags);
}
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
- return page_zonenum(&folio->page);
+ return memdesc_zonenum(folio->flags);
}
#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
- return page_zonenum(page) == ZONE_DEVICE;
+ return memdesc_zonenum(mdf) == ZONE_DEVICE;
}
static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
- VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page);
+ VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page);
return page_folio(page)->pgmap;
}
@@ -1203,9 +1223,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page)
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
const struct page *b)
{
- if (is_zone_device_page(a) != is_zone_device_page(b))
+ if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags))
return false;
- if (!is_zone_device_page(a))
+ if (!memdesc_is_zone_device(a->flags))
return true;
return page_pgmap(a) == page_pgmap(b);
}
@@ -1213,7 +1233,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
return false;
}
@@ -1228,9 +1248,14 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page)
}
#endif
+static inline bool is_zone_device_page(const struct page *page)
+{
+ return memdesc_is_zone_device(page->flags);
+}
+
static inline bool folio_is_zone_device(const struct folio *folio)
{
- return is_zone_device_page(&folio->page);
+ return memdesc_is_zone_device(folio->flags);
}
static inline bool is_zone_movable_page(const struct page *page)
@@ -1248,7 +1273,7 @@ static inline bool folio_is_zone_movable(const struct folio *folio)
* Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
* intersection with the given zone
*/
-static inline bool zone_intersects(struct zone *zone,
+static inline bool zone_intersects(const struct zone *zone,
unsigned long start_pfn, unsigned long nr_pages)
{
if (zone_is_empty(zone))
@@ -1415,7 +1440,7 @@ typedef struct pglist_data {
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
- int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+ atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
@@ -1556,12 +1581,12 @@ static inline int local_memory_node(int node_id) { return node_id; };
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
#ifdef CONFIG_ZONE_DEVICE
-static inline bool zone_is_zone_device(struct zone *zone)
+static inline bool zone_is_zone_device(const struct zone *zone)
{
return zone_idx(zone) == ZONE_DEVICE;
}
#else
-static inline bool zone_is_zone_device(struct zone *zone)
+static inline bool zone_is_zone_device(const struct zone *zone)
{
return false;
}
@@ -1573,19 +1598,19 @@ static inline bool zone_is_zone_device(struct zone *zone)
* populated_zone(). If the whole zone is reserved then we can easily
* end up with populated_zone() && !managed_zone().
*/
-static inline bool managed_zone(struct zone *zone)
+static inline bool managed_zone(const struct zone *zone)
{
return zone_managed_pages(zone);
}
/* Returns true if a zone has memory */
-static inline bool populated_zone(struct zone *zone)
+static inline bool populated_zone(const struct zone *zone)
{
return zone->present_pages;
}
#ifdef CONFIG_NUMA
-static inline int zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(const struct zone *zone)
{
return zone->node;
}
@@ -1595,7 +1620,7 @@ static inline void zone_set_nid(struct zone *zone, int nid)
zone->node = nid;
}
#else
-static inline int zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(const struct zone *zone)
{
return 0;
}
@@ -1622,7 +1647,7 @@ static inline int is_highmem_idx(enum zone_type idx)
* @zone: pointer to struct zone variable
* Return: 1 for a highmem zone, 0 otherwise
*/
-static inline int is_highmem(struct zone *zone)
+static inline int is_highmem(const struct zone *zone)
{
return is_highmem_idx(zone_idx(zone));
}
@@ -1688,12 +1713,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref)
return zoneref->zone;
}
-static inline int zonelist_zone_idx(struct zoneref *zoneref)
+static inline int zonelist_zone_idx(const struct zoneref *zoneref)
{
return zoneref->zone_idx;
}
-static inline int zonelist_node_idx(struct zoneref *zoneref)
+static inline int zonelist_node_idx(const struct zoneref *zoneref)
{
return zone_to_nid(zoneref->zone);
}
@@ -1996,7 +2021,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
return (struct page *)map;
}
-static inline int present_section(struct mem_section *section)
+static inline int present_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
}
@@ -2006,12 +2031,12 @@ static inline int present_section_nr(unsigned long nr)
return present_section(__nr_to_section(nr));
}
-static inline int valid_section(struct mem_section *section)
+static inline int valid_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
}
-static inline int early_section(struct mem_section *section)
+static inline int early_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_EARLY));
}
@@ -2021,27 +2046,27 @@ static inline int valid_section_nr(unsigned long nr)
return valid_section(__nr_to_section(nr));
}
-static inline int online_section(struct mem_section *section)
+static inline int online_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}
#ifdef CONFIG_ZONE_DEVICE
-static inline int online_device_section(struct mem_section *section)
+static inline int online_device_section(const struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
return section && ((section->section_mem_map & flags) == flags);
}
#else
-static inline int online_device_section(struct mem_section *section)
+static inline int online_device_section(const struct mem_section *section)
{
return 0;
}
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
-static inline int preinited_vmemmap_section(struct mem_section *section)
+static inline int preinited_vmemmap_section(const struct mem_section *section)
{
return (section &&
(section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT));
@@ -2051,7 +2076,7 @@ void sparse_vmemmap_init_nid_early(int nid);
void sparse_vmemmap_init_nid_late(int nid);
#else
-static inline int preinited_vmemmap_section(struct mem_section *section)
+static inline int preinited_vmemmap_section(const struct mem_section *section)
{
return 0;
}
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 3a25122d83e2..6907aedc4f74 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod)
__module_param_call("", name, &param_ops_##type, &var, perm, \
-1, KERNEL_PARAM_FL_UNSAFE)
+/**
+ * __core_param_cb - similar like core_param, with a set/get ops instead of type.
+ * @name: the name of the cmdline and sysfs parameter (often the same as var)
+ * @var: the variable
+ * @ops: the set & get operations for this parameter.
+ * @perm: visibility in sysfs
+ *
+ * Ideally this should be called 'core_param_cb', but the name has been
+ * used for module core parameter, so add the '__' prefix
+ */
+#define __core_param_cb(name, ops, arg, perm) \
+ __module_param_call("", name, ops, arg, perm, -1, 0)
+
#endif /* !MODULE */
/**
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 98c96d649bf9..72ee7d210a74 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -21,7 +21,7 @@
#include <linux/rolling_buffer.h>
enum netfs_sreq_ref_trace;
-typedef struct mempool_s mempool_t;
+typedef struct mempool mempool_t;
struct folio_queue;
/**
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 615a560d9edb..f3b13da78aac 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -103,7 +103,7 @@ struct nvmem_cell_info {
*
* Note: A default "nvmem<id>" name will be assigned to the device if
* no name is specified in its configuration. In such case "<id>" is
- * generated with ida_simple_get() and provided id field is ignored.
+ * generated with ida_alloc() and provided id field is ignored.
*
* Note: Specifying name and setting id to -1 implies a unique device
* whose name is provided as-is (kept unaltered).
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..7b02bc1d0a7e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
*/
static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
{
- if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+ if (unlikely(mm_flags_test(MMF_UNSTABLE, mm)))
return VM_FAULT_SIGBUS;
return 0;
}
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8d3fa3a91ce4..48e27768e7ba 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page
* cold cacheline in some cases.
*/
if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
- test_bit(PG_head, &page->flags)) {
+ test_bit(PG_head, &page->flags.f)) {
/*
* We can safely access the field of the @page[1] with PG_head
* because the @page is a compound page composed with at least
@@ -316,7 +316,7 @@ static __always_inline unsigned long _compound_head(const struct page *page)
* check that the page number lies within @folio; the caller is presumed
* to have a reference to the page.
*/
-#define folio_page(folio, n) nth_page(&(folio)->page, n)
+#define folio_page(folio, n) (&(folio)->page + (n))
static __always_inline int PageTail(const struct page *page)
{
@@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page)
static __always_inline int PageCompound(const struct page *page)
{
- return test_bit(PG_head, &page->flags) ||
+ return test_bit(PG_head, &page->flags.f) ||
READ_ONCE(page->compound_head) & 1;
}
#define PAGE_POISON_PATTERN -1l
static inline int PagePoisoned(const struct page *page)
{
- return READ_ONCE(page->flags) == PAGE_POISON_PATTERN;
+ return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN;
}
#ifdef CONFIG_DEBUG_VM
@@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio,
const struct page *page = &folio->page;
VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
- VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
- return &page[n].flags;
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
+ return &page[n].flags.f;
}
static unsigned long *folio_flags(struct folio *folio, unsigned n)
@@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
struct page *page = &folio->page;
VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
- VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
- return &page[n].flags;
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page);
+ return &page[n].flags.f;
}
/*
@@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page)
#define TESTPAGEFLAG(uname, lname, policy) \
FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
static __always_inline int Page##uname(const struct page *page) \
-{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
+{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); }
#define SETPAGEFLAG(uname, lname, policy) \
FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void SetPage##uname(struct page *page) \
-{ set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define CLEARPAGEFLAG(uname, lname, policy) \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void ClearPage##uname(struct page *page) \
-{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define __SETPAGEFLAG(uname, lname, policy) \
__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void __SetPage##uname(struct page *page) \
-{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void __ClearPage##uname(struct page *page) \
-{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define TESTSETFLAG(uname, lname, policy) \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestSetPage##uname(struct page *page) \
-{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define TESTCLEARFLAG(uname, lname, policy) \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestClearPage##uname(struct page *page) \
-{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
+{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); }
#define PAGEFLAG(uname, lname, policy) \
TESTPAGEFLAG(uname, lname, policy) \
@@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio)
static __always_inline int PageHead(const struct page *page)
{
PF_POISONED_CHECK(page);
- return test_bit(PG_head, &page->flags) && !page_is_fake_head(page);
+ return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page);
}
__SETPAGEFLAG(Head, head, PF_ANY)
@@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page)
*/
if (PageHuge(page))
page = compound_head(page);
- return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void SetPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
- __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
+ __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f);
}
#ifdef CONFIG_MMU
@@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
static inline int folio_has_private(const struct folio *folio)
{
- return !!(folio->flags & PAGE_FLAGS_PRIVATE);
+ return !!(folio->flags.f & PAGE_FLAGS_PRIVATE);
}
#undef PF_ANY
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 6a44be0f39f4..e046278a01fa 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -13,12 +13,11 @@
#include <linux/types.h>
-#define PB_migratetype_bits 3
/* Bit indices that affect a whole block of pages */
enum pageblock_bits {
- PB_migrate,
- PB_migrate_end = PB_migrate + PB_migratetype_bits - 1,
- /* 3 bits required for migrate types */
+ PB_migrate_0,
+ PB_migrate_1,
+ PB_migrate_2,
PB_compact_skip,/* If set the block is skipped by compaction */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -37,11 +36,10 @@ enum pageblock_bits {
#define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS))
-#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1)
+#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2))
#ifdef CONFIG_MEMORY_ISOLATION
-#define MIGRATETYPE_AND_ISO_MASK \
- (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate))
+#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate))
#else
#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK
#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 12a12dae727d..185644e288ea 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -140,7 +140,7 @@ static inline int inode_drain_writes(struct inode *inode)
return filemap_write_and_wait(inode->i_mapping);
}
-static inline bool mapping_empty(struct address_space *mapping)
+static inline bool mapping_empty(const struct address_space *mapping)
{
return xa_empty(&mapping->i_pages);
}
@@ -166,7 +166,7 @@ static inline bool mapping_empty(struct address_space *mapping)
* refcount and the referenced bit, which will be elevated or set in
* the process of adding new cache pages to an inode.
*/
-static inline bool mapping_shrinkable(struct address_space *mapping)
+static inline bool mapping_shrinkable(const struct address_space *mapping)
{
void *head;
@@ -211,6 +211,8 @@ enum mapping_flags {
folio contents */
AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
+ AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
+ account usage to user cgroups */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -265,7 +267,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping)
clear_bit(AS_UNEVICTABLE, &mapping->flags);
}
-static inline bool mapping_unevictable(struct address_space *mapping)
+static inline bool mapping_unevictable(const struct address_space *mapping)
{
return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}
@@ -275,7 +277,7 @@ static inline void mapping_set_exiting(struct address_space *mapping)
set_bit(AS_EXITING, &mapping->flags);
}
-static inline int mapping_exiting(struct address_space *mapping)
+static inline int mapping_exiting(const struct address_space *mapping)
{
return test_bit(AS_EXITING, &mapping->flags);
}
@@ -285,7 +287,7 @@ static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
-static inline int mapping_use_writeback_tags(struct address_space *mapping)
+static inline int mapping_use_writeback_tags(const struct address_space *mapping)
{
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
@@ -331,7 +333,7 @@ static inline void mapping_set_inaccessible(struct address_space *mapping)
set_bit(AS_INACCESSIBLE, &mapping->flags);
}
-static inline bool mapping_inaccessible(struct address_space *mapping)
+static inline bool mapping_inaccessible(const struct address_space *mapping)
{
return test_bit(AS_INACCESSIBLE, &mapping->flags);
}
@@ -341,18 +343,18 @@ static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_
set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}
-static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping)
+static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping)
{
return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}
-static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
+static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
{
return mapping->gfp_mask;
}
/* Restricts the given gfp_mask to what the mapping allows. */
-static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
+static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping,
gfp_t gfp_mask)
{
return mapping_gfp_mask(mapping) & gfp_mask;
@@ -475,11 +477,17 @@ mapping_min_folio_order(const struct address_space *mapping)
}
static inline unsigned long
-mapping_min_folio_nrpages(struct address_space *mapping)
+mapping_min_folio_nrpages(const struct address_space *mapping)
{
return 1UL << mapping_min_folio_order(mapping);
}
+static inline unsigned long
+mapping_min_folio_nrbytes(const struct address_space *mapping)
+{
+ return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT;
+}
+
/**
* mapping_align_index() - Align index for this mapping.
* @mapping: The address_space.
@@ -489,7 +497,7 @@ mapping_min_folio_nrpages(struct address_space *mapping)
* new folio to the page cache and need to know what index to give it,
* call this function.
*/
-static inline pgoff_t mapping_align_index(struct address_space *mapping,
+static inline pgoff_t mapping_align_index(const struct address_space *mapping,
pgoff_t index)
{
return round_down(index, mapping_min_folio_nrpages(mapping));
@@ -499,7 +507,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping,
* Large folio support currently depends on THP. These dependencies are
* being worked on but are not yet fixed.
*/
-static inline bool mapping_large_folio_support(struct address_space *mapping)
+static inline bool mapping_large_folio_support(const struct address_space *mapping)
{
/* AS_FOLIO_ORDER is only reasonable for pagecache folios */
VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON,
@@ -514,7 +522,7 @@ static inline size_t mapping_max_folio_size(const struct address_space *mapping)
return PAGE_SIZE << mapping_max_folio_order(mapping);
}
-static inline int filemap_nr_thps(struct address_space *mapping)
+static inline int filemap_nr_thps(const struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
return atomic_read(&mapping->nr_thps);
@@ -543,7 +551,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
#endif
}
-struct address_space *folio_mapping(struct folio *);
+struct address_space *folio_mapping(const struct folio *folio);
/**
* folio_flush_mapping - Find the file mapping this folio belongs to.
@@ -928,7 +936,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
*
* Return: The index of the folio which follows this folio in the file.
*/
-static inline pgoff_t folio_next_index(struct folio *folio)
+static inline pgoff_t folio_next_index(const struct folio *folio)
{
return folio->index + folio_nr_pages(folio);
}
@@ -957,7 +965,7 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
* e.g., shmem did not move this folio to the swap cache.
* Return: true or false.
*/
-static inline bool folio_contains(struct folio *folio, pgoff_t index)
+static inline bool folio_contains(const struct folio *folio, pgoff_t index)
{
VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
return index - folio->index < folio_nr_pages(folio);
@@ -1034,13 +1042,13 @@ static inline loff_t page_offset(struct page *page)
/*
* Get the offset in PAGE_SIZE (even for hugetlb folios).
*/
-static inline pgoff_t folio_pgoff(struct folio *folio)
+static inline pgoff_t folio_pgoff(const struct folio *folio)
{
return folio->index;
}
-static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
- unsigned long address)
+static inline pgoff_t linear_page_index(const struct vm_area_struct *vma,
+ const unsigned long address)
{
pgoff_t pgoff;
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
@@ -1460,7 +1468,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
* readahead_pos - The byte offset into the file of this readahead request.
* @rac: The readahead request.
*/
-static inline loff_t readahead_pos(struct readahead_control *rac)
+static inline loff_t readahead_pos(const struct readahead_control *rac)
{
return (loff_t)rac->_index * PAGE_SIZE;
}
@@ -1469,7 +1477,7 @@ static inline loff_t readahead_pos(struct readahead_control *rac)
* readahead_length - The number of bytes in this readahead request.
* @rac: The readahead request.
*/
-static inline size_t readahead_length(struct readahead_control *rac)
+static inline size_t readahead_length(const struct readahead_control *rac)
{
return rac->_nr_pages * PAGE_SIZE;
}
@@ -1478,7 +1486,7 @@ static inline size_t readahead_length(struct readahead_control *rac)
* readahead_index - The index of the first page in this readahead request.
* @rac: The readahead request.
*/
-static inline pgoff_t readahead_index(struct readahead_control *rac)
+static inline pgoff_t readahead_index(const struct readahead_control *rac)
{
return rac->_index;
}
@@ -1487,7 +1495,7 @@ static inline pgoff_t readahead_index(struct readahead_control *rac)
* readahead_count - The number of pages in this readahead request.
* @rac: The readahead request.
*/
-static inline unsigned int readahead_count(struct readahead_control *rac)
+static inline unsigned int readahead_count(const struct readahead_control *rac)
{
return rac->_nr_pages;
}
@@ -1496,12 +1504,12 @@ static inline unsigned int readahead_count(struct readahead_control *rac)
* readahead_batch_length - The number of bytes in the current batch.
* @rac: The readahead request.
*/
-static inline size_t readahead_batch_length(struct readahead_control *rac)
+static inline size_t readahead_batch_length(const struct readahead_control *rac)
{
return rac->_batch_count * PAGE_SIZE;
}
-static inline unsigned long dir_pages(struct inode *inode)
+static inline unsigned long dir_pages(const struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT;
@@ -1515,8 +1523,8 @@ static inline unsigned long dir_pages(struct inode *inode)
* Return: the number of bytes in the folio up to EOF,
* or -EFAULT if the folio was truncated.
*/
-static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
- struct inode *inode)
+static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio,
+ const struct inode *inode)
{
loff_t size = i_size_read(inode);
pgoff_t index = size >> PAGE_SHIFT;
@@ -1547,7 +1555,8 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
* Return: The number of filesystem blocks covered by this folio.
*/
static inline
-unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
+unsigned int i_blocks_per_folio(const struct inode *inode,
+ const struct folio *folio)
{
return folio_size(folio) >> inode->i_blkbits;
}
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 5d3a0cccc6bf..63be5a451627 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch)
fbatch->i = 0;
}
-static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
+static inline unsigned int folio_batch_count(const struct folio_batch *fbatch)
{
return fbatch->nr;
}
-static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
+static inline unsigned int folio_batch_space(const struct folio_batch *fbatch)
{
return PAGEVEC_SIZE - fbatch->nr;
}
diff --git a/include/linux/panic.h b/include/linux/panic.h
index 7be742628c25..6f972a66c13e 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -43,6 +43,12 @@ void abort(void);
extern atomic_t panic_cpu;
#define PANIC_CPU_INVALID -1
+bool panic_try_start(void);
+void panic_reset(void);
+bool panic_in_progress(void);
+bool panic_on_this_cpu(void);
+bool panic_on_other_cpu(void);
+
/*
* Only to be used by arch init code. If the user over-wrote the default
* CONFIG_PANIC_TIMEOUT, honor it.
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 8a7f4f802c57..38a82d65e58e 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref,
if (static_key_enabled(&mem_profiling_compressed)) {
pgalloc_tag_idx idx;
- idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask;
+ idx = (page->flags.f >> alloc_tag_ref_offs) &
+ alloc_tag_ref_mask;
idx_to_ref(idx, ref);
handle->page = page;
} else {
@@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code
idx = (unsigned long)ref_to_idx(ref);
idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs;
do {
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
flags = old_flags;
flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs);
flags |= idx;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
} else {
if (WARN_ON(!handle.ref || !ref))
return;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 25a7257052ff..32e8457ad535 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1975,6 +1975,32 @@ static inline bool arch_has_pfn_modify_check(void)
/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;
+enum pgtable_level {
+ PGTABLE_LEVEL_PTE = 0,
+ PGTABLE_LEVEL_PMD,
+ PGTABLE_LEVEL_PUD,
+ PGTABLE_LEVEL_P4D,
+ PGTABLE_LEVEL_PGD,
+};
+
+static inline const char *pgtable_level_to_str(enum pgtable_level level)
+{
+ switch (level) {
+ case PGTABLE_LEVEL_PTE:
+ return "pte";
+ case PGTABLE_LEVEL_PMD:
+ return "pmd";
+ case PGTABLE_LEVEL_PUD:
+ return "pud";
+ case PGTABLE_LEVEL_P4D:
+ return "p4d";
+ case PGTABLE_LEVEL_PGD:
+ return "pgd";
+ default:
+ return "unknown";
+ }
+}
+
#endif /* !__ASSEMBLY__ */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 5d22b803f51e..45c663124c9b 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -330,8 +330,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress)
#endif
-bool this_cpu_in_panic(void);
-
#ifdef CONFIG_SMP
extern int __printk_cpu_sync_try_get(void);
extern void __printk_cpu_sync_wait(void);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6cd020eea37a..daa92a58585d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -394,18 +394,8 @@ typedef int __bitwise rmap_t;
/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0))
-/*
- * Internally, we're using an enum to specify the granularity. We make the
- * compiler emit specialized code for each granularity.
- */
-enum rmap_level {
- RMAP_LEVEL_PTE = 0,
- RMAP_LEVEL_PMD,
- RMAP_LEVEL_PUD,
-};
-
-static inline void __folio_rmap_sanity_checks(const struct folio *folio,
- const struct page *page, int nr_pages, enum rmap_level level)
+static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
+ const struct page *page, int nr_pages, enum pgtable_level level)
{
/* hugetlb folios are handled separately. */
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
@@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
/*
* We don't support folios larger than a single PMD yet. So
- * when RMAP_LEVEL_PMD is set, we assume that we are creating
+ * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
* a single "entire" mapping of the folio.
*/
VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Assume that we are creating a single "entire" mapping of the
* folio.
@@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
break;
default:
- VM_WARN_ON_ONCE(true);
+ BUILD_BUG();
}
/*
@@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
const int orig_nr_pages = nr_pages;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
atomic_inc(&folio->_mapcount);
break;
@@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
}
folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
atomic_inc(&folio->_entire_mapcount);
folio_inc_large_mapcount(folio, dst_vma);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
struct page *page, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE);
#else
WARN_ON_ONCE(true);
#endif
@@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, enum rmap_level level)
+ struct vm_area_struct *src_vma, enum pgtable_level level)
{
const int orig_nr_pages = nr_pages;
bool maybe_pinned;
@@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
* copying if the folio maybe pinned.
*/
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (unlikely(maybe_pinned)) {
for (i = 0; i < nr_pages; i++)
if (PageAnonExclusive(page + i))
@@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
} while (page++, --nr_pages > 0);
folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (PageAnonExclusive(page)) {
if (unlikely(maybe_pinned))
return -EBUSY;
@@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
atomic_inc(&folio->_entire_mapcount);
folio_inc_large_mapcount(folio, dst_vma);
break;
+ default:
+ BUILD_BUG();
}
return 0;
}
@@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
struct vm_area_struct *src_vma)
{
return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
- src_vma, RMAP_LEVEL_PTE);
+ src_vma, PGTABLE_LEVEL_PTE);
}
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
@@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
struct vm_area_struct *src_vma)
{
return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
- src_vma, RMAP_LEVEL_PMD);
+ src_vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
@@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
}
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level)
+ struct page *page, int nr_pages, enum pgtable_level level)
{
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
@@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
struct page *page)
{
- return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+ return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
}
/**
@@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
@@ -928,6 +922,11 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION (1 << 1)
+/* Result flags */
+
+/* The page is mapped across page table boundary */
+#define PVMW_PGTABLE_CROSSED (1 << 16)
+
struct page_vma_mapped_walk {
unsigned long pfn;
unsigned long nr_pages;
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index fa9f1021541e..ede4c6bf6f22 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock)
return READ_ONCE(lock->owner) != NULL;
}
+#ifdef CONFIG_RT_MUTEXES
+#define RT_MUTEX_HAS_WAITERS 1UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+ return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
+}
+#endif
extern void rt_mutex_base_init(struct rt_mutex_base *rtb);
/**
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 6f8a4965f9b9..29f6ceb98d74 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
+ VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE));
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
@@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter,
*/
static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
{
- return nth_page(sg_page(piter->sg), piter->sg_pgoffset);
+ return sg_page(piter->sg) + piter->sg_pgoffset;
}
/**
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 6eb65ceed213..b7fafe999073 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,6 +8,20 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
+static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
+{
+ /*
+ * By convention, dumpable bits are contained in first 32 bits of the
+ * bitmap, so we can simply access this first unsigned long directly.
+ */
+ return __mm_flags_get_word(mm);
+}
+
+static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
+{
+ __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
+}
+
extern void set_dumpable(struct mm_struct *mm, int value);
/*
* This returns the actual value of the suid_dumpable flag. For things
@@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags)
static inline int get_dumpable(struct mm_struct *mm)
{
- return __get_dumpable(mm->flags);
+ unsigned long flags = __mm_flags_get_dumpable(mm);
+
+ return __get_dumpable(flags);
}
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2201da0afecc..0232d983b715 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -178,7 +178,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
#endif
extern void arch_pick_mmap_layout(struct mm_struct *mm,
- struct rlimit *rlim_stack);
+ const struct rlimit *rlim_stack);
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -211,7 +211,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long flags, vm_flags_t vm_flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
- struct rlimit *rlim_stack) {}
+ const struct rlimit *rlim_stack) {}
#endif
static inline bool in_vfork(struct task_struct *tsk)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 34d6a0e108c3..525aa2a632b2 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
* pins the final release of task.io_context. Also protects ->cpuset and
* ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
*
- * Nests both inside and outside of read_lock(&tasklist_lock).
- * It must not be nested with write_lock_irq(&tasklist_lock),
- * neither inside nor outside.
+ * Nests inside of read_lock(&tasklist_lock). It must not be nested with
+ * write_lock_irq(&tasklist_lock), neither inside nor outside.
*/
static inline void task_lock(struct task_struct *p)
{
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6d0f9c599ff7..0e47465ef0fd 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -99,9 +99,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
#ifdef CONFIG_SHMEM
-bool shmem_mapping(struct address_space *mapping);
+bool shmem_mapping(const struct address_space *mapping);
#else
-static inline bool shmem_mapping(struct address_space *mapping)
+static inline bool shmem_mapping(const struct address_space *mapping)
{
return false;
}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d5a8ab98035c..cf443f064a66 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -335,6 +335,37 @@ struct kmem_cache_args {
* %NULL means no constructor.
*/
void (*ctor)(void *);
+ /**
+ * @sheaf_capacity: Enable sheaves of given capacity for the cache.
+ *
+ * With a non-zero value, allocations from the cache go through caching
+ * arrays called sheaves. Each cpu has a main sheaf that's always
+ * present, and a spare sheaf that may be not present. When both become
+ * empty, there's an attempt to replace an empty sheaf with a full sheaf
+ * from the per-node barn.
+ *
+ * When no full sheaf is available, and gfp flags allow blocking, a
+ * sheaf is allocated and filled from slab(s) using bulk allocation.
+ * Otherwise the allocation falls back to the normal operation
+ * allocating a single object from a slab.
+ *
+ * Analogically when freeing and both percpu sheaves are full, the barn
+ * may replace it with an empty sheaf, unless it's over capacity. In
+ * that case a sheaf is bulk freed to slab pages.
+ *
+ * The sheaves do not enforce NUMA placement of objects, so allocations
+ * via kmem_cache_alloc_node() with a node specified other than
+ * NUMA_NO_NODE will bypass them.
+ *
+ * Bulk allocation and free operations also try to use the cpu sheaves
+ * and barn, but fallback to using slab pages directly.
+ *
+ * When slub_debug is enabled for the cache, the sheaf_capacity argument
+ * is ignored.
+ *
+ * %0 means no sheaves will be created.
+ */
+ unsigned int sheaf_capacity;
};
struct kmem_cache *__kmem_cache_create_args(const char *name,
@@ -465,11 +496,16 @@ int kmem_cache_shrink(struct kmem_cache *s);
/*
* Common kmalloc functions provided by all allocators
*/
-void * __must_check krealloc_noprof(const void *objp, size_t new_size,
- gfp_t flags) __realloc_size(2);
-#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__))
+void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size,
+ unsigned long align,
+ gfp_t flags, int nid) __realloc_size(2);
+#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE)
+#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__))
+#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n)
+#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE)
void kfree(const void *objp);
+void kfree_nolock(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);
@@ -798,6 +834,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
int node) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))
+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size);
+
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf **sheafp, unsigned int size);
+
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf *sheaf);
+
+void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp,
+ struct slab_sheaf *sheaf) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_from_sheaf(...) \
+ alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__))
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf);
+
/*
* These macros allow declaring a kmem_buckets * parameter alongside size, which
* can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
@@ -910,6 +962,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f
}
#define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__))
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
+#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))
+
#define kmem_buckets_alloc(_b, _size, _flags) \
alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
@@ -1041,18 +1096,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1);
-#define kvmalloc_node_noprof(size, flags, node) \
- __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node)
-#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))
-
-#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE)
-#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node) __alloc_size(1);
+#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \
+ __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node)
+#define kvmalloc_node_align(...) \
+ alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__))
+#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n)
+#define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE)
#define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO)
#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
+
#define kmem_buckets_valloc(_b, _size, _flags) \
- alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
+ alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE))
static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
@@ -1062,7 +1119,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- return kvmalloc_node_noprof(bytes, flags, node);
+ return kvmalloc_node_align_noprof(bytes, 1, flags, node);
}
#define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
@@ -1073,9 +1130,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
#define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__))
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
- __realloc_size(2);
-#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid) __realloc_size(2);
+#define kvrealloc_node_align(...) \
+ alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__))
+#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n)
+#define kvrealloc(...) kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE)
extern void kvfree(const void *addr);
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7012a0f758d8..e818fbade1e2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -236,40 +236,6 @@ enum {
#define SWAP_CONT_MAX 0x7f /* Max count */
/*
- * We use this to track usage of a cluster. A cluster is a block of swap disk
- * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
- * free clusters are organized into a list. We fetch an entry from the list to
- * get a free cluster.
- *
- * The flags field determines if a cluster is free. This is
- * protected by cluster lock.
- */
-struct swap_cluster_info {
- spinlock_t lock; /*
- * Protect swap_cluster_info fields
- * other than list, and swap_info_struct->swap_map
- * elements corresponding to the swap cluster.
- */
- u16 count;
- u8 flags;
- u8 order;
- struct list_head list;
-};
-
-/* All on-list cluster must have a non-zero flag. */
-enum swap_cluster_flags {
- CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
- CLUSTER_FLAG_FREE,
- CLUSTER_FLAG_NONFULL,
- CLUSTER_FLAG_FRAG,
- /* Clusters with flags above are allocatable */
- CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
- CLUSTER_FLAG_FULL,
- CLUSTER_FLAG_DISCARD,
- CLUSTER_FLAG_MAX,
-};
-
-/*
* The first page in the swap file is the swap header, which is always marked
* bad to prevent it from being allocated as an entry. This also prevents the
* cluster to which it belongs being marked free. Therefore 0 is safe to use as
@@ -310,7 +276,6 @@ struct swap_info_struct {
/* list of cluster that contains at least one free slot */
struct list_head frag_clusters[SWAP_NR_ORDERS];
/* list of cluster that are fragmented or contented */
- atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
@@ -321,11 +286,8 @@ struct swap_info_struct {
struct completion comp; /* seldom referenced */
spinlock_t lock; /*
* protect map scan related fields like
- * swap_map, lowest_bit, highest_bit,
- * inuse_pages, cluster_next,
- * cluster_nr, lowest_alloc,
- * highest_alloc, free/discard cluster
- * list. other fields are only changed
+ * swap_map, inuse_pages and all cluster
+ * lists. other fields are only changed
* at swapon/swapoff, so are protected
* by swap_lock. changing flags need
* hold this lock and swap_lock. If
@@ -517,10 +479,7 @@ extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
-struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
-extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
-extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);
@@ -530,11 +489,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
}
#else /* CONFIG_SWAP */
-static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
-{
- return NULL;
-}
-
static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
return NULL;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..92f80b4d69a6 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -2,6 +2,8 @@
#ifndef VM_EVENT_ITEM_H_INCLUDED
#define VM_EVENT_ITEM_H_INCLUDED
+#include <linux/thread_info.h>
+
#ifdef CONFIG_ZONE_DMA
#define DMA_ZONE(xx) xx##_DMA,
#else
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 2759dac6be44..eb54b7b3202f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1
extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__))
-void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
- __realloc_size(2);
-#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
+void *__must_check vrealloc_node_align_noprof(const void *p, size_t size,
+ unsigned long align, gfp_t flags, int nid) __realloc_size(2);
+#define vrealloc_node_noprof(_p, _s, _f, _nid) \
+ vrealloc_node_align_noprof(_p, _s, 1, _f, _nid)
+#define vrealloc_noprof(_p, _s, _f) \
+ vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE)
+#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__))
+#define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__))
+#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__))
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 15a4bc4ab819..22dd4adc5667 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -362,12 +362,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb);
struct folio *writeback_iter(struct address_space *mapping,
struct writeback_control *wbc, struct folio *folio, int *error);
-typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
- void *data);
-
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
deleted file mode 100644
index 369ef068fad8..000000000000
--- a/include/linux/zpool.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * zpool memory storage api
- *
- * Copyright (C) 2014 Dan Streetman
- *
- * This is a common frontend for the zswap compressed memory storage
- * implementations.
- */
-
-#ifndef _ZPOOL_H_
-#define _ZPOOL_H_
-
-struct zpool;
-
-bool zpool_has_pool(char *type);
-
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp);
-
-const char *zpool_get_type(struct zpool *pool);
-
-void zpool_destroy_pool(struct zpool *pool);
-
-int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
- unsigned long *handle, const int nid);
-
-void zpool_free(struct zpool *pool, unsigned long handle);
-
-void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
- void *local_copy);
-
-void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
- void *handle_mem);
-
-void zpool_obj_write(struct zpool *zpool, unsigned long handle,
- void *handle_mem, size_t mem_len);
-
-u64 zpool_get_total_pages(struct zpool *pool);
-
-
-/**
- * struct zpool_driver - driver implementation for zpool
- * @type: name of the driver.
- * @list: entry in the list of zpool drivers.
- * @create: create a new pool.
- * @destroy: destroy a pool.
- * @malloc: allocate mem from a pool.
- * @free: free mem from a pool.
- * @sleep_mapped: whether zpool driver can sleep during map.
- * @map: map a handle.
- * @unmap: unmap a handle.
- * @total_size: get total size of a pool.
- *
- * This is created by a zpool implementation and registered
- * with zpool.
- */
-struct zpool_driver {
- char *type;
- struct module *owner;
- atomic_t refcount;
- struct list_head list;
-
- void *(*create)(const char *name, gfp_t gfp);
- void (*destroy)(void *pool);
-
- int (*malloc)(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle, const int nid);
- void (*free)(void *pool, unsigned long handle);
-
- void *(*obj_read_begin)(void *pool, unsigned long handle,
- void *local_copy);
- void (*obj_read_end)(void *pool, unsigned long handle,
- void *handle_mem);
- void (*obj_write)(void *pool, unsigned long handle,
- void *handle_mem, size_t mem_len);
-
- u64 (*total_pages)(void *pool);
-};
-
-void zpool_register_driver(struct zpool_driver *driver);
-
-int zpool_unregister_driver(struct zpool_driver *driver);
-
-bool zpool_can_sleep_mapped(struct zpool *pool);
-
-#endif
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 383c09f583ac..37195edf2498 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -38,25 +38,32 @@ TRACE_EVENT(cma_release,
TRACE_EVENT(cma_alloc_start,
- TP_PROTO(const char *name, unsigned long count, unsigned int align),
+ TP_PROTO(const char *name, unsigned long request_count, unsigned long available_count,
+ unsigned long total_count, unsigned int align),
- TP_ARGS(name, count, align),
+ TP_ARGS(name, request_count, available_count, total_count, align),
TP_STRUCT__entry(
__string(name, name)
- __field(unsigned long, count)
+ __field(unsigned long, request_count)
+ __field(unsigned long, available_count)
+ __field(unsigned long, total_count)
__field(unsigned int, align)
),
TP_fast_assign(
__assign_str(name);
- __entry->count = count;
+ __entry->request_count = request_count;
+ __entry->available_count = available_count;
+ __entry->total_count = total_count;
__entry->align = align;
),
- TP_printk("name=%s count=%lu align=%u",
+ TP_printk("name=%s request_count=%lu available_count=%lu total_count=%lu align=%u",
__get_str(name),
- __entry->count,
+ __entry->request_count,
+ __entry->available_count,
+ __entry->total_count,
__entry->align)
);
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 2305df6cb485..dd94d14a2427 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -19,7 +19,6 @@
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \
- EM( SCAN_PAGE_RO, "no_writable_page") \
EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \
EM( SCAN_PAGE_NULL, "page_null") \
EM( SCAN_SCAN_ABORT, "scan_aborted") \
@@ -55,15 +54,14 @@ SCAN_STATUS
TRACE_EVENT(mm_khugepaged_scan_pmd,
- TP_PROTO(struct mm_struct *mm, struct folio *folio, bool writable,
+ TP_PROTO(struct mm_struct *mm, struct folio *folio,
int referenced, int none_or_zero, int status, int unmapped),
- TP_ARGS(mm, folio, writable, referenced, none_or_zero, status, unmapped),
+ TP_ARGS(mm, folio, referenced, none_or_zero, status, unmapped),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(unsigned long, pfn)
- __field(bool, writable)
__field(int, referenced)
__field(int, none_or_zero)
__field(int, status)
@@ -73,17 +71,15 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
TP_fast_assign(
__entry->mm = mm;
__entry->pfn = folio ? folio_pfn(folio) : -1;
- __entry->writable = writable;
__entry->referenced = referenced;
__entry->none_or_zero = none_or_zero;
__entry->status = status;
__entry->unmapped = unmapped;
),
- TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
+ TP_printk("mm=%p, scan_pfn=0x%lx, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
__entry->mm,
__entry->pfn,
- __entry->writable,
__entry->referenced,
__entry->none_or_zero,
__print_symbolic(__entry->status, SCAN_STATUS),
@@ -117,15 +113,14 @@ TRACE_EVENT(mm_collapse_huge_page,
TRACE_EVENT(mm_collapse_huge_page_isolate,
TP_PROTO(struct folio *folio, int none_or_zero,
- int referenced, bool writable, int status),
+ int referenced, int status),
- TP_ARGS(folio, none_or_zero, referenced, writable, status),
+ TP_ARGS(folio, none_or_zero, referenced, status),
TP_STRUCT__entry(
__field(unsigned long, pfn)
__field(int, none_or_zero)
__field(int, referenced)
- __field(bool, writable)
__field(int, status)
),
@@ -133,15 +128,13 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__entry->pfn = folio ? folio_pfn(folio) : -1;
__entry->none_or_zero = none_or_zero;
__entry->referenced = referenced;
- __entry->writable = writable;
__entry->status = status;
),
- TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s",
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
__entry->pfn,
__entry->none_or_zero,
__entry->referenced,
- __entry->writable,
__print_symbolic(__entry->status, SCAN_STATUS))
);
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 474358773abe..7f93e754da5c 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -22,6 +22,7 @@ TRACE_EVENT(kmem_cache_alloc,
TP_STRUCT__entry(
__field( unsigned long, call_site )
__field( const void *, ptr )
+ __string( name, s->name )
__field( size_t, bytes_req )
__field( size_t, bytes_alloc )
__field( unsigned long, gfp_flags )
@@ -32,6 +33,7 @@ TRACE_EVENT(kmem_cache_alloc,
TP_fast_assign(
__entry->call_site = call_site;
__entry->ptr = ptr;
+ __assign_str(name);
__entry->bytes_req = s->object_size;
__entry->bytes_alloc = s->size;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
@@ -41,9 +43,10 @@ TRACE_EVENT(kmem_cache_alloc,
(s->flags & SLAB_ACCOUNT)) : false;
),
- TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
+ TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
(void *)__entry->call_site,
__entry->ptr,
+ __get_str(name),
__entry->bytes_req,
__entry->bytes_alloc,
show_gfp_flags(__entry->gfp_flags),
diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h
index fe33a255b7d0..ea6b5c4baf3d 100644
--- a/include/trace/events/page_ref.h
+++ b/include/trace/events/page_ref.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->flags = page->flags;
+ __entry->flags = page->flags.f;
__entry->count = page_ref_count(page);
__entry->mapcount = atomic_read(&page->_mapcount);
__entry->mapping = page->mapping;
@@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->flags = page->flags;
+ __entry->flags = page->flags.f;
__entry->count = page_ref_count(page);
__entry->mapcount = atomic_read(&page->_mapcount);
__entry->mapping = page->mapping;
diff --git a/include/trace/events/readahead.h b/include/trace/events/readahead.h
new file mode 100644
index 000000000000..0997ac5eceab
--- /dev/null
+++ b/include/trace/events/readahead.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM readahead
+
+#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_READAHEAD_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+TRACE_EVENT(page_cache_ra_unbounded,
+ TP_PROTO(struct inode *inode, pgoff_t index, unsigned long nr_to_read,
+ unsigned long lookahead_size),
+
+ TP_ARGS(inode, index, nr_to_read, lookahead_size),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, i_ino)
+ __field(dev_t, s_dev)
+ __field(pgoff_t, index)
+ __field(unsigned long, nr_to_read)
+ __field(unsigned long, lookahead_size)
+ ),
+
+ TP_fast_assign(
+ __entry->i_ino = inode->i_ino;
+ __entry->s_dev = inode->i_sb->s_dev;
+ __entry->index = index;
+ __entry->nr_to_read = nr_to_read;
+ __entry->lookahead_size = lookahead_size;
+ ),
+
+ TP_printk(
+ "dev=%d:%d ino=%lx index=%lu nr_to_read=%lu lookahead_size=%lu",
+ MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino,
+ __entry->index, __entry->nr_to_read, __entry->lookahead_size
+ )
+);
+
+TRACE_EVENT(page_cache_ra_order,
+ TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra),
+
+ TP_ARGS(inode, index, ra),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, i_ino)
+ __field(dev_t, s_dev)
+ __field(pgoff_t, index)
+ __field(unsigned int, order)
+ __field(unsigned int, size)
+ __field(unsigned int, async_size)
+ __field(unsigned int, ra_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->i_ino = inode->i_ino;
+ __entry->s_dev = inode->i_sb->s_dev;
+ __entry->index = index;
+ __entry->order = ra->order;
+ __entry->size = ra->size;
+ __entry->async_size = ra->async_size;
+ __entry->ra_pages = ra->ra_pages;
+ ),
+
+ TP_printk(
+ "dev=%d:%d ino=%lx index=%lu order=%u size=%u async_size=%u ra_pages=%u",
+ MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino,
+ __entry->index, __entry->order, __entry->size,
+ __entry->async_size, __entry->ra_pages
+ )
+);
+
+DECLARE_EVENT_CLASS(page_cache_ra_op,
+ TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra,
+ unsigned long req_count),
+
+ TP_ARGS(inode, index, ra, req_count),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, i_ino)
+ __field(dev_t, s_dev)
+ __field(pgoff_t, index)
+ __field(unsigned int, order)
+ __field(unsigned int, size)
+ __field(unsigned int, async_size)
+ __field(unsigned int, ra_pages)
+ __field(unsigned int, mmap_miss)
+ __field(loff_t, prev_pos)
+ __field(unsigned long, req_count)
+ ),
+
+ TP_fast_assign(
+ __entry->i_ino = inode->i_ino;
+ __entry->s_dev = inode->i_sb->s_dev;
+ __entry->index = index;
+ __entry->order = ra->order;
+ __entry->size = ra->size;
+ __entry->async_size = ra->async_size;
+ __entry->ra_pages = ra->ra_pages;
+ __entry->mmap_miss = ra->mmap_miss;
+ __entry->prev_pos = ra->prev_pos;
+ __entry->req_count = req_count;
+ ),
+
+ TP_printk(
+ "dev=%d:%d ino=%lx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u mmap_miss=%u prev_pos=%lld",
+ MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino,
+ __entry->index, __entry->req_count, __entry->order,
+ __entry->size, __entry->async_size, __entry->ra_pages,
+ __entry->mmap_miss, __entry->prev_pos
+ )
+);
+
+DEFINE_EVENT(page_cache_ra_op, page_cache_sync_ra,
+ TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra,
+ unsigned long req_count),
+ TP_ARGS(inode, index, ra, req_count)
+);
+
+DEFINE_EVENT(page_cache_ra_op, page_cache_async_ra,
+ TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra,
+ unsigned long req_count),
+ TP_ARGS(inode, index, ra, req_count)
+);
+
+#endif /* _TRACE_FILEMAP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 8958ebfcff94..55749cb0b81d 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -22,12 +22,16 @@
* KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image.
* KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd
* fd field.
+ * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new
+ * kernel on x86. This is already the default behavior on
+ * some other architectures, like ARM64 and PowerPC.
*/
#define KEXEC_FILE_UNLOAD 0x00000001
#define KEXEC_FILE_ON_CRASH 0x00000002
#define KEXEC_FILE_NO_INITRAMFS 0x00000004
#define KEXEC_FILE_DEBUG 0x00000008
#define KEXEC_FILE_NO_CMA 0x00000010
+#define KEXEC_FILE_FORCE_DTB 0x00000020
/* These values match the ELF architecture values.
* Unless there is a good reason that should continue to be the case.
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 1f9bb10d1a47..8fbbe613611a 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -66,10 +66,16 @@ enum {
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI. New bits are OK, but existing bits can never change.
+ * Enabling zone reclaim means the page allocator will attempt to fulfill
+ * the allocation request on the current node by triggering reclaim and
+ * trying to shrink the current node.
+ * Fallback allocations on the next candidates in the zonelist are considered
+ * when reclaim fails to free up enough memory in the current node/zone.
+ *
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl.
+ * New bits are OK, but existing bits should not be changed.
*/
-#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index ed3aed264aeb..51c4e8c82b1e 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -177,7 +177,17 @@ struct prctl_mm_map {
#define PR_GET_TID_ADDRESS 40
+/*
+ * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0
+ * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively
+ * return "1" when no flags were specified for PR_SET_THP_DISABLE.
+ */
#define PR_SET_THP_DISABLE 41
+/*
+ * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE /
+ * VM_HUGEPAGE, MADV_COLLAPSE).
+ */
+# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
#define PR_GET_THP_DISABLE 42
/*