Diffstat (limited to 'include')
70 files changed, 1132 insertions, 712 deletions
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 74d0077cc5fa..efa6610acbc7 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -53,7 +53,7 @@ static inline int pfn_valid(unsigned long pfn) */ #define __page_to_pfn(pg) \ ({ const struct page *__pg = (pg); \ - int __sec = page_to_section(__pg); \ + int __sec = memdesc_section(__pg->flags); \ (unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \ }) diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index 15ab743f68c8..83d14376ff2b 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -159,7 +159,7 @@ static inline void scatterwalk_map(struct scatter_walk *walk) if (IS_ENABLED(CONFIG_HIGHMEM)) { struct page *page; - page = nth_page(base_page, offset >> PAGE_SHIFT); + page = base_page + (offset >> PAGE_SHIFT); offset = offset_in_page(offset); addr = kmap_local_page(page) + offset; } else { @@ -259,7 +259,7 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk, end += (offset_in_page(offset) + offset_in_page(nbytes) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = start; i < end; i++) - flush_dcache_page(nth_page(base_page, i)); + flush_dcache_page(base_page + i); } scatterwalk_advance(walk, nbytes); } diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 9ef2633e2c08..d40ac39bfbe8 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -221,6 +221,16 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) ref->ct = NULL; } +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) +{ + tag->ct.flags |= CODETAG_FLAG_INACCURATE; +} + +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) +{ + return !!(tag->ct.flags & CODETAG_FLAG_INACCURATE); +} + #define alloc_tag_record(p) ((p) = current->alloc_tag) #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -230,6 +240,8 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) {} static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) {} +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) { return false; } #define alloc_tag_record(p) do {} while (0) #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 066e5309bd45..dad5cb5b3812 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -199,7 +199,7 @@ struct gendisk { unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; - struct mempool_s *zone_wplugs_pool; + struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 1af241525a17..f6e0795db484 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len) { - void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN); + void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE); if (!p) return ERR_PTR(-ENOMEM); diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0a80e1f9aa20..3fc0efa0825b 100644 --- a/include/linux/bvec.h +++ 
b/include/linux/bvec.h @@ -22,11 +22,8 @@ struct page; * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * - * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: - * - * nth_page(@bv_page, n) == @bv_page + n - * - * This holds because page_is_mergeable() checks the above property. + * All pages within a bio_vec starting from @bv_page are contiguous and + * can simply be iterated (see bvec_advance()). */ struct bio_vec { struct page *bv_page; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 457ed8fd3214..8ea2a5f7c98a 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -16,13 +16,16 @@ struct module; #define CODETAG_SECTION_START_PREFIX "__start_" #define CODETAG_SECTION_STOP_PREFIX "__stop_" +/* codetag flags */ +#define CODETAG_FLAG_INACCURATE (1 << 0) + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as * an array of these. */ struct codetag { - unsigned int flags; /* used in later patches */ + unsigned int flags; unsigned int lineno; const char *modname; const char *function; diff --git a/include/linux/damon.h b/include/linux/damon.h index 9e62b2a85538..cae8c613c5fc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -110,7 +110,7 @@ struct damon_target { * * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. - * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_PAGEOUT: Reclaim the region. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. @@ -121,10 +121,10 @@ struct damon_target { * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * * The support of each action is up to running &struct damon_operations. - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except - * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR - * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum - * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. + * Refer to 'Operation Action' section of Documentation/mm/damon/design.rst for + * status of the supports. + * + * Note that DAMOS_PAGEOUT doesn't trigger demotions. */ enum damos_action { DAMOS_WILLNEED, @@ -748,7 +748,8 @@ struct damon_attrs { * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. - * + * @addr_unit: Scale factor for core to ops address conversion. + * @min_sz_region: Minimum region size. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. 
*/ @@ -790,6 +791,8 @@ struct damon_ctx { struct mutex kdamond_lock; struct damon_operations ops; + unsigned long addr_unit; + unsigned long min_sz_region; struct list_head adaptive_targets; struct list_head schemes; @@ -878,7 +881,7 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges); + unsigned int nr_ranges, unsigned long min_sz_region); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); @@ -935,6 +938,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs } +bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); diff --git a/include/linux/freezer.h b/include/linux/freezer.h index b303472255be..32884c9721e5 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -47,6 +47,7 @@ extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); +extern void thaw_process(struct task_struct *p); static inline bool try_to_freeze(void) { @@ -80,6 +81,7 @@ static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} static inline void thaw_kernel_threads(void) {} +static inline void thaw_process(struct task_struct *p) {} static inline bool try_to_freeze(void) { return false; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e9d7c757efe..75fb216b0f7a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -537,7 +537,7 @@ struct address_space { /* * Returns true if any of the pages in the mapping are marked with the tag. */ -static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) +static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } @@ -585,7 +585,7 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping) /* * Might pages of this file be mapped into userspace? */ -static inline int mapping_mapped(struct address_space *mapping) +static inline int mapping_mapped(const struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } @@ -599,7 +599,7 @@ static inline int mapping_mapped(struct address_space *mapping) * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. 
*/ -static inline int mapping_writably_mapped(struct address_space *mapping) +static inline int mapping_writably_mapped(const struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } @@ -2385,6 +2385,8 @@ static inline bool can_mmap_file(struct file *file) return true; } +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma); int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5ebf26fcdcfa..0ceb4e09306c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 36053c3d6d64..0574c21ca45d 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -7,7 +7,7 @@ */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); @@ -33,7 +33,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { } #endif void *kmap_high(struct page *page); -void kunmap_high(struct page *page); +void kunmap_high(const struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); @@ -50,7 +50,7 @@ static inline void *kmap(struct page *page) return addr; } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { might_sleep(); if (!PageHighMem(page)) @@ -68,12 +68,12 @@ static inline void kmap_flush_unused(void) __kmap_flush_unused(); } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void *kmap_local_page_try_from_panic(const struct page *page) { if (!PageHighMem(page)) return page_address(page); @@ -81,13 +81,13 @@ static inline void *kmap_local_page_try_from_panic(struct page *page) return NULL; } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { - struct page *page = folio_page(folio, offset / PAGE_SIZE); + const struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } @@ -102,7 +102,7 @@ static inline void __kunmap_local(const void *vaddr) kunmap_local_indexed(vaddr); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) 
+static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -113,7 +113,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return __kmap_local_page_prot(page, prot); } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { return kmap_atomic_prot(page, kmap_prot); } @@ -173,32 +173,32 @@ static inline void *kmap(struct page *page) return page_address(page); } -static inline void kunmap_high(struct page *page) { } +static inline void kunmap_high(const struct page *page) { } static inline void kmap_flush_unused(void) { } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return page_address(page); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void *kmap_local_page_try_from_panic(const struct page *page) { return page_address(page); } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { return folio_address(folio) + offset; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return kmap_local_page(page); } @@ -215,7 +215,7 @@ static inline void __kunmap_local(const void *addr) #endif } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -225,7 +225,7 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { return kmap_atomic(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6234f316468c..105cc4c00cc3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -43,7 +43,7 @@ static inline void *kmap(struct page *page); * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ -static inline void kunmap(struct page *page); +static inline void kunmap(const struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address @@ -93,7 +93,7 @@ static inline void kmap_flush_unused(void); * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ -static inline void *kmap_local_page(struct page *page); +static inline void *kmap_local_page(const struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage @@ -129,7 +129,7 @@ static inline void *kmap_local_page(struct page *page); * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ -static inline void *kmap_local_folio(struct folio *folio, size_t offset); +static inline void *kmap_local_folio(const struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! 
@@ -176,7 +176,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ -static inline void *kmap_atomic(struct page *page); +static inline void *kmap_atomic(const struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7748489fde1b..f327d62fc985 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) -#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ -#define TVA_IN_PF (1 << 1) /* Page fault handler */ -#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ +enum tva_type { + TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ + TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_KHUGEPAGED, /* Khugepaged collapse. */ + TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ +}; -#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define thp_vma_allowable_order(vma, vm_flags, type, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) #define split_folio(f) split_folio_to_list(f, NULL) @@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @tva_flags: Which TVA flags to honour + * @type: TVA type * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - /* Optimization to check if required orders are enabled early. */ - if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { + /* + * Optimization to check if required orders are enabled early. Only + * forced collapse ignores sysfs configs. + */ + if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); + return __thp_vma_allowable_orders(vma, vm_flags, type, orders); } struct thpsize { @@ -318,16 +324,32 @@ struct thpsize { (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) +/* + * Check whether THPs are explicitly disabled for this VMA, for example, + * through madvise or prctl. + */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags) -{ + vm_flags_t vm_flags, bool forced_collapse) +{ + /* Are THPs disabled for this VMA? */ + if (vm_flags & VM_NOHUGEPAGE) + return true; + /* Are THPs disabled for all VMAs in the whole process? 
*/ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm)) + return true; /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. + * Are THPs disabled only for VMAs where we didn't get an explicit + * advise to use them? */ - return (vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); + if (vm_flags & VM_HUGEPAGE) + return false; + /* + * Forcing a collapse (e.g., madv_collapse), is a clear advice to + * use THPs. + */ + if (forced_collapse) + return false; + return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) @@ -479,6 +501,8 @@ extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_folio(const struct folio *folio) { + VM_WARN_ON_ONCE(!folio); + return READ_ONCE(huge_zero_folio) == folio; } @@ -495,6 +519,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return NULL; + + if (unlikely(!huge_zero_folio)) + return NULL; + + return huge_zero_folio; +} + static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); @@ -526,7 +561,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { return 0; @@ -553,22 +588,26 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { - return 0; + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; } static inline int split_huge_page(struct page *page) { - return 0; + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; } static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { - return 0; + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; } static inline int try_folio_split(struct folio *folio, struct page *page, struct list_head *list) { - return 0; + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} @@ -685,6 +724,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb, { return 0; } + +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, @@ -698,4 +742,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) return split_folio_to_list_to_order(folio, NULL, new_order); } +/** + * largest_zero_folio - Get the largest zero size folio available + * + * This function shall be used when mm_get_huge_zero_folio() cannot be + * used as there is no appropriate mm lifetime to tie the huge zero folio + * from the caller. + * + * Deduce the size of the folio with folio_size instead of assuming the + * folio size. 
+ * + * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO + * is enabled or a single page sized zero folio + */ +static inline struct folio *largest_zero_folio(void) +{ + struct folio *folio = get_persistent_huge_zero_folio(); + + if (folio) + return folio; + + return page_folio(ZERO_PAGE(0)); +} #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 526d27e88b3b..8e63e46b8e1f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -788,9 +788,14 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +static inline bool order_is_gigantic(unsigned int order) +{ + return order > MAX_PAGE_ORDER; +} + static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_PAGE_ORDER; + return order_is_gigantic(huge_page_order(h)); } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/idr.h b/include/linux/idr.h index 2267902d29a7..789e23e67444 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } -/* - * ida_simple_get() and ida_simple_remove() are deprecated. Use - * ida_alloc() and ida_free() instead respectively. - */ -#define ida_simple_get(ida, start, end, gfp) \ - ida_alloc_range(ida, start, (end) - 1, gfp) -#define ida_simple_remove(ida, id) ida_free(ida, id) - static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h index 6f612d69ea0c..9eca967d8526 100644 --- a/include/linux/kasan-enabled.h +++ b/include/linux/kasan-enabled.h @@ -4,32 +4,46 @@ #include <linux/static_key.h> -#ifdef CONFIG_KASAN_HW_TAGS - +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Global runtime flag for KASAN modes that need runtime control. + * Used by ARCH_DEFER_KASAN architectures and HW_TAGS mode. + */ DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); +/* + * Runtime control for shadow memory initialization or HW_TAGS mode. + * Uses static key for architectures that need deferred KASAN or HW_TAGS. + */ static __always_inline bool kasan_enabled(void) { return static_branch_likely(&kasan_flag_enabled); } -static inline bool kasan_hw_tags_enabled(void) +static inline void kasan_enable(void) { - return kasan_enabled(); + static_branch_enable(&kasan_flag_enabled); } - -#else /* CONFIG_KASAN_HW_TAGS */ - -static inline bool kasan_enabled(void) +#else +/* For architectures that can enable KASAN early, use compile-time check. 
*/ +static __always_inline bool kasan_enabled(void) { return IS_ENABLED(CONFIG_KASAN); } +static inline void kasan_enable(void) {} +#endif /* CONFIG_ARCH_DEFER_KASAN || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS +static inline bool kasan_hw_tags_enabled(void) +{ + return kasan_enabled(); +} +#else static inline bool kasan_hw_tags_enabled(void) { return false; } - #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* LINUX_KASAN_ENABLED_H */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index fe5ce9215821..d12e1a5f5a9a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, } bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, - bool still_accessible); + bool still_accessible, bool no_quarantine); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. @@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init, - bool still_accessible) + void *object, bool init, + bool still_accessible, + bool no_quarantine) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init, still_accessible); + return __kasan_slab_free(s, object, init, still_accessible, + no_quarantine); return false; } @@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - bool init, bool still_accessible) + bool init, bool still_accessible, + bool no_quarantine) { return false; } @@ -543,6 +546,12 @@ void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC +void __init kasan_init_generic(void); +#else +static inline void kasan_init_generic(void) { } +#endif + #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 989315dabb86..5b46924fdff5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -164,11 +164,23 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* - * Values used for system_state. Ordering of the states must not be changed +/** + * enum system_states - Values used for system_state. + * + * @SYSTEM_BOOTING: %0, no init needed + * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU + * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running + * @SYSTEM_RUNNING: system is up and running + * @SYSTEM_HALT: system entered clean system halt state + * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state + * @SYSTEM_RESTART: system entered emergency power off or normal restart + * @SYSTEM_SUSPEND: system entered suspend or hibernate state + * + * Note: + * Ordering of the states must not be changed * as code checks for <, <=, >, >= STATE. 
*/ -extern enum system_states { +enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_FREEING_INITMEM, @@ -177,7 +189,8 @@ extern enum system_states { SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, -} system_state; +}; +extern enum system_states system_state; /* * General tracing related utility functions - trace_printk(), diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 348844cffb13..559d13a3bc44 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -40,6 +40,7 @@ struct kho_serialization; #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); +bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_phys(phys_addr_t phys, size_t size); @@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void) return false; } +static inline bool is_kho_boot(void) +{ + return false; +} + static inline int kho_preserve_folio(struct folio *folio) { return -EOPNOTSUPP; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index ff6120463745..eb1946a70cff 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,6 +2,8 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H +#include <linux/mm.h> + extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, mm)) __khugepaged_exit(mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c17b955e7b0b..067538fc4d58 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,13 +56,19 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. 
*/ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) { + long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages); + + mm->ksm_merging_pages = 0; + mm->ksm_rmap_items = 0; + atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages); __ksm_enter(mm); + } } static inline int ksm_execve(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return __ksm_enter(mm); return 0; @@ -70,7 +76,7 @@ static inline int ksm_execve(struct mm_struct *mm) static inline void ksm_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, mm)) __ksm_exit(mm); } diff --git a/include/linux/list.h b/include/linux/list.h index 7f7657e41620..5bfda2f91fca 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -20,8 +20,16 @@ * using the generic single-entry routines. */ +/** + * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself + * @name: name of the list_head + */ #define LIST_HEAD_INIT(name) { &(name), &(name) } +/** + * LIST_HEAD - definition of a &struct list_head with initialization values + * @name: name of the list_head + */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 2ba846419524..0d91d060e3e9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -66,6 +66,8 @@ */ #define local_trylock(lock) __local_trylock(this_cpu_ptr(lock)) +#define local_lock_is_locked(lock) __local_lock_is_locked(lock) + /** * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable * interrupts if acquired diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index d80b5306a2c0..a4dc479157b5 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -17,7 +17,10 @@ typedef struct { /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { - local_lock_t llock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif u8 acquired; } local_trylock_t; @@ -31,7 +34,7 @@ typedef struct { .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ - .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { @@ -81,7 +84,7 @@ do { \ local_lock_debug_init(lock); \ } while (0) -#define __local_trylock_init(lock) __local_lock_init(lock.llock) +#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock) #define __spinlock_nested_bh_init(lock) \ do { \ @@ -162,6 +165,9 @@ do { \ !!tl; \ }) +/* preemption or migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) + #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ @@ -282,4 +288,8 @@ do { \ __local_trylock(lock); \ }) +/* migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(__lock) \ + (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..66f98a3da8d8 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -57,17 +57,17 @@ * MT_FLAGS_ALLOC_RANGE flag. 
* * Node types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location - * 0x??1 : Root - * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 - * 0x010 : 32 bit values, type in 0-2, slot in 3-6 - * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + * 0b??1 : Root + * 0b?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0b010 : 32 bit values, type in 0-2, slot in 3-6 + * 0b110 : 64 bit values, type in 0-2, slot in 3-6 */ /* @@ -194,7 +194,6 @@ enum store_type { #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP -typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) @@ -207,7 +206,6 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else -typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) @@ -230,8 +228,10 @@ typedef struct { /* nothing */ } lockdep_map_p; */ struct maple_tree { union { - spinlock_t ma_lock; - lockdep_map_p ma_external_lock; + spinlock_t ma_lock; +#ifdef CONFIG_LOCKDEP + struct lockdep_map *ma_external_lock; +#endif }; unsigned int ma_flags; void __rcu *ma_root; @@ -442,7 +442,9 @@ struct ma_state { struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ - struct maple_alloc *alloc; /* Allocated nodes for this operation */ + struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + struct maple_node *alloc; /* A single allocated node for fast path writes */ + unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; @@ -481,6 +483,9 @@ struct ma_wr_state { #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) +/* + * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs + */ #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ @@ -490,7 +495,9 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ + .sheaf = NULL, \ .alloc = NULL, \ + .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb27e3d2fdac..16fe0306e50e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -341,17 +341,25 @@ enum page_memcg_data_flags { __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; +#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ +#define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { - /* slabobj_ext vector failed to allocate */ - OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, + /* + * Use bit 0 with zero other bits to signal that slabobj_ext vector + * failed to allocate. The same bit 0 with valid upper bits means + * MEMCG_DATA_OBJEXTS. 
+ */ + OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, + /* slabobj_ext vector allocated with kmalloc_nolock() */ + OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; @@ -900,7 +908,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask); + +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) +{ + if (unlikely(current->memcg_nr_pages_over_high)) + __mem_cgroup_handle_over_high(gfp_mask); +} unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); @@ -1053,6 +1067,8 @@ extern int mem_cgroup_init(void); #define MEM_CGROUP_ID_SHIFT 0 +#define root_mem_cgroup (NULL) + static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7b151441341b..34941a4b9026 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -15,7 +15,7 @@ struct kmem_cache; typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); typedef void (mempool_free_t)(void *element, void *pool_data); -typedef struct mempool_s { +typedef struct mempool { spinlock_t lock; int min_nr; /* nr of elements at *elements */ int curr_nr; /* Current nr of elements at *elements */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 4aa151914eab..e5951ba12a28 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -157,45 +157,52 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } -static inline bool is_device_private_page(const struct page *page) +static inline bool folio_is_device_private(const struct folio *folio) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PRIVATE; } -static inline bool folio_is_device_private(const struct folio *folio) +static inline bool is_device_private_page(const struct page *page) { - return is_device_private_page(&folio->page); + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + folio_is_device_private(page_folio(page)); } -static inline bool is_pci_p2pdma_page(const struct page *page) +static inline bool folio_is_pci_p2pdma(const struct folio *folio) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } -static inline bool is_device_coherent_page(const struct page *page) +static inline bool is_pci_p2pdma_page(const struct page *page) { - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; + return IS_ENABLED(CONFIG_PCI_P2PDMA) && + folio_is_pci_p2pdma(page_folio(page)); } static inline bool folio_is_device_coherent(const struct folio *folio) { - return is_device_coherent_page(&folio->page); + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_COHERENT; } -static inline bool is_fsdax_page(const struct page *page) +static inline bool is_device_coherent_page(const struct page *page) { - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; + return folio_is_device_coherent(page_folio(page)); } static inline bool folio_is_fsdax(const struct folio *folio) { 
- return is_fsdax_page(&folio->page); + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_FS_DAX; +} + +static inline bool is_fsdax_page(const struct page *page) +{ + return folio_is_fsdax(page_folio(page)); } #ifdef CONFIG_ZONE_DEVICE @@ -204,8 +211,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap); +struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); @@ -227,8 +233,7 @@ static inline void devm_memunmap_pages(struct device *dev, { } -static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { return NULL; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9009e27b5f44..1f0ac122c3bf 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -12,14 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private); struct migration_target_control; -/* - * Return values from addresss_space_operations.migratepage(): - * - negative errno on page migration failure; - * - zero on page migration success; - */ -#define MIGRATEPAGE_SUCCESS 0 -#define MIGRATEPAGE_UNMAP 1 - /** * struct movable_operations - Driver page migration * @isolate_page: @@ -35,8 +27,7 @@ struct migration_target_control; * @src page. The driver should copy the contents of the * @src page to the @dst page and set up the fields of @dst page. * Both pages are locked. - * If page migration is successful, the driver should - * return MIGRATEPAGE_SUCCESS. + * If page migration is successful, the driver should return 0. * If the driver cannot migrate the page at the moment, it can return * -EAGAIN. The VM interprets this as a temporary migration failure and * will retry it later. 
Any other error value is a permanent migration diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..06978b4dbeb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,6 +34,8 @@ #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/rcuwait.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> struct mempolicy; struct anon_vma; @@ -69,6 +71,15 @@ static inline void totalram_pages_add(long count) extern void * high_memory; +/* + * Convert between pages and MB + * 20 is the shift for 1MB (2^20 = 1MB) + * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) + * So (20 - PAGE_SHIFT) converts between pages and MB + */ +#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else @@ -198,11 +209,13 @@ extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) +bool page_range_contiguous(const struct page *page, unsigned long nr_pages); #else -#define nth_page(page,n) ((page) + (n)) -#define folio_page_idx(folio, p) ((p) - &(folio)->page) +static inline bool page_range_contiguous(const struct page *page, + unsigned long nr_pages) +{ + return true; +} #endif /* to align the pointer to the (next) page boundary */ @@ -214,6 +227,20 @@ extern unsigned long sysctl_admin_reserve_kbytes; /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) +/** + * folio_page_idx - Return the number of a page in a folio. + * @folio: The folio. + * @page: The folio page. + * + * This function expects that the page is actually part of the folio. + * The returned number is relative to the start of the folio. + */ +static inline unsigned long folio_page_idx(const struct folio *folio, + const struct page *page) +{ + return page - &folio->page; +} + static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); @@ -648,13 +675,21 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif +#ifdef CONFIG_FIND_NORMAL_PAGE /* - * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior - * (using pte_page()) would not find the correct page. + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. 
*/ - struct page *(*find_special_page)(struct vm_area_struct *vma, - unsigned long addr); + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; #ifdef CONFIG_NUMA_BALANCING @@ -684,7 +719,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); @@ -697,12 +732,42 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) +{ + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) +{ + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline void mm_flags_set(int flag, struct mm_struct *mm) +{ + set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline void mm_flags_clear(int flag, struct mm_struct *mm) +{ + clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline void mm_flags_clear_all(struct mm_struct *mm) +{ + bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); +} + extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) @@ -810,7 +875,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) vma->vm_end >= vma->vm_mm->start_stack; } -static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) +static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -824,7 +889,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) return false; } -static inline bool vma_is_foreign(struct vm_area_struct *vma) +static inline bool vma_is_foreign(const struct vm_area_struct *vma) { if (!current->mm) return true; @@ -835,7 +900,7 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) +static inline bool vma_is_accessible(const struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } @@ -846,7 +911,7 @@ static inline bool is_shared_maywrite(vm_flags_t vm_flags) (VM_SHARED | VM_MAYWRITE); } -static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } @@ -930,14 +995,14 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. 
*/ -bool vma_is_shmem(struct vm_area_struct *vma); -bool vma_is_anon_shmem(struct vm_area_struct *vma); +bool vma_is_shmem(const struct vm_area_struct *vma); +bool vma_is_anon_shmem(const struct vm_area_struct *vma); #else -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } -static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif -int vma_is_stack_for_current(struct vm_area_struct *vma); +int vma_is_stack_for_current(const struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } @@ -953,12 +1018,12 @@ static inline unsigned int folio_large_order(const struct folio *folio) } #ifdef NR_PAGES_IN_LARGE_FOLIO -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } @@ -971,11 +1036,11 @@ static inline long folio_large_nr_pages(const struct folio *folio) * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ -static inline unsigned int compound_order(struct page *page) +static inline unsigned int compound_order(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 0; return folio_large_order(folio); } @@ -1191,7 +1256,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. 
*/ -static inline unsigned long page_size(struct page *page) +static inline unsigned long page_size(const struct page *page) { return PAGE_SIZE << compound_order(page); } @@ -1505,21 +1570,26 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; + return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS -int page_to_nid(const struct page *page); +int memdesc_nid(memdesc_flags_t mdf); #else -static inline int page_to_nid(const struct page *page) +static inline int memdesc_nid(memdesc_flags_t mdf) { - return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; } #endif +static inline int page_to_nid(const struct page *page) +{ + return memdesc_nid(PF_POISONED_CHECK(page)->flags); +} + static inline int folio_nid(const struct folio *folio) { - return page_to_nid(&folio->page); + return memdesc_nid(folio->flags); } #ifdef CONFIG_NUMA_BALANCING @@ -1588,14 +1658,14 @@ static inline void page_cpupid_reset_last(struct page *page) #else static inline int folio_last_cpupid(struct folio *folio) { - return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; + page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ @@ -1691,7 +1761,7 @@ static inline u8 page_kasan_tag(const struct page *page) u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { - tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } @@ -1706,12 +1776,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) return; tag ^= 0xff; - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) @@ -1742,26 +1812,26 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } -static inline struct zone *folio_zone(const struct folio *folio) +static inline pg_data_t *folio_pgdat(const struct folio *folio) { - return page_zone(&folio->page); + return NODE_DATA(folio_nid(folio)); } -static inline pg_data_t *folio_pgdat(const struct folio *folio) +static inline struct zone *folio_zone(const struct folio *folio) { - return page_pgdat(&folio->page); + return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { - page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); - page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; + page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } -static inline unsigned long page_to_section(const struct page *page) +static inline unsigned long memdesc_section(memdesc_flags_t mdf) { - return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (mdf.f 
>> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif @@ -1785,7 +1855,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) } #ifdef CONFIG_MMU -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } @@ -1800,7 +1870,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } @@ -1816,7 +1886,7 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) { return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); } @@ -1832,7 +1902,7 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot) +static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) { return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); } @@ -1900,7 +1970,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) return false; return folio_maybe_dma_pinned(folio); @@ -1966,14 +2036,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) static inline void set_page_zone(struct page *page, enum zone_type zone) { - page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); - page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { - page->flags &= ~(NODES_MASK << NODES_PGSHIFT); - page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; + page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, @@ -1992,30 +2062,46 @@ static inline void set_page_links(struct page *page, enum zone_type zone, * * Return: A positive power of two. */ -static inline long folio_nr_pages(const struct folio *folio) +static inline unsigned long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; return folio_large_nr_pages(folio); } -/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. 
+ */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT #else -#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (e.g., hugetlb, dax). + */ +#define MAX_FOLIO_ORDER PUD_ORDER #endif +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline long compound_nr(struct page *page) +static inline unsigned long compound_nr(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 1; return folio_large_nr_pages(folio); } @@ -2351,6 +2437,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); +struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t pud); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); @@ -2529,7 +2617,7 @@ void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim); + const struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); @@ -2654,7 +2742,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) unsigned long _rss = get_mm_rss(mm); if (data_race(mm->hiwater_rss) < _rss) - (mm)->hiwater_rss = _rss; + data_race(mm->hiwater_rss = _rss); } static inline void update_hiwater_vm(struct mm_struct *mm) @@ -2846,16 +2934,22 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ +enum pt_flags { + PT_reserved = PG_reserved, + /* High bits are used for zone/node/section */ +}; + static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } -static inline void *ptdesc_to_virt(const struct ptdesc *pt) -{ - return page_to_virt(ptdesc_page(pt)); -} - +/** + * ptdesc_address - Virtual address of page table. + * @pt: Page table descriptor. + * + * Return: The first byte of the page table described by @pt. 
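The #if chain above picks MAX_FOLIO_ORDER from MAX_PAGE_ORDER, PFN_SECTION_SHIFT or PUD_ORDER depending on whether gigantic pages and a vmemmap are available, and MAX_FOLIO_NR_PAGES follows from it. A small sketch of the arithmetic for the PUD_ORDER branch, assuming x86-64-style 4KiB pages and 1GiB PUDs (assumed values, not taken from this patch):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PUD_SHIFT       30
#define PUD_ORDER       (PUD_SHIFT - PAGE_SHIFT)

/* Mirrors the CONFIG_ARCH_HAS_GIGANTIC_PAGE + SPARSEMEM_VMEMMAP branch. */
#define MAX_FOLIO_ORDER     PUD_ORDER
#define MAX_FOLIO_NR_PAGES  (1UL << MAX_FOLIO_ORDER)

int main(void)
{
    printf("MAX_FOLIO_ORDER=%d -> %lu pages (%lu MiB)\n",
           MAX_FOLIO_ORDER, MAX_FOLIO_NR_PAGES,
           (MAX_FOLIO_NR_PAGES << PAGE_SHIFT) >> 20);
    return 0;
}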
+ */ static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); @@ -2863,7 +2957,7 @@ static inline void *ptdesc_address(const struct ptdesc *pt) static inline bool pagetable_is_reserved(struct ptdesc *pt) { - return folio_test_reserved(ptdesc_folio(pt)); + return test_bit(PT_reserved, &pt->pt_flags.f); } /** @@ -2973,21 +3067,26 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) +{ + return compound_nr(ptdesc_page(ptdesc)); +} + static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __SetPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); + __ClearPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) @@ -3292,7 +3391,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ -extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); +extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, @@ -3432,7 +3531,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) return mtree_load(&mm->mm_mt, addr); } -static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; @@ -3444,7 +3543,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) return 0; } -static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; @@ -3455,7 +3554,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma) return vm_start; } -static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; @@ -3467,7 +3566,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma) return vm_end; } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } @@ -3484,7 +3583,7 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } -static inline bool range_in_vma(struct vm_area_struct *vma, +static inline bool range_in_vma(const struct vm_area_struct *vma, 
unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); @@ -3600,7 +3699,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ -static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, +static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, unsigned int flags) { /* @@ -3730,7 +3829,7 @@ static inline bool debug_guardpage_enabled(void) return static_branch_unlikely(&_debug_guardpage_enabled); } -static inline bool page_is_guard(struct page *page) +static inline bool page_is_guard(const struct page *page) { if (!debug_guardpage_enabled()) return false; @@ -3761,7 +3860,7 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } +static inline bool page_is_guard(const struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -3784,7 +3883,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ -extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); void drop_slab(void); @@ -3843,7 +3942,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) @@ -3857,7 +3956,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap, altmap->alloc -= nr_pfns; } #else -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { return 0; } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 89b518ff097e..d6c1011b38f2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -25,7 +25,7 @@ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ -static inline int folio_is_file_lru(struct folio *folio) +static inline int folio_is_file_lru(const struct folio *folio) { return !folio_test_swapbacked(folio); } @@ -84,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list folio_lru_list(struct folio *folio) +static __always_inline enum lru_list folio_lru_list(const struct folio *folio) { enum lru_list lru; @@ -141,9 +141,9 @@ static inline int lru_tier_from_refs(int refs, bool workingset) return workingset ? 
MAX_NR_TIERS - 1 : order_base_2(refs); } -static inline int folio_lru_refs(struct folio *folio) +static inline int folio_lru_refs(const struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); if (!(flags & BIT(PG_referenced))) return 0; @@ -154,14 +154,14 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } -static inline int folio_lru_gen(struct folio *folio) +static inline int folio_lru_gen(const struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } -static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; @@ -217,12 +217,13 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } -static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, +static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, + const struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); - struct lru_gen_folio *lrugen = &lruvec->lrugen; + const struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ @@ -268,7 +269,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ @@ -293,7 +294,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? 
BIT(PG_active) : 0; - flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); + flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); @@ -302,11 +303,11 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return true; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { - unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; + unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; - set_mask_bits(&new->flags, LRU_REFS_MASK, refs); + set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ @@ -330,7 +331,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return false; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { } @@ -508,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm) atomic_dec(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +static inline bool mm_tlb_flush_pending(const struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that @@ -521,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL @@ -605,7 +606,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, return false; } -static inline bool vma_has_recency(struct vm_area_struct *vma) +static inline bool vma_has_recency(const struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f625c35128b..90e5790c318f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include <linux/seqlock.h> #include <linux/percpu_counter.h> #include <linux/types.h> +#include <linux/bitmap.h> #include <asm/mmu.h> @@ -33,6 +34,10 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +typedef struct { + unsigned long f; +} memdesc_flags_t; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -71,7 +76,7 @@ struct mem_cgroup; #endif struct page { - unsigned long flags; /* Atomic flags, some possibly + memdesc_flags_t flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. @@ -89,21 +94,10 @@ struct page { union { struct list_head lru; - /* Or, for the Unevictable "LRU list" slot */ - struct { - /* Always even, to negate PageTail */ - void *__filler; - /* Count page's or folio's mlocks */ - unsigned int mlock_count; - }; - /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; - struct { - struct llist_node pcp_llist; - unsigned int order; - }; + struct llist_node pcp_llist; }; struct address_space *mapping; union { @@ -114,7 +108,8 @@ struct page { * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if swapcache flag set. 
- * Indicates order in the buddy system if PageBuddy. + * Indicates order in the buddy system if PageBuddy + * or on pcp_llist. */ unsigned long private; }; @@ -382,11 +377,13 @@ struct folio { union { struct { /* public: */ - unsigned long flags; + memdesc_flags_t flags; union { struct list_head lru; /* private: avoid cluttering the output */ + /* For the Unevictable "LRU list" slot */ struct { + /* Avoid compound_head */ void *__filler; /* public: */ unsigned int mlock_count; @@ -525,7 +522,7 @@ FOLIO_MATCH(compound_head, _head_3); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Powerpc only. + * @pt_flags: enum pt_flags plus zone/node/section. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 gmap shadow pages * (which are not linked into the user page tables) and x86 @@ -547,7 +544,7 @@ FOLIO_MATCH(compound_head, _head_3); * understanding of the issues. */ struct ptdesc { - unsigned long __page_flags; + memdesc_flags_t pt_flags; union { struct rcu_head pt_rcu_head; @@ -585,7 +582,7 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) -TABLE_MATCH(flags, __page_flags); +TABLE_MATCH(flags, pt_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); @@ -627,7 +624,7 @@ static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) atomic_dec(&ptdesc->pt_share_count); } -static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } @@ -660,7 +657,7 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } -static inline void *folio_get_private(struct folio *folio) +static inline void *folio_get_private(const struct folio *folio) { return folio->private; } @@ -785,13 +782,14 @@ struct pfnmap_track_ctx { */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; @@ -932,6 +930,15 @@ struct mm_cid { }; #endif +/* + * Opaque type representing current mm_struct flag state. Must be accessed via + * mm_flags_xxx() helper functions. + */ +#define NUM_MM_FLAG_BITS (64) +typedef struct { + DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} __private mm_flags_t; + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1031,10 +1038,10 @@ struct mm_struct { * counters */ /* - * With some kernel config, the current mmap_lock's offset - * inside 'mm_struct' is at 0x120, which is very optimal, as + * Typically the current mmap_lock's offset is 56 bytes from + * the last cacheline boundary, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different - * cachelines, and when mmap_lock is highly contended, both + * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. 
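struct page, struct folio and struct ptdesc now share the same one-member wrapper, memdesc_flags_t, which is why every open-coded flags access in this patch grows a ".f". The wrapper costs nothing at runtime but stops flag words and arbitrary unsigned longs being mixed up silently; a minimal user-space sketch of the effect (names reused from the patch, bit layout assumed):

typedef struct { unsigned long f; } memdesc_flags_t;
struct page { memdesc_flags_t flags; };

static int memdesc_nid(memdesc_flags_t mdf) { return mdf.f >> 54; }
static int legacy_nid(unsigned long flags)  { return flags >> 54; }

int main(void)
{
    struct page pg = { .flags = { .f = 3UL << 54 } };

    int a = memdesc_nid(pg.flags);    /* ok: the wrapper type is expected      */
    int b = legacy_nid(pg.flags.f);   /* ok: the word is accessed explicitly   */
    /* legacy_nid(pg.flags);             error: no implicit conversion         */
    /* memdesc_nid(pg.flags.f);          error: unsigned long is not the type  */
    return a - b;                      /* both read node 3 */
}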
* @@ -1119,7 +1126,7 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t flags; /* Must use mm_flags_* hlpers to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1229,6 +1236,40 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +/* Set the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); + + bitmap_copy(bitmap, &value, BITS_PER_LONG); +} + +/* Obtain a read-only view of the bitmap. */ +static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) +{ + return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); +} + +/* Read the first system word of mm flags, non-atomically. */ +static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) +{ + const unsigned long *bitmap = __mm_flags_get_bitmap(mm); + + return bitmap_read(bitmap, 0, BITS_PER_LONG); +} + +/* + * Update the first system word of mm flags ONLY, applying the specified mask to + * it, then setting all flags specified by bits. + */ +static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, + unsigned long mask, unsigned long bits) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); + + set_mask_bits(bitmap, mask, bits); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; @@ -1729,7 +1770,7 @@ enum { * the modes are SUID_DUMP_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1744,13 +1785,13 @@ enum { #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) + ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \ + BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif @@ -1758,19 +1799,16 @@ enum { #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define 
MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ +#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ + BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* @@ -1783,27 +1821,33 @@ enum { #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - +#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE) #define MMF_HAS_MDWE_NO_INHERIT 29 #define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY) #define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) +#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) -static inline unsigned long mmf_init_flags(unsigned long flags) +/* Legacy flags must fit within 32 bits. */ +static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX); + +/* + * Initialise legacy flags according to masks, propagating selected flags on + * fork. Further flag manipulation can be performed by the caller. + */ +static inline unsigned long mmf_init_legacy_flags(unsigned long flags) { if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) flags &= ~((1UL << MMF_HAS_MDWE) | (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; + return flags & MMF_INIT_LEGACY_MASK; } #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/mman.h b/include/linux/mman.h index de9e8e6229a4..0ba8a7e8b90a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void) static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 11a078de9150..2c9fffa58714 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -148,91 +148,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) } /* - * Try to read-lock a vma. The function is allowed to occasionally yield false - * locked result to avoid performance overhead, in which case we fall back to - * using mmap_lock. The function should never yield false unlocked result. - * False locked result is possible if mm_lock_seq overflows or if vma gets - * reused and attached to a different mm before we lock it. - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got - * detached. - * - * WARNING! The vma passed to this function cannot be used if the function - * fails to lock it because in certain cases RCU lock is dropped and then - * reacquired. Once RCU lock is dropped the vma can be concurently freed. - */ -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - int oldcnt; - - /* - * Check before locking. A race might cause false locked result. 
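mm->flags is now an opaque mm_flags_t bitmap of NUM_MM_FLAG_BITS bits, and call sites such as map_deny_write_exec() above switch from test_bit(MMF_..., &mm->flags) to mm_flags_test(MMF_..., mm). The mm_flags_test()/mm_flags_set() accessors themselves are defined outside these hunks; the sketch below assumes they are thin wrappers around bit operations on the private bitmap (the real ones use atomic bitops):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NUM_MM_FLAG_BITS 64
#define BITS_PER_LONG    (sizeof(long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

typedef struct { unsigned long __mm_flags[BITS_TO_LONGS(NUM_MM_FLAG_BITS)]; } mm_flags_t;
struct mm_struct { mm_flags_t flags; };

#define MMF_HAS_MDWE 28

/* Assumed shape of the accessors; the real ones are atomic. */
static bool mm_flags_test(int bit, const struct mm_struct *mm)
{
    return (mm->flags.__mm_flags[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
}

static void mm_flags_set(int bit, struct mm_struct *mm)
{
    mm->flags.__mm_flags[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

int main(void)
{
    static struct mm_struct mm;

    mm_flags_set(MMF_HAS_MDWE, &mm);
    printf("MDWE set: %d\n", mm_flags_test(MMF_HAS_MDWE, &mm));
    return 0;
}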
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need - * ACQUIRE semantics, because this is just a lockless check whose result - * we don't rely on for anything - the mm_lock_seq read against which we - * need ordering is below. - */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; - - /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. - * Acquire fence is required here to avoid reordering against later - * vm_lock_seq check and checks inside lock_vma_under_rcu(). - */ - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { - /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); - } - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - - /* - * If vma got attached to another mm from under us, that mm is not - * stable and can be freed in the narrow window after vma->vm_refcnt - * is dropped and before rcuwait_wake_up(mm) is called. Grab it before - * releasing vma->vm_refcnt. - */ - if (unlikely(vma->vm_mm != mm)) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *other_mm = vma->vm_mm; - - /* - * __mmdrop() is a heavy operation and we don't need RCU - * protection here. Release RCU lock during these operations. - * We reinstate the RCU read lock as the caller expects it to - * be held when this function returns even on error. - */ - rcu_read_unlock(); - mmgrab(other_mm); - vma_refcount_put(vma); - mmdrop(other_mm); - rcu_read_lock(); - return NULL; - } - - /* - * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. - * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq - * modification invalidates all existing locks. - * - * We must use ACQUIRE semantics for the mm_lock_seq so that if we are - * racing with vma_end_write_all(), we only start reading from the VMA - * after it has been unlocked. - * This pairs with RELEASE semantics in vma_end_write_all(). - */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { - vma_refcount_put(vma); - return NULL; - } - - return vma; -} - -/* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c5da9141983..7fb7331c5725 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -234,7 +234,21 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ - PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /** + * Candidate pages for promotion based on hint fault latency. This + * counter is used to control the promotion rate and adjust the hot + * threshold. + */ + PGPROMOTE_CANDIDATE, + /** + * Not rate-limited (NRL) candidate pages for those can be promoted + * without considering hot threshold because of enough free pages in + * fast-tier node. These promotions bypass the regular hotness checks + * and do NOT influence the promotion rate-limiter or + * threshold-adjustment logic. + * This is for statistics/monitoring purposes. 
+ */ + PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, @@ -245,6 +259,7 @@ enum node_stat_item { NR_HUGETLB, #endif NR_BALLOON_PAGES, + NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS }; @@ -1089,7 +1104,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z) return wmark_pages(z, WMARK_PROMO); } -static inline unsigned long zone_managed_pages(struct zone *zone) +static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } @@ -1113,12 +1128,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } -static inline bool zone_is_initialized(struct zone *zone) +static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } -static inline bool zone_is_empty(struct zone *zone) +static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } @@ -1169,26 +1184,31 @@ static inline bool zone_is_empty(struct zone *zone) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) +static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) +{ + ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); + return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; +} + static inline enum zone_type page_zonenum(const struct page *page) { - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; + return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { - return page_zonenum(&folio->page); + return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { - return page_zonenum(page) == ZONE_DEVICE; + return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { - VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } @@ -1203,9 +1223,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { - if (is_zone_device_page(a) != is_zone_device_page(b)) + if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; - if (!is_zone_device_page(a)) + if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } @@ -1213,7 +1233,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } @@ -1228,9 +1248,14 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) } #endif +static inline bool is_zone_device_page(const struct page *page) +{ + return memdesc_is_zone_device(page->flags); +} + static inline bool folio_is_zone_device(const struct folio *folio) { - return is_zone_device_page(&folio->page); + return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) @@ -1248,7 +1273,7 @@ static inline 
bool folio_is_zone_movable(const struct folio *folio) * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ -static inline bool zone_intersects(struct zone *zone, +static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) @@ -1415,7 +1440,7 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_highest_zoneidx; - int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; @@ -1556,12 +1581,12 @@ static inline int local_memory_node(int node_id) { return node_id; }; #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return false; } @@ -1573,19 +1598,19 @@ static inline bool zone_is_zone_device(struct zone *zone) * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ -static inline bool managed_zone(struct zone *zone) +static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ -static inline bool populated_zone(struct zone *zone) +static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return zone->node; } @@ -1595,7 +1620,7 @@ static inline void zone_set_nid(struct zone *zone, int nid) zone->node = nid; } #else -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return 0; } @@ -1622,7 +1647,7 @@ static inline int is_highmem_idx(enum zone_type idx) * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ -static inline int is_highmem(struct zone *zone) +static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } @@ -1688,12 +1713,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref) return zoneref->zone; } -static inline int zonelist_zone_idx(struct zoneref *zoneref) +static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } -static inline int zonelist_node_idx(struct zoneref *zoneref) +static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } @@ -1996,7 +2021,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section) return (struct page *)map; } -static inline int present_section(struct mem_section *section) +static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } @@ -2006,12 +2031,12 @@ static inline int present_section_nr(unsigned long nr) return present_section(__nr_to_section(nr)); } -static inline int valid_section(struct mem_section *section) +static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } -static inline int early_section(struct mem_section *section) +static 
inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } @@ -2021,27 +2046,27 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } -static inline int online_section(struct mem_section *section) +static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT -static inline int preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); @@ -2051,7 +2076,7 @@ void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else -static inline int preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 3a25122d83e2..6907aedc4f74 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod) __module_param_call("", name, ¶m_ops_##type, &var, perm, \ -1, KERNEL_PARAM_FL_UNSAFE) +/** + * __core_param_cb - similar like core_param, with a set/get ops instead of type. + * @name: the name of the cmdline and sysfs parameter (often the same as var) + * @var: the variable + * @ops: the set & get operations for this parameter. + * @perm: visibility in sysfs + * + * Ideally this should be called 'core_param_cb', but the name has been + * used for module core parameter, so add the '__' prefix + */ +#define __core_param_cb(name, ops, arg, perm) \ + __module_param_call("", name, ops, arg, perm, -1, 0) + #endif /* !MODULE */ /** diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 98c96d649bf9..72ee7d210a74 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,7 +21,7 @@ #include <linux/rolling_buffer.h> enum netfs_sreq_ref_trace; -typedef struct mempool_s mempool_t; +typedef struct mempool mempool_t; struct folio_queue; /** diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 615a560d9edb..f3b13da78aac 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -103,7 +103,7 @@ struct nvmem_cell_info { * * Note: A default "nvmem<id>" name will be assigned to the device if * no name is specified in its configuration. In such case "<id>" is - * generated with ida_simple_get() and provided id field is ignored. + * generated with ida_alloc() and provided id field is ignored. * * Note: Specifying name and setting id to -1 implies a unique device * whose name is provided as-is (kept unaltered). 
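The new __core_param_cb() mirrors module_param_cb() for built-in (non-modular) code: it registers a core kernel command-line parameter whose parsing goes through caller-supplied kernel_param_ops rather than a fixed type. A hedged, kernel-context usage sketch; the parameter name "foo_mode" and its range check are made up for illustration:

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int foo_mode;

static int foo_mode_set(const char *val, const struct kernel_param *kp)
{
    unsigned int v;
    int ret = kstrtouint(val, 0, &v);

    if (ret)
        return ret;
    if (v > 3)
        return -EINVAL;
    *(unsigned int *)kp->arg = v;
    return 0;
}

static const struct kernel_param_ops foo_mode_ops = {
    .set = foo_mode_set,
    .get = param_get_uint,
};

/* Parsed from the kernel command line as foo_mode=<n>, visible in sysfs. */
__core_param_cb(foo_mode, &foo_mode_ops, &foo_mode, 0644);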
diff --git a/include/linux/oom.h b/include/linux/oom.h index 1e0fc6931ce9..7b02bc1d0a7e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) */ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { - if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + if (unlikely(mm_flags_test(MMF_UNSTABLE, mm))) return VM_FAULT_SIGBUS; return 0; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..48e27768e7ba 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags)) { + test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least @@ -316,7 +316,7 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -#define folio_page(folio, n) nth_page(&(folio)->page, n) +#define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { @@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page) static __always_inline int PageCompound(const struct page *page) { - return test_bit(PG_head, &page->flags) || + return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM @@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio, const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) @@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } /* @@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ -{ return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ -{ set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ -{ clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); } 
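With the accessor macros now dereferencing ->flags.f, a single PAGEFLAG(Dirty, dirty, PF_HEAD) line still generates the usual folio_* and Page* helpers; the interesting part of the expansion now reads roughly as follows (sketch, policy arguments simplified):

/* Approximate expansion of PAGEFLAG(Dirty, dirty, PF_HEAD) after this change. */
static __always_inline bool folio_test_dirty(const struct folio *folio)
{
    return test_bit(PG_dirty, const_folio_flags(folio, FOLIO_PF_HEAD));
}

static __always_inline int PageDirty(const struct page *page)
{
    return test_bit(PG_dirty, &PF_HEAD(page, 0)->flags.f);
}

static __always_inline void SetPageDirty(struct page *page)
{
    set_bit(PG_dirty, &PF_HEAD(page, 1)->flags.f);
}

static __always_inline void ClearPageDirty(struct page *page)
{
    clear_bit(PG_dirty, &PF_HEAD(page, 1)->flags.f);
}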
#define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ -{ __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ -{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ -{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ -{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) @@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page) */ if (PageHuge(page)) page = compound_head(page); - return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU @@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ static inline int folio_has_private(const struct folio *folio) { - return !!(folio->flags & PAGE_FLAGS_PRIVATE); + return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 6a44be0f39f4..e046278a01fa 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -13,12 +13,11 @@ #include <linux/types.h> -#define PB_migratetype_bits 3 /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_migrate, - PB_migrate_end = PB_migrate + PB_migratetype_bits - 1, - /* 3 bits required for migrate types */ + PB_migrate_0, + PB_migrate_1, + PB_migrate_2, PB_compact_skip,/* If set 
the block is skipped by compaction */ #ifdef CONFIG_MEMORY_ISOLATION @@ -37,11 +36,10 @@ enum pageblock_bits { #define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS)) -#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1) +#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) #ifdef CONFIG_MEMORY_ISOLATION -#define MIGRATETYPE_AND_ISO_MASK \ - (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate)) +#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate)) #else #define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..185644e288ea 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -140,7 +140,7 @@ static inline int inode_drain_writes(struct inode *inode) return filemap_write_and_wait(inode->i_mapping); } -static inline bool mapping_empty(struct address_space *mapping) +static inline bool mapping_empty(const struct address_space *mapping) { return xa_empty(&mapping->i_pages); } @@ -166,7 +166,7 @@ static inline bool mapping_empty(struct address_space *mapping) * refcount and the referenced bit, which will be elevated or set in * the process of adding new cache pages to an inode. */ -static inline bool mapping_shrinkable(struct address_space *mapping) +static inline bool mapping_shrinkable(const struct address_space *mapping) { void *head; @@ -211,6 +211,8 @@ enum mapping_flags { folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, + AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't + account usage to user cgroups */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -265,7 +267,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) clear_bit(AS_UNEVICTABLE, &mapping->flags); } -static inline bool mapping_unevictable(struct address_space *mapping) +static inline bool mapping_unevictable(const struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } @@ -275,7 +277,7 @@ static inline void mapping_set_exiting(struct address_space *mapping) set_bit(AS_EXITING, &mapping->flags); } -static inline int mapping_exiting(struct address_space *mapping) +static inline int mapping_exiting(const struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } @@ -285,7 +287,7 @@ static inline void mapping_set_no_writeback_tags(struct address_space *mapping) set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } -static inline int mapping_use_writeback_tags(struct address_space *mapping) +static inline int mapping_use_writeback_tags(const struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } @@ -331,7 +333,7 @@ static inline void mapping_set_inaccessible(struct address_space *mapping) set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_inaccessible(struct address_space *mapping) +static inline bool mapping_inaccessible(const struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } @@ -341,18 +343,18 @@ static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_ set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space 
*mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline gfp_t mapping_gfp_mask(struct address_space * mapping) +static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ -static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, +static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; @@ -475,11 +477,17 @@ mapping_min_folio_order(const struct address_space *mapping) } static inline unsigned long -mapping_min_folio_nrpages(struct address_space *mapping) +mapping_min_folio_nrpages(const struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } +static inline unsigned long +mapping_min_folio_nrbytes(const struct address_space *mapping) +{ + return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT; +} + /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. @@ -489,7 +497,7 @@ mapping_min_folio_nrpages(struct address_space *mapping) * new folio to the page cache and need to know what index to give it, * call this function. */ -static inline pgoff_t mapping_align_index(struct address_space *mapping, +static inline pgoff_t mapping_align_index(const struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); @@ -499,7 +507,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping, * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. */ -static inline bool mapping_large_folio_support(struct address_space *mapping) +static inline bool mapping_large_folio_support(const struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, @@ -514,7 +522,7 @@ static inline size_t mapping_max_folio_size(const struct address_space *mapping) return PAGE_SIZE << mapping_max_folio_order(mapping); } -static inline int filemap_nr_thps(struct address_space *mapping) +static inline int filemap_nr_thps(const struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); @@ -543,7 +551,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *folio_mapping(struct folio *); +struct address_space *folio_mapping(const struct folio *folio); /** * folio_flush_mapping - Find the file mapping this folio belongs to. @@ -928,7 +936,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, * * Return: The index of the folio which follows this folio in the file. */ -static inline pgoff_t folio_next_index(struct folio *folio) +static inline pgoff_t folio_next_index(const struct folio *folio) { return folio->index + folio_nr_pages(folio); } @@ -957,7 +965,7 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * e.g., shmem did not move this folio to the swap cache. * Return: true or false. 
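mapping_min_folio_nrbytes() is new here and simply converts the minimum folio order encoded in mapping->flags into bytes; together with mapping_align_index() the whole thing is shift-and-round arithmetic. A standalone sketch, assuming a filesystem with a 16KiB minimum folio (order 2):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned int mapping_min_folio_order(void) { return 2; }   /* assumed */

static unsigned long mapping_min_folio_nrpages(void)
{
    return 1UL << mapping_min_folio_order();
}

static unsigned long mapping_min_folio_nrbytes(void)
{
    return mapping_min_folio_nrpages() << PAGE_SHIFT;
}

/* Round an index down to the folio boundary, as mapping_align_index() does. */
static unsigned long mapping_align_index(unsigned long index)
{
    return index & ~(mapping_min_folio_nrpages() - 1);
}

int main(void)
{
    printf("min folio: %lu pages, %lu bytes\n",
           mapping_min_folio_nrpages(), mapping_min_folio_nrbytes());
    printf("index 7 aligns to %lu\n", mapping_align_index(7));   /* -> 4 */
    return 0;
}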
*/ -static inline bool folio_contains(struct folio *folio, pgoff_t index) +static inline bool folio_contains(const struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); @@ -1034,13 +1042,13 @@ static inline loff_t page_offset(struct page *page) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -static inline pgoff_t folio_pgoff(struct folio *folio) +static inline pgoff_t folio_pgoff(const struct folio *folio) { return folio->index; } -static inline pgoff_t linear_page_index(struct vm_area_struct *vma, - unsigned long address) +static inline pgoff_t linear_page_index(const struct vm_area_struct *vma, + const unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; @@ -1460,7 +1468,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ -static inline loff_t readahead_pos(struct readahead_control *rac) +static inline loff_t readahead_pos(const struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } @@ -1469,7 +1477,7 @@ static inline loff_t readahead_pos(struct readahead_control *rac) * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ -static inline size_t readahead_length(struct readahead_control *rac) +static inline size_t readahead_length(const struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } @@ -1478,7 +1486,7 @@ static inline size_t readahead_length(struct readahead_control *rac) * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ -static inline pgoff_t readahead_index(struct readahead_control *rac) +static inline pgoff_t readahead_index(const struct readahead_control *rac) { return rac->_index; } @@ -1487,7 +1495,7 @@ static inline pgoff_t readahead_index(struct readahead_control *rac) * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ -static inline unsigned int readahead_count(struct readahead_control *rac) +static inline unsigned int readahead_count(const struct readahead_control *rac) { return rac->_nr_pages; } @@ -1496,12 +1504,12 @@ static inline unsigned int readahead_count(struct readahead_control *rac) * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ -static inline size_t readahead_batch_length(struct readahead_control *rac) +static inline size_t readahead_batch_length(const struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } -static inline unsigned long dir_pages(struct inode *inode) +static inline unsigned long dir_pages(const struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -1515,8 +1523,8 @@ static inline unsigned long dir_pages(struct inode *inode) * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ -static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, - struct inode *inode) +static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio, + const struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; @@ -1547,7 +1555,8 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, * Return: The number of filesystem blocks covered by this folio. 
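dir_pages() and i_blocks_per_folio() are plain shift arithmetic on the inode size and block size. A tiny sketch with assumed example values (4KiB pages, 1KiB filesystem blocks, a 16KiB folio, a 10000-byte directory):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned int  i_blkbits  = 10;               /* 1KiB blocks  */
    unsigned long folio_size = 4 * PAGE_SIZE;    /* 16KiB folio  */
    long long     i_size     = 10000;

    unsigned int  blocks_per_folio = folio_size >> i_blkbits;
    unsigned long dir_pages = (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

    printf("blocks per folio: %u\n", blocks_per_folio);   /* 16 */
    printf("dir pages: %lu\n", dir_pages);                /* 3  */
    return 0;
}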
*/ static inline -unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +unsigned int i_blocks_per_folio(const struct inode *inode, + const struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5d3a0cccc6bf..63be5a451627 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch) fbatch->i = 0; } -static inline unsigned int folio_batch_count(struct folio_batch *fbatch) +static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } -static inline unsigned int folio_batch_space(struct folio_batch *fbatch) +static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } diff --git a/include/linux/panic.h b/include/linux/panic.h index 7be742628c25..6f972a66c13e 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -43,6 +43,12 @@ void abort(void); extern atomic_t panic_cpu; #define PANIC_CPU_INVALID -1 +bool panic_try_start(void); +void panic_reset(void); +bool panic_in_progress(void); +bool panic_on_this_cpu(void); +bool panic_on_other_cpu(void); + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 8a7f4f802c57..38a82d65e58e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, if (static_key_enabled(&mem_profiling_compressed)) { pgalloc_tag_idx idx; - idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx = (page->flags.f >> alloc_tag_ref_offs) & + alloc_tag_ref_mask; idx_to_ref(idx, ref); handle->page = page; } else { @@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code idx = (unsigned long)ref_to_idx(ref); idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; do { - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); flags = old_flags; flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); flags |= idx; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } else { if (WARN_ON(!handle.ref || !ref)) return; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 25a7257052ff..32e8457ad535 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1975,6 +1975,32 @@ static inline bool arch_has_pfn_modify_check(void) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; +enum pgtable_level { + PGTABLE_LEVEL_PTE = 0, + PGTABLE_LEVEL_PMD, + PGTABLE_LEVEL_PUD, + PGTABLE_LEVEL_P4D, + PGTABLE_LEVEL_PGD, +}; + +static inline const char *pgtable_level_to_str(enum pgtable_level level) +{ + switch (level) { + case PGTABLE_LEVEL_PTE: + return "pte"; + case PGTABLE_LEVEL_PMD: + return "pmd"; + case PGTABLE_LEVEL_PUD: + return "pud"; + case PGTABLE_LEVEL_P4D: + return "p4d"; + case PGTABLE_LEVEL_PGD: + return "pgd"; + default: + return "unknown"; + } +} + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/include/linux/printk.h b/include/linux/printk.h index 5d22b803f51e..45c663124c9b 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -330,8 +330,6 @@ static inline bool 
pr_flush(int timeout_ms, bool reset_on_progress) #endif -bool this_cpu_in_panic(void); - #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6cd020eea37a..daa92a58585d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -394,18 +394,8 @@ typedef int __bitwise rmap_t; /* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) -/* - * Internally, we're using an enum to specify the granularity. We make the - * compiler emit specialized code for each granularity. - */ -enum rmap_level { - RMAP_LEVEL_PTE = 0, - RMAP_LEVEL_PMD, - RMAP_LEVEL_PUD, -}; - -static inline void __folio_rmap_sanity_checks(const struct folio *folio, - const struct page *page, int nr_pages, enum rmap_level level) +static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum pgtable_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: /* * We don't support folios larger than a single PMD yet. So - * when RMAP_LEVEL_PMD is set, we assume that we are creating + * when PGTABLE_LEVEL_PMD is set, we assume that we are creating * a single "entire" mapping of the folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Assume that we are creating a single "entire" mapping of the * folio. 
@@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); break; default: - VM_WARN_ON_ONCE(true); + BUILD_BUG(); } /* @@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio) static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - enum rmap_level level) + enum pgtable_level level) { const int orig_nr_pages = nr_pages; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { atomic_inc(&folio->_mapcount); break; @@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } } @@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, static inline void folio_dup_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE); } /** @@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif @@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, enum rmap_level level) + struct vm_area_struct *src_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * copying if the folio maybe pinned. 
*/ switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (unlikely(maybe_pinned)) { for (i = 0; i < nr_pages; i++) if (PageAnonExclusive(page + i)) @@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; @@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } return 0; } @@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, - src_vma, RMAP_LEVEL_PTE); + src_vma, PGTABLE_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, @@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, - src_vma, RMAP_LEVEL_PMD); + src_vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; @@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, } static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, enum pgtable_level level) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); @@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, static inline int folio_try_share_anon_rmap_pte(struct folio *folio, struct page *page) { - return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); + return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE); } /** @@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; @@ -928,6 +922,11 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) +/* Result flags */ + +/* The page is mapped across page table boundary */ +#define PVMW_PGTABLE_CROSSED (1 << 16) + struct page_vma_mapped_walk { unsigned long pfn; unsigned long nr_pages; diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa9f1021541e..ede4c6bf6f22 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) return READ_ONCE(lock->owner) != NULL; } +#ifdef CONFIG_RT_MUTEXES +#define RT_MUTEX_HAS_WAITERS 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); +} 
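With rt_mutex_owner() and RT_MUTEX_HAS_WAITERS now exposed in <linux/rtmutex.h>, code outside kernel/locking can read the owner task with the waiters bit masked off (NULL means unlocked). A hedged sketch only; the assertion helper below is hypothetical:

#include <linux/bug.h>
#include <linux/rtmutex.h>
#include <linux/sched.h>

#ifdef CONFIG_RT_MUTEXES
/* Hypothetical debug helper: warn if the current task does not own @lock.
 * rt_mutex_owner() strips RT_MUTEX_HAS_WAITERS and returns NULL when the
 * lock is not held. */
static inline void assert_rt_mutex_owned_by_me(struct rt_mutex_base *lock)
{
	WARN_ON_ONCE(rt_mutex_owner(lock) != current);
}
#endif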
+#endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6f8a4965f9b9..29f6ceb98d74 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { + VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE)); sg_assign_page(sg, page); sg->offset = offset; sg->length = len; @@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter, */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { - return nth_page(sg_page(piter->sg), piter->sg_pgoffset); + return sg_page(piter->sg) + piter->sg_pgoffset; } /** diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 6eb65ceed213..b7fafe999073 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,6 +8,20 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ +static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +{ + /* + * By convention, dumpable bits are contained in first 32 bits of the + * bitmap, so we can simply access this first unsigned long directly. + */ + return __mm_flags_get_word(mm); +} + +static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) +{ + __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); +} + extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags) static inline int get_dumpable(struct mm_struct *mm) { - return __get_dumpable(mm->flags); + unsigned long flags = __mm_flags_get_dumpable(mm); + + return __get_dumpable(flags); } #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2201da0afecc..0232d983b715 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -178,7 +178,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack); + const struct rlimit *rlim_stack); unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -211,7 +211,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack) {} + const struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 34d6a0e108c3..525aa2a632b2 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. + * Nests inside of read_lock(&tasklist_lock). It must not be nested with + * write_lock_irq(&tasklist_lock), neither inside nor outside. 
*/ static inline void task_lock(struct task_struct *p) { diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 6d0f9c599ff7..0e47465ef0fd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -99,9 +99,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM -bool shmem_mapping(struct address_space *mapping); +bool shmem_mapping(const struct address_space *mapping); #else -static inline bool shmem_mapping(struct address_space *mapping) +static inline bool shmem_mapping(const struct address_space *mapping) { return false; } diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..cf443f064a66 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -335,6 +335,37 @@ struct kmem_cache_args { * %NULL means no constructor. */ void (*ctor)(void *); + /** + * @sheaf_capacity: Enable sheaves of given capacity for the cache. + * + * With a non-zero value, allocations from the cache go through caching + * arrays called sheaves. Each cpu has a main sheaf that's always + * present, and a spare sheaf that may be not present. When both become + * empty, there's an attempt to replace an empty sheaf with a full sheaf + * from the per-node barn. + * + * When no full sheaf is available, and gfp flags allow blocking, a + * sheaf is allocated and filled from slab(s) using bulk allocation. + * Otherwise the allocation falls back to the normal operation + * allocating a single object from a slab. + * + * Analogically when freeing and both percpu sheaves are full, the barn + * may replace it with an empty sheaf, unless it's over capacity. In + * that case a sheaf is bulk freed to slab pages. + * + * The sheaves do not enforce NUMA placement of objects, so allocations + * via kmem_cache_alloc_node() with a node specified other than + * NUMA_NO_NODE will bypass them. + * + * Bulk allocation and free operations also try to use the cpu sheaves + * and barn, but fallback to using slab pages directly. + * + * When slub_debug is enabled for the cache, the sheaf_capacity argument + * is ignored. + * + * %0 means no sheaves will be created. + */ + unsigned int sheaf_capacity; }; struct kmem_cache *__kmem_cache_create_args(const char *name, @@ -465,11 +496,16 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc_noprof(const void *objp, size_t new_size, - gfp_t flags) __realloc_size(2); -#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) +void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size, + unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE) +#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__)) +#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n) +#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE) void kfree(const void *objp); +void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -798,6 +834,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) 
alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); + +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size); + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); + +void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp, + struct slab_sheaf *sheaf) __assume_slab_alignment __malloc; +#define kmem_cache_alloc_from_sheaf(...) \ + alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__)) + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf); + /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call @@ -910,6 +962,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) + #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) @@ -1041,18 +1096,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1); -#define kvmalloc_node_noprof(size, flags, node) \ - __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node) -#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) - -#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) __alloc_size(1); +#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \ + __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node) +#define kvmalloc_node_align(...) \ + alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__)) +#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n) +#define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) + #define kmem_buckets_valloc(_b, _size, _flags) \ - alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) + alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) @@ -1062,7 +1119,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return kvmalloc_node_noprof(bytes, flags, node); + return kvmalloc_node_align_noprof(bytes, 1, flags, node); } #define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE) @@ -1073,9 +1130,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) 
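The sheaf_capacity documentation and the prefill declarations above suggest a usage pattern along these lines. A rough sketch only: the cache name, object type and sheaf sizes are made up, and the IS_ERR_OR_NULL() check is a conservative assumption about the failure convention of kmem_cache_prefill_sheaf():

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct my_obj {
	unsigned long val;
};

static struct kmem_cache *my_cache;

static int my_cache_init(void)
{
	struct kmem_cache_args args = {
		.sheaf_capacity = 32,	/* enable percpu sheaves */
	};

	my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
				     &args, 0);
	return my_cache ? 0 : -ENOMEM;
}

/* Pre-fill a sheaf with at least 8 objects, allocate one from it, and
 * hand the sheaf (with any unused objects) back to the cache. */
static struct my_obj *my_obj_alloc_prefilled(void)
{
	struct slab_sheaf *sheaf;
	struct my_obj *obj;

	sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, 8);
	if (IS_ERR_OR_NULL(sheaf))	/* failure convention assumed */
		return NULL;

	obj = kmem_cache_alloc_from_sheaf(my_cache, GFP_KERNEL, sheaf);
	kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);
	return obj;
}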
alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define kvrealloc_node_align(...) \ + alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__)) +#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n) +#define kvrealloc(...) kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE) extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7012a0f758d8..e818fbade1e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -236,40 +236,6 @@ enum { #define SWAP_CONT_MAX 0x7f /* Max count */ /* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The flags field determines if a cluster is free. This is - * protected by cluster lock. - */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * other than list, and swap_info_struct->swap_map - * elements corresponding to the swap cluster. - */ - u16 count; - u8 flags; - u8 order; - struct list_head list; -}; - -/* All on-list cluster must have a non-zero flag. */ -enum swap_cluster_flags { - CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ - CLUSTER_FLAG_FREE, - CLUSTER_FLAG_NONFULL, - CLUSTER_FLAG_FRAG, - /* Clusters with flags above are allocatable */ - CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, - CLUSTER_FLAG_FULL, - CLUSTER_FLAG_DISCARD, - CLUSTER_FLAG_MAX, -}; - -/* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as @@ -310,7 +276,6 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ @@ -321,11 +286,8 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, lowest_bit, highest_bit, - * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc, - * highest_alloc, free/discard cluster - * list. other fields are only changed + * swap_map, inuse_pages and all cluster + * lists. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. 
If @@ -517,10 +479,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); @@ -530,11 +489,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return NULL; -} - static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38..92f80b4d69a6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -2,6 +2,8 @@ #ifndef VM_EVENT_ITEM_H_INCLUDED #define VM_EVENT_ITEM_H_INCLUDED +#include <linux/thread_info.h> + #ifdef CONFIG_ZONE_DMA #define DMA_ZONE(xx) xx##_DMA, #else diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2759dac6be44..eb54b7b3202f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) -void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) +void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, + unsigned long align, gfp_t flags, int nid) __realloc_size(2); +#define vrealloc_node_noprof(_p, _s, _f, _nid) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) +#define vrealloc_noprof(_p, _s, _f) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) +#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) +#define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) +#define vrealloc(...) 
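The krealloc/kvmalloc/kvrealloc/vrealloc families now funnel through *_node_align variants that take an explicit alignment and NUMA node. A small illustrative sketch, assuming a caller that wants a cacheline-aligned buffer which may be kmalloc- or vmalloc-backed; the helper names are hypothetical:

#include <linux/cache.h>
#include <linux/numa.h>
#include <linux/slab.h>

/* Hypothetical helpers: a cacheline-aligned table that may end up
 * kmalloc- or vmalloc-backed, grown with kvrealloc_node_align(). */
static void *table_alloc(size_t size)
{
	return kvmalloc_node_align(size, L1_CACHE_BYTES,
				   GFP_KERNEL, NUMA_NO_NODE);
}

static void *table_grow(void *table, size_t new_size)
{
	return kvrealloc_node_align(table, new_size, L1_CACHE_BYTES,
				    GFP_KERNEL, NUMA_NO_NODE);
}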
alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 15a4bc4ab819..22dd4adc5667 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,12 +362,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); -typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, - void *data); - -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/include/linux/zpool.h b/include/linux/zpool.h deleted file mode 100644 index 369ef068fad8..000000000000 --- a/include/linux/zpool.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for the zswap compressed memory storage - * implementations. - */ - -#ifndef _ZPOOL_H_ -#define _ZPOOL_H_ - -struct zpool; - -bool zpool_has_pool(char *type); - -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); - -const char *zpool_get_type(struct zpool *pool); - -void zpool_destroy_pool(struct zpool *pool); - -int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - -void zpool_free(struct zpool *pool, unsigned long handle); - -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy); - -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem); - -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len); - -u64 zpool_get_total_pages(struct zpool *pool); - - -/** - * struct zpool_driver - driver implementation for zpool - * @type: name of the driver. - * @list: entry in the list of zpool drivers. - * @create: create a new pool. - * @destroy: destroy a pool. - * @malloc: allocate mem from a pool. - * @free: free mem from a pool. - * @sleep_mapped: whether zpool driver can sleep during map. - * @map: map a handle. - * @unmap: unmap a handle. - * @total_size: get total size of a pool. - * - * This is created by a zpool implementation and registered - * with zpool. 
- */ -struct zpool_driver { - char *type; - struct module *owner; - atomic_t refcount; - struct list_head list; - - void *(*create)(const char *name, gfp_t gfp); - void (*destroy)(void *pool); - - int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - void (*free)(void *pool, unsigned long handle); - - void *(*obj_read_begin)(void *pool, unsigned long handle, - void *local_copy); - void (*obj_read_end)(void *pool, unsigned long handle, - void *handle_mem); - void (*obj_write)(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len); - - u64 (*total_pages)(void *pool); -}; - -void zpool_register_driver(struct zpool_driver *driver); - -int zpool_unregister_driver(struct zpool_driver *driver); - -bool zpool_can_sleep_mapped(struct zpool *pool); - -#endif diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 383c09f583ac..37195edf2498 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -38,25 +38,32 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned long count, unsigned int align), + TP_PROTO(const char *name, unsigned long request_count, unsigned long available_count, + unsigned long total_count, unsigned int align), - TP_ARGS(name, count, align), + TP_ARGS(name, request_count, available_count, total_count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned long, count) + __field(unsigned long, request_count) + __field(unsigned long, available_count) + __field(unsigned long, total_count) __field(unsigned int, align) ), TP_fast_assign( __assign_str(name); - __entry->count = count; + __entry->request_count = request_count; + __entry->available_count = available_count; + __entry->total_count = total_count; __entry->align = align; ), - TP_printk("name=%s count=%lu align=%u", + TP_printk("name=%s request_count=%lu available_count=%lu total_count=%lu align=%u", __get_str(name), - __entry->count, + __entry->request_count, + __entry->available_count, + __entry->total_count, __entry->align) ); diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 2305df6cb485..dd94d14a2427 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -19,7 +19,6 @@ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \ - EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ EM( SCAN_SCAN_ABORT, "scan_aborted") \ @@ -55,15 +54,14 @@ SCAN_STATUS TRACE_EVENT(mm_khugepaged_scan_pmd, - TP_PROTO(struct mm_struct *mm, struct folio *folio, bool writable, + TP_PROTO(struct mm_struct *mm, struct folio *folio, int referenced, int none_or_zero, int status, int unmapped), - TP_ARGS(mm, folio, writable, referenced, none_or_zero, status, unmapped), + TP_ARGS(mm, folio, referenced, none_or_zero, status, unmapped), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(unsigned long, pfn) - __field(bool, writable) __field(int, referenced) __field(int, none_or_zero) __field(int, status) @@ -73,17 +71,15 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, TP_fast_assign( __entry->mm = mm; __entry->pfn = folio ? 
folio_pfn(folio) : -1; - __entry->writable = writable; __entry->referenced = referenced; __entry->none_or_zero = none_or_zero; __entry->status = status; __entry->unmapped = unmapped; ), - TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", + TP_printk("mm=%p, scan_pfn=0x%lx, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", __entry->mm, __entry->pfn, - __entry->writable, __entry->referenced, __entry->none_or_zero, __print_symbolic(__entry->status, SCAN_STATUS), @@ -117,15 +113,14 @@ TRACE_EVENT(mm_collapse_huge_page, TRACE_EVENT(mm_collapse_huge_page_isolate, TP_PROTO(struct folio *folio, int none_or_zero, - int referenced, bool writable, int status), + int referenced, int status), - TP_ARGS(folio, none_or_zero, referenced, writable, status), + TP_ARGS(folio, none_or_zero, referenced, status), TP_STRUCT__entry( __field(unsigned long, pfn) __field(int, none_or_zero) __field(int, referenced) - __field(bool, writable) __field(int, status) ), @@ -133,15 +128,13 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, __entry->pfn = folio ? folio_pfn(folio) : -1; __entry->none_or_zero = none_or_zero; __entry->referenced = referenced; - __entry->writable = writable; __entry->status = status; ), - TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s", + TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s", __entry->pfn, __entry->none_or_zero, __entry->referenced, - __entry->writable, __print_symbolic(__entry->status, SCAN_STATUS)) ); diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 474358773abe..7f93e754da5c 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -22,6 +22,7 @@ TRACE_EVENT(kmem_cache_alloc, TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) + __string( name, s->name ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) @@ -32,6 +33,7 @@ TRACE_EVENT(kmem_cache_alloc, TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; + __assign_str(name); __entry->bytes_req = s->object_size; __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; @@ -41,9 +43,10 @@ TRACE_EVENT(kmem_cache_alloc, (s->flags & SLAB_ACCOUNT)) : false; ), - TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", + TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, + __get_str(name), __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h index fe33a255b7d0..ea6b5c4baf3d 100644 --- a/include/trace/events/page_ref.h +++ b/include/trace/events/page_ref.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; @@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; diff --git a/include/trace/events/readahead.h 
b/include/trace/events/readahead.h new file mode 100644 index 000000000000..0997ac5eceab --- /dev/null +++ b/include/trace/events/readahead.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM readahead + +#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_READAHEAD_H + +#include <linux/types.h> +#include <linux/tracepoint.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/pagemap.h> + +TRACE_EVENT(page_cache_ra_unbounded, + TP_PROTO(struct inode *inode, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size), + + TP_ARGS(inode, index, nr_to_read, lookahead_size), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned long, nr_to_read) + __field(unsigned long, lookahead_size) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->nr_to_read = nr_to_read; + __entry->lookahead_size = lookahead_size; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu nr_to_read=%lu lookahead_size=%lu", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->nr_to_read, __entry->lookahead_size + ) +); + +TRACE_EVENT(page_cache_ra_order, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra), + + TP_ARGS(inode, index, ra), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned int, order) + __field(unsigned int, size) + __field(unsigned int, async_size) + __field(unsigned int, ra_pages) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->order = ra->order; + __entry->size = ra->size; + __entry->async_size = ra->async_size; + __entry->ra_pages = ra->ra_pages; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu order=%u size=%u async_size=%u ra_pages=%u", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->order, __entry->size, + __entry->async_size, __entry->ra_pages + ) +); + +DECLARE_EVENT_CLASS(page_cache_ra_op, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + + TP_ARGS(inode, index, ra, req_count), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned int, order) + __field(unsigned int, size) + __field(unsigned int, async_size) + __field(unsigned int, ra_pages) + __field(unsigned int, mmap_miss) + __field(loff_t, prev_pos) + __field(unsigned long, req_count) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->order = ra->order; + __entry->size = ra->size; + __entry->async_size = ra->async_size; + __entry->ra_pages = ra->ra_pages; + __entry->mmap_miss = ra->mmap_miss; + __entry->prev_pos = ra->prev_pos; + __entry->req_count = req_count; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u mmap_miss=%u prev_pos=%lld", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->req_count, __entry->order, + __entry->size, __entry->async_size, __entry->ra_pages, + __entry->mmap_miss, __entry->prev_pos + ) +); + +DEFINE_EVENT(page_cache_ra_op, page_cache_sync_ra, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + 
unsigned long req_count), + TP_ARGS(inode, index, ra, req_count) +); + +DEFINE_EVENT(page_cache_ra_op, page_cache_async_ra, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + TP_ARGS(inode, index, ra, req_count) +); + +#endif /* _TRACE_FILEMAP_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 1f9bb10d1a47..8fbbe613611a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -66,10 +66,16 @@ enum { #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. + * Enabling zone reclaim means the page allocator will attempt to fulfill + * the allocation request on the current node by triggering reclaim and + * trying to shrink the current node. + * Fallback allocations on the next candidates in the zonelist are considered + * when reclaim fails to free up enough memory in the current node/zone. + * + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl. + * New bits are OK, but existing bits should not be changed. */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index ed3aed264aeb..51c4e8c82b1e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE, MADV_COLLAPSE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* |
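Per the comment above, PR_GET_THP_DISABLE can now return the disable state with the new flag ORed into bit 1. A minimal userspace sketch of testing for the "disabled except when explicitly advised" mode; the fallback #define only guards against building with older uapi headers:

#include <sys/prctl.h>
#include <linux/prctl.h>

#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)	/* fallback for older headers */
#endif

/* Returns 1 if THP is disabled for this process except where explicitly
 * advised, 0 otherwise, -1 on error. */
static int thp_disabled_except_advised(void)
{
	int ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);

	if (ret < 0)
		return -1;
	return (ret & 1) && (ret & PR_THP_DISABLE_EXCEPT_ADVISED);
}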