From 1f7ef657740344541645349a8bece90cbff898f5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 24 Feb 2021 12:01:42 -0800 Subject: mm/filemap: remove unused parameter and change to void type for replace_page_cache_page() Since commit 74d609585d8b ("page cache: Add and replace pages using the XArray") was merged, the replace_page_cache_page() can not fail and always return 0, we can remove the redundant return value and void it. Moreover remove the unused gfp_mask. Link: https://lkml.kernel.org/r/609c30e5274ba15d8b90c872fd0d8ac437a9b2bb.1610071401.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d5570deff400..74e466e5a2ba 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -757,7 +757,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); -int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); -- cgit v1.2.3 From 4805462598113f350838d612d0895db2dbb3992b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:02 -0800 Subject: mm/filemap: pass a sleep state to put_and_wait_on_page_locked This is prep work for the next patch, but I think at least one of the current callers would prefer a killable sleep to an uninterruptible one. Link: https://lkml.kernel.org/r/20210122160140.223228-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 7 +++++-- mm/huge_memory.c | 4 ++-- mm/migrate.c | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 74e466e5a2ba..bd629d676a27 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -681,8 +681,7 @@ static inline int wait_on_page_locked_killable(struct page *page) return wait_on_page_bit_killable(compound_head(page), PG_locked); } -extern void put_and_wait_on_page_locked(struct page *page); - +int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); diff --git a/mm/filemap.c b/mm/filemap.c index 4fce9735e172..389a20eec9b8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1384,20 +1384,23 @@ static int wait_on_page_locked_async(struct page *page, /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. + * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @page. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the page to * come unlocked. 
After this function returns, the caller should not * dereference @page. + * + * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. */ -void put_and_wait_on_page_locked(struct page *page) +int put_and_wait_on_page_locked(struct page *page, int state) { wait_queue_head_t *q; page = compound_head(page); q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); + return wait_on_page_bit_common(q, page, PG_locked, state, DROP); } /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 91ca9b103ee5..f655137536eb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1439,7 +1439,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } @@ -1475,7 +1475,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 20ca887ea769..4ec727adfaa2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!get_page_unless_zero(page)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); -- cgit v1.2.3 From 87fa0f3eb267eed966ee194907bc15376c1b758f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 12:02:42 -0800 Subject: mm/filemap: rename generic_file_buffered_read to filemap_read Rename generic_file_buffered_read to match the naming of filemap_fault, also update the written parameter to a more descriptive name and improve the kerneldoc comment. 
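A minimal usage sketch (not part of the patch; the wrapper name is hypothetical, while the filemap_read() signature matches the declaration added to linux/fs.h in this series) of how a filesystem ->read_iter method hands off to the renamed helper:

#include <linux/fs.h>

static ssize_t example_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t already_read = 0;	/* e.g. bytes copied by a direct I/O fast path */

	/*
	 * filemap_read() copies from the page cache and returns the total
	 * number of bytes copied, including @already_read, or a negative
	 * errno if nothing was copied.
	 */
	return filemap_read(iocb, to, already_read);
}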
Link: https://lkml.kernel.org/r/20210122160140.223228-18-willy@infradead.org Signed-off-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- include/linux/fs.h | 4 ++-- mm/filemap.c | 35 ++++++++++++++++------------------- 3 files changed, 19 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index be9e3900cce8..bf2c51a9607a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3634,7 +3634,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } - return generic_file_buffered_read(iocb, to, ret); + return filemap_read(iocb, to, ret); } const struct file_operations btrfs_file_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 418b772d6eca..ec8f3ddf4a6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3080,8 +3080,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -extern ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *to, ssize_t already_read); +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, + ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 1358cb061fd6..28c290da22c1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2394,23 +2394,20 @@ err: } /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read - * @iter: data destination - * @written: already copied + * filemap_read - Read data from the page cache. + * @iocb: The iocb to read. + * @iter: Destination for the data. + * @already_read: Number of bytes already read by the caller. * - * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level stuff. + * Copies data from the page cache. If the data is not currently present, + * uses the readahead and readpage address_space operations to fetch it. * - * This is really ugly. But the goto's actually try to clarify some - * of the logic when it comes to error handling etc. - * - * Return: - * * total number of bytes copied, including those the were already @written - * * negative error code if nothing was copied + * Return: Total number of bytes copied, including those already read by + * the caller. If an error happens before any bytes are copied, returns + * a negative error number. */ -ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, + ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; @@ -2437,7 +2434,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. 
*/ - if ((iocb->ki_flags & IOCB_WAITQ) && written) + if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; error = filemap_get_pages(iocb, iter, &pvec); @@ -2497,7 +2494,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, copied = copy_page_to_iter(page, offset, bytes, iter); - written += copied; + already_read += copied; iocb->ki_pos += copied; ra->prev_pos = iocb->ki_pos; @@ -2514,9 +2511,9 @@ put_pages: file_accessed(filp); - return written ? written : error; + return already_read ? already_read : error; } -EXPORT_SYMBOL_GPL(generic_file_buffered_read); +EXPORT_SYMBOL_GPL(filemap_read); /** * generic_file_read_iter - generic filesystem read routine @@ -2591,7 +2588,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - retval = generic_file_buffered_read(iocb, iter, retval); + retval = filemap_read(iocb, iter, retval); out: return retval; } -- cgit v1.2.3 From 2e9bd483159939ed2c0704b914294653c8341d25 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 24 Feb 2021 12:03:11 -0800 Subject: mm: memcg/slab: pre-allocate obj_cgroups for slab caches with SLAB_ACCOUNT In general it's unknown in advance if a slab page will contain accounted objects or not. In order to avoid memory waste, an obj_cgroup vector is allocated dynamically when a need to account of a new object arises. Such approach is memory efficient, but requires an expensive cmpxchg() to set up the memcg/objcgs pointer, because an allocation can race with a different allocation on another cpu. But in some common cases it's known for sure that a slab page will contain accounted objects: if the page belongs to a slab cache with a SLAB_ACCOUNT flag set. It includes such popular objects like vm_area_struct, anon_vma, task_struct, etc. In such cases we can pre-allocate the objcgs vector and simple assign it to the page without any atomic operations, because at this early stage the page is not visible to anyone else. A very simplistic benchmark (allocating 10000000 64-bytes objects in a row) shows ~15% win. In the real life it seems that most workloads are not very sensitive to the speed of (accounted) slab allocations. [guro@fb.com: open-code set_page_objcgs() and add some comments, by Johannes] Link: https://lkml.kernel.org/r/20201113001926.GA2934489@carbon.dhcp.thefacebook.com [akpm@linux-foundation.org: fix it for mm-slub-call-account_slab_page-after-slab-page-initialization-fix.patch] Link: https://lkml.kernel.org/r/20201110195753.530157-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 ------------------- mm/memcontrol.c | 23 +++++++++++++++++++---- mm/slab.c | 2 +- mm/slab.h | 14 ++++++++++---- mm/slub.c | 2 +- 5 files changed, 31 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eeb0b52203e9..7a4dd1cb19fe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -475,19 +475,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } -/* - * set_page_objcgs - associate a page with a object cgroups vector - * @page: a pointer to the page struct - * @objcgs: a pointer to the object cgroups vector - * - * Atomically associates a page with a vector of object cgroups. 
- */ -static inline bool set_page_objcgs(struct page *page, - struct obj_cgroup **objcgs) -{ - return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | - MEMCG_DATA_OBJCGS); -} #else static inline struct obj_cgroup **page_objcgs(struct page *page) { @@ -498,12 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) { return NULL; } - -static inline bool set_page_objcgs(struct page *page, - struct obj_cgroup **objcgs) -{ - return true; -} #endif static __always_inline bool memcg_stat_item_in_bytes(int idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b9bd354e97e..60ce452e42e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2935,9 +2935,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp) + gfp_t gfp, bool new_page) { unsigned int objects = objs_per_slab_page(s, page); + unsigned long memcg_data; void *vec; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, @@ -2945,11 +2946,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (!set_page_objcgs(page, vec)) + memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; + if (new_page) { + /* + * If the slab page is brand new and nobody can yet access + * it's memcg_data, no synchronization is required and + * memcg_data can be simply assigned. + */ + page->memcg_data = memcg_data; + } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + /* + * If the slab page is already in use, somebody can allocate + * and assign obj_cgroups in parallel. In this case the existing + * objcg vector should be reused. + */ kfree(vec); - else - kmemleak_not_leak(vec); + return 0; + } + kmemleak_not_leak(vec); return 0; } diff --git a/mm/slab.c b/mm/slab.c index cdad64b2836d..9c9e0c255c4b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1379,7 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - account_slab_page(page, cachep->gfporder, cachep); + account_slab_page(page, cachep->gfporder, cachep, flags); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(page)) diff --git a/mm/slab.h b/mm/slab.h index 48dc466a3791..076582f58f68 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -238,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp); + gfp_t gfp, bool new_page); static inline void memcg_free_page_obj_cgroups(struct page *page) { @@ -315,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, page = virt_to_head_page(p[i]); if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags)) { + memcg_alloc_page_obj_cgroups(page, s, flags, + false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } @@ -379,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) } static inline int memcg_alloc_page_obj_cgroups(struct page *page, - struct kmem_cache *s, gfp_t gfp) + struct kmem_cache *s, gfp_t gfp, + bool new_page) { return 0; } @@ -420,8 +422,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) } static __always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s) + struct kmem_cache *s, + gfp_t gfp) { + if (memcg_kmem_enabled() && (s->flags & 
SLAB_ACCOUNT)) + memcg_alloc_page_obj_cgroups(page, s, gfp, true); + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), PAGE_SIZE << order); } diff --git a/mm/slub.c b/mm/slub.c index e0fef8f8d4bf..ca566361561e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1785,7 +1785,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); - account_slab_page(page, oo_order(oo), s); + account_slab_page(page, oo_order(oo), s, flags); page->slab_cache = s; __SetPageSlab(page); -- cgit v1.2.3 From f3344adf38bdb3107d40483dd9501215ad40edce Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:15 -0800 Subject: mm: memcontrol: optimize per-lruvec stats counter memory usage The vmstat threshold is 32 (MEMCG_CHARGE_BATCH), Actually the threshold can be as big as MEMCG_CHARGE_BATCH * PAGE_SIZE. It still fits into s32. So introduce struct batched_lruvec_stat to optimize memory usage. The size of struct lruvec_stat is 304 bytes on 64 bit systems. As it is a per-cpu structure. So with this patch, we can save 304 / 2 * ncpu bytes per-memcg per-node where ncpu is the number of the possible CPU. If there are c memory cgroup (include dying cgroup) and n NUMA node in the system. Finally, we can save (152 * ncpu * c * n) bytes. [akpm@linux-foundation.org: fix typo in comment] Link: https://lkml.kernel.org/r/20201210042121.39665-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Shakeel Butt Cc: Roman Gushchin Cc: Stephen Rothwell Cc: Chris Down Cc: Yafang Shao Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 ++++++++++++-- mm/memcontrol.c | 10 +++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7a4dd1cb19fe..41bbf71edd9f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -92,6 +92,10 @@ struct lruvec_stat { long count[NR_VM_NODE_STAT_ITEMS]; }; +struct batched_lruvec_stat { + s32 count[NR_VM_NODE_STAT_ITEMS]; +}; + /* * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. @@ -107,11 +111,17 @@ struct memcg_shrinker_map { struct mem_cgroup_per_node { struct lruvec lruvec; - /* Legacy local VM stats */ + /* + * Legacy local VM stats. This should be struct lruvec_stat and + * cannot be optimized to struct batched_lruvec_stat. Because + * the threshold of the lruvec_stat_cpu can be as big as + * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this + * filed has no upper limit. 
+ */ struct lruvec_stat __percpu *lruvec_stat_local; /* Subtree VM stats (batched updates) */ - struct lruvec_stat __percpu *lruvec_stat_cpu; + struct batched_lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60ce452e42e6..b259e7d8ce41 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5208,7 +5208,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) return 1; } - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, + pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, GFP_KERNEL_ACCOUNT); if (!pn->lruvec_stat_cpu) { free_percpu(pn->lruvec_stat_local); @@ -7093,6 +7093,14 @@ static int __init mem_cgroup_init(void) { int cpu, node; + /* + * Currently s32 type (can refer to struct batched_lruvec_stat) is + * used for per-memcg-per-cpu caching of per-node statistics. In order + * to work fine, we should make sure that the overfill threshold can't + * exceed S32_MAX / PAGE_SIZE. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); -- cgit v1.2.3 From 69473e5de87389be6c0fa4a5d574a50c8f904fb3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:23 -0800 Subject: mm: memcontrol: convert NR_ANON_THPS account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics especially THP vmstat counters. In the systems with hundreds of processors it can be GBs of memory. For example, for a 96 CPUs system, the threshold is the maximum number of 125. And the per cpu counters can cache 23.4375 GB in total. The THP page is already a form of batched addition (it will add 512 worth of memory in one go) so skipping the batching seems like sensible. Although every THP stats update overflows the per-cpu counter, resorting to atomic global updates. But it can make the statistics more accuracy for the THP vmstat counters. So we convert the NR_ANON_THPS account to pages. This patch is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also can make the unit of vmstat counters more unified. Finally, the unit of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB. The rest which is without suffix are pages. Link: https://lkml.kernel.org/r/20201228164110.2838-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Greg Kroah-Hartman Cc: Rafael. J. 
Wysocki Cc: Alexey Dobriyan Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Hugh Dickins Cc: Shakeel Butt Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Feng Tang Cc: NeilBrown Cc: Joonsoo Kim Cc: Randy Dunlap Cc: Michal Hocko Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 15 +++++++++------ fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 13 +++++++++++++ mm/huge_memory.c | 3 ++- mm/memcontrol.c | 20 ++++++-------------- mm/page_alloc.c | 2 +- mm/rmap.c | 6 +++--- mm/vmstat.c | 11 +++++++++-- 8 files changed, 44 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 04f71c7bc3f8..6da0c3508bc9 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -461,8 +461,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(sunreclaimable) #ifdef CONFIG_TRANSPARENT_HUGEPAGE , - nid, K(node_page_state(pgdat, NR_ANON_THPS) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_ANON_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * @@ -519,10 +518,14 @@ static ssize_t node_read_vmstat(struct device *dev, sum_zone_numa_state(nid, i)); #endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - len += sysfs_emit_at(buf, len, "%s %lu\n", - node_stat_name(i), - node_page_state_pages(pgdat, i)); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; + len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i), + pages); + } return len; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d6fc74619625..a635c8a84ddf 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -129,7 +129,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", - global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b593316bff3d..67d50ef5dd20 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -209,6 +209,19 @@ enum node_stat_item { NR_VM_NODE_STAT_ITEMS }; +/* + * Returns true if the item should be printed in THPs (/proc/vmstat + * currently prints number of anon, file and shmem THPs. But the item + * is charged in pages). + */ +static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return false; + + return item == NR_ANON_THPS; +} + /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f655137536eb..3691e863070a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2176,7 +2176,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. 
*/ - __dec_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, + -HPAGE_PMD_NR); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e31e47e7bab2..b2405f049006 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1533,7 +1533,7 @@ static struct memory_stat memory_stats[] = { * on some architectures, the macro of HPAGE_PMD_SIZE is not * constant(e.g. powerpc). */ - { "anon_thp", 0, NR_ANON_THPS }, + { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, { "file_thp", 0, NR_FILE_THPS }, { "shmem_thp", 0, NR_SHMEM_THPS }, #endif @@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_ANON_THPS || - memory_stats[i].idx == NR_FILE_THPS || + if (memory_stats[i].idx == NR_FILE_THPS || memory_stats[i].idx == NR_SHMEM_THPS) memory_stats[i].ratio = HPAGE_PMD_SIZE; #endif @@ -4087,10 +4086,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memcg1_stats[i] == NR_ANON_THPS) - nr *= HPAGE_PMD_NR; -#endif seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } @@ -4121,10 +4116,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memcg1_stats[i] == NR_ANON_THPS) - nr *= HPAGE_PMD_NR; -#endif seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * PAGE_SIZE); } @@ -5652,10 +5643,11 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); if (PageTransHuge(page)) { - __dec_lruvec_state(from_vec, NR_ANON_THPS); - __inc_lruvec_state(to_vec, NR_ANON_THPS); + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); } - } } else { __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef5070fed76b..2e2e47f8714b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5587,7 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), - K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), diff --git a/mm/rmap.c b/mm/rmap.c index 08c56aaf72eb..c4d5c63cfd29 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1144,7 +1144,7 @@ void do_page_add_anon_rmap(struct page *page, * disabled. 
*/ if (compound) - __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1186,7 +1186,7 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1292,7 +1292,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); if (TestClearPageDoubleMap(page)) { /* diff --git a/mm/vmstat.c b/mm/vmstat.c index f8942160fc95..07dc0af50cf0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1619,8 +1619,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, if (is_zone_first_populated(pgdat, zone)) { seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; seq_printf(m, "\n %-12s %lu", node_stat_name(i), - node_page_state_pages(pgdat, i)); + pages); } } seq_printf(m, @@ -1740,8 +1744,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v += NR_VM_NUMA_STAT_ITEMS; #endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { v[i] = global_node_page_state_pages(i); + if (vmstat_item_print_in_thp(i)) + v[i] /= HPAGE_PMD_NR; + } v += NR_VM_NODE_STAT_ITEMS; global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, -- cgit v1.2.3 From bf9ecead53c89d3d2cf60acbc460174ebbcf0027 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:27 -0800 Subject: mm: memcontrol: convert NR_FILE_THPS account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics especially THP vmstat counters. In the systems with if hundreds of processors it can be GBs of memory. For example, for a 96 CPUs system, the threshold is the maximum number of 125. And the per cpu counters can cache 23.4375 GB in total. The THP page is already a form of batched addition (it will add 512 worth of memory in one go) so skipping the batching seems like sensible. Although every THP stats update overflows the per-cpu counter, resorting to atomic global updates. But it can make the statistics more accuracy for the THP vmstat counters. So we convert the NR_FILE_THPS account to pages. This patch is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also can make the unit of vmstat counters more unified. Finally, the unit of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB. The rest which is without suffix are pages. Link: https://lkml.kernel.org/r/20201228164110.2838-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J. 
Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/filemap.c | 2 +- mm/huge_memory.c | 5 ++++- mm/khugepaged.c | 4 +++- mm/memcontrol.c | 5 ++--- 7 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 6da0c3508bc9..d5952f754911 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -466,8 +466,7 @@ static ssize_t node_read_meminfo(struct device *dev, HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), - nid, K(node_page_state(pgdat, NR_FILE_THPS) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_THPS)), nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) #endif diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a635c8a84ddf..7ea4679880c8 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -135,7 +135,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); show_val_kb(m, "FileHugePages: ", - global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67d50ef5dd20..b751a9898bb6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -219,7 +219,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; - return item == NR_ANON_THPS; + return item == NR_ANON_THPS || + item == NR_FILE_THPS; } /* diff --git a/mm/filemap.c b/mm/filemap.c index 820d81901eae..9efe059e2b58 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -208,7 +208,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageTransHuge(page)) __dec_lruvec_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { - __dec_lruvec_page_state(page, NR_FILE_THPS); + __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3691e863070a..77181faa2340 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2752,10 +2752,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { + int nr = thp_nr_pages(head); + if (PageSwapBacked(head)) __dec_lruvec_page_state(head, NR_SHMEM_THPS); else - __dec_lruvec_page_state(head, NR_FILE_THPS); + __mod_lruvec_page_state(head, NR_FILE_THPS, + -nr); } __split_huge_page(page, list, end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fb0fdaec34d5..c427fe2ca7ff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1643,6 +1643,7 @@ static void collapse_file(struct mm_struct *mm, XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); + int nr; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1854,11 +1855,12 @@ out_unlock: put_page(page); goto xa_unlocked; } + nr = thp_nr_pages(new_page); if (is_shmem) __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); else { - __inc_lruvec_page_state(new_page, NR_FILE_THPS); + 
__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2405f049006..fc552f57d3fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1534,7 +1534,7 @@ static struct memory_stat memory_stats[] = { * constant(e.g. powerpc). */ { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, - { "file_thp", 0, NR_FILE_THPS }, + { "file_thp", PAGE_SIZE, NR_FILE_THPS }, { "shmem_thp", 0, NR_SHMEM_THPS }, #endif { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, @@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_FILE_THPS || - memory_stats[i].idx == NR_SHMEM_THPS) + if (memory_stats[i].idx == NR_SHMEM_THPS) memory_stats[i].ratio = HPAGE_PMD_SIZE; #endif VM_BUG_ON(!memory_stats[i].ratio); -- cgit v1.2.3 From 57b2847d3c1dc154923578efb47a12302a57d700 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:31 -0800 Subject: mm: memcontrol: convert NR_SHMEM_THPS account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics especially THP vmstat counters. In the systems with hundreds of processors it can be GBs of memory. For example, for a 96 CPUs system, the threshold is the maximum number of 125. And the per cpu counters can cache 23.4375 GB in total. The THP page is already a form of batched addition (it will add 512 worth of memory in one go) so skipping the batching seems like sensible. Although every THP stats update overflows the per-cpu counter, resorting to atomic global updates. But it can make the statistics more accuracy for the THP vmstat counters. So we convert the NR_SHMEM_THPS account to pages. This patch is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also can make the unit of vmstat counters more unified. Finally, the unit of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB. The rest which is without suffix are pages. Link: https://lkml.kernel.org/r/20201228164110.2838-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J. 
Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/filemap.c | 2 +- mm/huge_memory.c | 3 ++- mm/khugepaged.c | 2 +- mm/memcontrol.c | 26 ++------------------------ mm/page_alloc.c | 2 +- mm/shmem.c | 2 +- 9 files changed, 12 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index d5952f754911..6d5ac6ffb6e1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -462,8 +462,7 @@ static ssize_t node_read_meminfo(struct device *dev, #ifdef CONFIG_TRANSPARENT_HUGEPAGE , nid, K(node_page_state(pgdat, NR_ANON_THPS)), - nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_SHMEM_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_FILE_THPS)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7ea4679880c8..cfb107eaa3e6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -131,7 +131,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", - global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); show_val_kb(m, "FileHugePages: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b751a9898bb6..788837f40b38 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,7 +220,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) return false; return item == NR_ANON_THPS || - item == NR_FILE_THPS; + item == NR_FILE_THPS || + item == NR_SHMEM_THPS; } /* diff --git a/mm/filemap.c b/mm/filemap.c index 9efe059e2b58..46a8b9e82434 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -206,7 +206,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); } else if (PageTransHuge(page)) { __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 77181faa2340..86a3015ba54f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2755,7 +2755,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = thp_nr_pages(head); if (PageSwapBacked(head)) - __dec_lruvec_page_state(head, NR_SHMEM_THPS); + __mod_lruvec_page_state(head, NR_SHMEM_THPS, + -nr); else __mod_lruvec_page_state(head, NR_FILE_THPS, -nr); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c427fe2ca7ff..75e246f680f4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1858,7 +1858,7 @@ out_unlock: nr = thp_nr_pages(new_page); if (is_shmem) - __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); + __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); else { __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc552f57d3fb..d3a0c59210e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1516,7 +1516,7 @@ struct memory_stat { unsigned int idx; }; -static struct memory_stat 
memory_stats[] = { +static const struct memory_stat memory_stats[] = { { "anon", PAGE_SIZE, NR_ANON_MAPPED }, { "file", PAGE_SIZE, NR_FILE_PAGES }, { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, @@ -1528,14 +1528,9 @@ static struct memory_stat memory_stats[] = { { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * The ratio will be initialized in memory_stats_init(). Because - * on some architectures, the macro of HPAGE_PMD_SIZE is not - * constant(e.g. powerpc). - */ { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, { "file_thp", PAGE_SIZE, NR_FILE_THPS }, - { "shmem_thp", 0, NR_SHMEM_THPS }, + { "shmem_thp", PAGE_SIZE, NR_SHMEM_THPS }, #endif { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, @@ -1560,23 +1555,6 @@ static struct memory_stat memory_stats[] = { { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, }; -static int __init memory_stats_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_SHMEM_THPS) - memory_stats[i].ratio = HPAGE_PMD_SIZE; -#endif - VM_BUG_ON(!memory_stats[i].ratio); - VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); - } - - return 0; -} -pure_initcall(memory_stats_init); - static char *memory_stat_format(struct mem_cgroup *memcg) { struct seq_buf s; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e2e47f8714b..df292d8e659b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5584,7 +5584,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK)), K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_THPS)), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_ANON_THPS)), diff --git a/mm/shmem.c b/mm/shmem.c index 7924b3bf46fb..ff741d229701 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -713,7 +713,7 @@ next: } if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); - __inc_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); } mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); -- cgit v1.2.3 From a1528e21f8915e16252cda1137fe29672c918361 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:35 -0800 Subject: mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics especially THP vmstat counters. In the systems with hundreds of processors it can be GBs of memory. For example, for a 96 CPUs system, the threshold is the maximum number of 125. And the per cpu counters can cache 23.4375 GB in total. The THP page is already a form of batched addition (it will add 512 worth of memory in one go) so skipping the batching seems like sensible. Although every THP stats update overflows the per-cpu counter, resorting to atomic global updates. But it can make the statistics more accuracy for the THP vmstat counters. So we convert the NR_SHMEM_PMDMAPPED account to pages. This patch is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also can make the unit of vmstat counters more unified. Finally, the unit of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB. 
The rest which is without suffix are pages. Link: https://lkml.kernel.org/r/20201228164110.2838-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J. Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 3 +-- mm/rmap.c | 14 ++++++++++---- 5 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 6d5ac6ffb6e1..7a66aefe4e46 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -463,8 +463,7 @@ static ssize_t node_read_meminfo(struct device *dev, , nid, K(node_page_state(pgdat, NR_ANON_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_THPS)), - nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), nid, K(node_page_state(pgdat, NR_FILE_THPS)), nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index cfb107eaa3e6..c61f440570f9 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -133,7 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", - global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 788837f40b38..7bdbfeeb5c8c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,7 +221,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) return item == NR_ANON_THPS || item == NR_FILE_THPS || - item == NR_SHMEM_THPS; + item == NR_SHMEM_THPS || + item == NR_SHMEM_PMDMAPPED; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df292d8e659b..069561aadc7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5585,8 +5585,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS)), - K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) - * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), diff --git a/mm/rmap.c b/mm/rmap.c index c4d5c63cfd29..1c1b576c0627 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1211,14 +1211,17 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + int nr_pages = thp_nr_pages(page); + + for (i = 0, nr = 0; i < nr_pages; i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; if (PageSwapBacked(page)) - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + nr_pages); else __inc_node_page_state(page, NR_FILE_PMDMAPPED); } else { @@ -1252,14 +1255,17 @@ static void 
page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone else? */ if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + int nr_pages = thp_nr_pages(page); + + for (i = 0, nr = 0; i < nr_pages; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) return; if (PageSwapBacked(page)) - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + -nr_pages); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { -- cgit v1.2.3 From 380780e71895ae301505ffcec8f954ab3666a4c7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:39 -0800 Subject: mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics especially THP vmstat counters. In the systems with hundreds of processors it can be GBs of memory. For example, for a 96 CPUs system, the threshold is the maximum number of 125. And the per cpu counters can cache 23.4375 GB in total. The THP page is already a form of batched addition (it will add 512 worth of memory in one go) so skipping the batching seems like sensible. Although every THP stats update overflows the per-cpu counter, resorting to atomic global updates. But it can make the statistics more accuracy for the THP vmstat counters. So we convert the NR_FILE_PMDMAPPED account to pages. This patch is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also can make the unit of vmstat counters more unified. Finally, the unit of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB. The rest which is without suffix are pages. Link: https://lkml.kernel.org/r/20201228164110.2838-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J. 
Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/rmap.c | 6 ++++-- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 7a66aefe4e46..d02d86aec19f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -465,8 +465,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), nid, K(node_page_state(pgdat, NR_FILE_THPS)), - nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * - HPAGE_PMD_NR) + nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED)) #endif ); len += hugetlb_report_node_meminfo(buf, len, nid); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c61f440570f9..6fa761c9cc78 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -137,7 +137,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", - global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7bdbfeeb5c8c..66d68e5d5a0f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -222,7 +222,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || - item == NR_SHMEM_PMDMAPPED; + item == NR_SHMEM_PMDMAPPED || + item == NR_FILE_PMDMAPPED; } /* diff --git a/mm/rmap.c b/mm/rmap.c index 1c1b576c0627..5ebf16fae4b9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1223,7 +1223,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, nr_pages); else - __inc_node_page_state(page, NR_FILE_PMDMAPPED); + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1267,7 +1268,8 @@ static void page_remove_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, -nr_pages); else - __dec_node_page_state(page, NR_FILE_PMDMAPPED); + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + -nr_pages); } else { if (!atomic_add_negative(-1, &page->_mapcount)) return; -- cgit v1.2.3 From b6038942480e574c697ea1a80019bbe586c1d654 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 24 Feb 2021 12:03:55 -0800 Subject: mm: memcg: add swapcache stat for memcg v2 This patch adds swapcache stat for the cgroup v2. The swapcache represents the memory that is accounted against both the memory and the swap limit of the cgroup. The main motivation behind exposing the swapcache stat is for enabling users to gracefully migrate from cgroup v1's memsw counter to cgroup v2's memory and swap counters. Cgroup v1's memsw limit allows users to limit the memory+swap usage of a workload but without control on the exact proportion of memory and swap. Cgroup v2 provides separate limits for memory and swap which enables more control on the exact usage of memory and swap individually for the workload. With some little subtleties, the v1's memsw limit can be switched with the sum of the v2's memory and swap limits. 
However the alternative for memsw usage is not yet available in cgroup v2. Exposing per-cgroup swapcache stat enables that alternative. Adding the memory usage and swap usage and subtracting the swapcache will approximate the memsw usage. This will help in the transparent migration of the workloads depending on memsw usage and limit to v2' memory and swap counters. The reasons these applications are still interested in this approximate memsw usage are: (1) these applications are not really interested in two separate memory and swap usage metrics. A single usage metric is more simple to use and reason about for them. (2) The memsw usage metric hides the underlying system's swap setup from the applications. Applications with multiple instances running in a datacenter with heterogeneous systems (some have swap and some don't) will keep seeing a consistent view of their usage. [akpm@linux-foundation.org: fix CONFIG_SWAP=n build] Link: https://lkml.kernel.org/r/20210108155813.2914586-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ drivers/base/node.c | 6 ++++++ include/linux/mmzone.h | 3 +++ include/linux/swap.h | 6 +++++- mm/memcontrol.c | 3 +++ mm/migrate.c | 6 ++++++ mm/swap_state.c | 28 ++-------------------------- mm/vmstat.c | 3 +++ 8 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c513eafaddea..9acc57f68f31 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1299,6 +1299,10 @@ PAGE_SIZE multiple when read back. Amount of cached filesystem data that was modified and is currently being written back to disk + swapcached + Amount of swap cached in memory. The swapcache is accounted + against both memory and swap usage. 
+ anon_thp Amount of memory used in anonymous mappings backed by transparent hugepages diff --git a/drivers/base/node.c b/drivers/base/node.c index d02d86aec19f..f449dbb2c746 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -372,14 +372,19 @@ static ssize_t node_read_meminfo(struct device *dev, struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; unsigned long sreclaimable, sunreclaimable; + unsigned long swapcached = 0; si_meminfo_node(&i, nid); sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B); sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B); +#ifdef CONFIG_SWAP + swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE); +#endif len = sysfs_emit_at(buf, len, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" + "Node %d SwapCached: %8lu kB\n" "Node %d Active: %8lu kB\n" "Node %d Inactive: %8lu kB\n" "Node %d Active(anon): %8lu kB\n" @@ -391,6 +396,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), + nid, K(swapcached), nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) + node_page_state(pgdat, NR_ACTIVE_FILE)), nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) + diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 66d68e5d5a0f..fc99e9241846 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -206,6 +206,9 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ +#ifdef CONFIG_SWAP + NR_SWAPCACHE, +#endif NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/swap.h b/include/linux/swap.h index 3f1f7ae0fbe9..0d64a2bc7e00 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,11 @@ extern struct address_space *swapper_spaces[]; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ >> SWAP_ADDRESS_SPACE_SHIFT]) -extern unsigned long total_swapcache_pages(void); +static inline unsigned long total_swapcache_pages(void) +{ + return global_node_page_state(NR_SWAPCACHE); +} + extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern void *get_shadow_from_swap_cache(swp_entry_t entry); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5435b370c929..58edfb6614b8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1521,6 +1521,9 @@ static const struct memory_stat memory_stats[] = { { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, { "file_writeback", NR_WRITEBACK }, +#ifdef CONFIG_SWAP + { "swapcached", NR_SWAPCACHE }, +#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE { "anon_thp", NR_ANON_THPS }, { "file_thp", NR_FILE_THPS }, diff --git a/mm/migrate.c b/mm/migrate.c index 4ec727adfaa2..62b81d5257aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping, __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } +#ifdef CONFIG_SWAP + if (PageSwapCache(page)) { + __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); + __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); + } +#endif if (dirty && mapping_can_writeback(mapping)) { __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); diff --git a/mm/swap_state.c b/mm/swap_state.c index 807b00eae969..c1a648d9092b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -68,32 +68,6 @@ static struct { unsigned long find_total; } swap_cache_info; 
-unsigned long total_swapcache_pages(void) -{ - unsigned int i, j, nr; - unsigned long ret = 0; - struct address_space *spaces; - struct swap_info_struct *si; - - for (i = 0; i < MAX_SWAPFILES; i++) { - swp_entry_t entry = swp_entry(i, 1); - - /* Avoid get_swap_device() to warn for bad swap entry */ - if (!swp_swap_info(entry)) - continue; - /* Prevent swapoff to free swapper_spaces */ - si = get_swap_device(entry); - if (!si) - continue; - nr = nr_swapper_spaces[i]; - spaces = swapper_spaces[i]; - for (j = 0; j < nr; j++) - ret += spaces[j].nrpages; - put_swap_device(si); - } - return ret; -} - static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) @@ -163,6 +137,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); ADD_CACHE_INFO(add_total, nr); unlock: xas_unlock_irq(&xas); @@ -203,6 +178,7 @@ void __delete_from_swap_cache(struct page *page, address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); ADD_CACHE_INFO(del_total, nr); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 07dc0af50cf0..a0e949542204 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1215,6 +1215,9 @@ const char * const vmstat_text[] = { "nr_shadow_call_stack", #endif "nr_page_table_pages", +#ifdef CONFIG_SWAP + "nr_swapcached", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", -- cgit v1.2.3 From c1a660dea3fa616420606f1e206e6d22f7e05c30 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 24 Feb 2021 12:03:58 -0800 Subject: mm: kmem: make __memcg_kmem_(un)charge static I've noticed that __memcg_kmem_charge() and __memcg_kmem_uncharge() are not used anywhere except memcontrol.c. Yet they are not declared as static and are declared in memcontrol.h. This patch makes them static.
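A minimal userspace sketch of the pattern applied here, with file-local helpers forward-declared as static so they never need a shared header, might look as follows (illustrative names only, not kernel code):

/* pool.c: illustrative only, the names are hypothetical */
#include <stdio.h>

/* Forward declarations keep the helpers file-local while letting their
 * definitions live anywhere later in this file. */
static int charge(unsigned int nr_pages);
static void uncharge(unsigned int nr_pages);

static unsigned long pool_used;

int main(void)
{
        if (!charge(4))
                printf("charged, pool_used=%lu\n", pool_used);
        uncharge(4);
        printf("uncharged, pool_used=%lu\n", pool_used);
        return 0;
}

static int charge(unsigned int nr_pages)
{
        pool_used += nr_pages;
        return 0;
}

static void uncharge(unsigned int nr_pages)
{
        pool_used -= nr_pages;
}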
Link: https://lkml.kernel.org/r/20210108020332.4096911-1-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 --- mm/memcontrol.c | 11 ++++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 41bbf71edd9f..7a38a1517a05 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1592,9 +1592,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, #endif #ifdef CONFIG_MEMCG_KMEM -int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages); -void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 58edfb6614b8..dce5291525a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) #ifdef CONFIG_MEMCG_KMEM extern spinlock_t css_set_lock; +static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages); +static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, + unsigned int nr_pages); + static void obj_cgroup_release(struct percpu_ref *ref) { struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); @@ -3087,8 +3092,8 @@ static void memcg_free_cache_id(int id) * * Returns 0 on success, an error code on failure. */ -int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) +static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages) { struct page_counter *counter; int ret; @@ -3120,7 +3125,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, * @memcg: memcg to uncharge * @nr_pages: number of pages to uncharge */ -void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->kmem, nr_pages); -- cgit v1.2.3 From 802f1d522d5fdaefc2b935141bc8fe03d43a99ab Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 24 Feb 2021 12:04:01 -0800 Subject: mm: page_counter: re-layout structure to reduce false sharing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When checking a memory cgroup related performance regression [1], from the perf c2c profiling data, we found high false sharing for accessing 'usage' and 'parent'. On 64 bit system, the 'usage' and 'parent' are close to each other, and easy to be in one cacheline (for cacheline size == 64+ B). 'usage' is usally written, while 'parent' is usually read as the cgroup's hierarchical counting nature. So move the 'parent' to the end of the structure to make sure they are in different cache lines. Following are some performance data with the patch, against v5.11-rc1. 
[ In the data, A means a platform with 2 sockets 48C/96T, B is a platform of 4 sockests 72C/144T, and if a %stddev will be shown bigger than 2%, P100/P50 means number of test tasks equals to 100%/50% of nr_cpu] will-it-scale/malloc1 --------------------- v5.11-rc1 v5.11-rc1+patch A-P100 15782 ± 2% -0.1% 15765 ± 3% will-it-scale.per_process_ops A-P50 21511 +8.9% 23432 will-it-scale.per_process_ops B-P100 9155 +2.2% 9357 will-it-scale.per_process_ops B-P50 10967 +7.1% 11751 ± 2% will-it-scale.per_process_ops will-it-scale/pagefault2 ------------------------ v5.11-rc1 v5.11-rc1+patch A-P100 79028 +3.0% 81411 will-it-scale.per_process_ops A-P50 183960 ± 2% +4.4% 192078 ± 2% will-it-scale.per_process_ops B-P100 85966 +9.9% 94467 ± 3% will-it-scale.per_process_ops B-P50 198195 +9.8% 217526 will-it-scale.per_process_ops fio (4k/1M is block size) ------------------------- v5.11-rc1 v5.11-rc1+patch A-P50-r-4k 16881 ± 2% +1.2% 17081 ± 2% fio.read_bw_MBps A-P50-w-4k 3931 +4.5% 4111 ± 2% fio.write_bw_MBps A-P50-r-1M 15178 -0.2% 15154 fio.read_bw_MBps A-P50-w-1M 3924 +0.1% 3929 fio.write_bw_MBps [1].https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Link: https://lkml.kernel.org/r/1611040814-33449-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_counter.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 85bd413e784e..679591301994 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -12,7 +12,6 @@ struct page_counter { unsigned long low; unsigned long high; unsigned long max; - struct page_counter *parent; /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -27,6 +26,14 @@ struct page_counter { /* legacy */ unsigned long watermark; unsigned long failcnt; + + /* + * 'parent' is placed here to be far from 'usage' to reduce + * cache false sharing, as 'usage' is written mostly while + * parent is frequently read for cgroup's hierarchical + * counting nature. + */ + struct page_counter *parent; }; #if BITS_PER_LONG == 32 -- cgit v1.2.3 From 6eeb104e114cb6b7391c2d69ff873403858c1f35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Feb 2021 12:04:15 -0800 Subject: fs: buffer: use raw page_memcg() on locked page alloc_page_buffers() currently uses get_mem_cgroup_from_page() for charging the buffers to the page owner, which does an rcu-protected page->memcg lookup and acquires a reference. But buffer allocation has the page lock held throughout, which pins the page to the memcg and thereby the memcg - neither rcu nor holding an extra reference during the allocation are necessary. Use a raw page_memcg() instead. This was the last user of get_mem_cgroup_from_page(), delete it. 
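The reasoning above, that a lock which already pins the object for the whole access makes the get/put reference round-trip unnecessary, can be sketched roughly in userspace C. A pthread mutex stands in for the page lock and every name below is hypothetical:

#include <pthread.h>
#include <stdio.h>

struct group {
        int refcount;                   /* protected by lock */
        const char *name;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group root_group = { .refcount = 1, .name = "root" };
static struct group *owner = &root_group;

/* Access that outlives the lock: take a reference, drop it later. */
static struct group *get_owner(void)
{
        struct group *g;

        pthread_mutex_lock(&lock);
        g = owner;
        g->refcount++;
        pthread_mutex_unlock(&lock);
        return g;
}

static void put_owner(struct group *g)
{
        pthread_mutex_lock(&lock);
        g->refcount--;
        pthread_mutex_unlock(&lock);
}

/* Access entirely under the lock: borrowing the raw pointer is enough. */
static void print_owner_locked(void)
{
        pthread_mutex_lock(&lock);
        printf("owner: %s\n", owner->name);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        struct group *g = get_owner();

        printf("referenced owner: %s\n", g->name);
        put_owner(g);
        print_owner_locked();
        return 0;
}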
Link: https://lkml.kernel.org/r/20210209190126.97842-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Muchun Song Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- include/linux/memcontrol.h | 7 ------- mm/memcontrol.c | 23 ----------------------- 3 files changed, 2 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index f1c3a5b27a90..0cb7ffd4977c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - memcg = get_mem_cgroup_from_page(page); + /* The page lock pins the memcg */ + memcg = page_memcg(page); old_memcg = set_active_memcg(memcg); head = NULL; @@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, } out: set_active_memcg(old_memcg); - mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7a38a1517a05..e6dc793d587d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -680,8 +680,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); -struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); - struct lruvec *lock_page_lruvec(struct page *page); struct lruvec *lock_page_lruvec_irq(struct page *page); struct lruvec *lock_page_lruvec_irqsave(struct page *page, @@ -1191,11 +1189,6 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } -static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) -{ - return NULL; -} - static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8c035846c7a4..7fdc001ce15f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1047,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); -/** - * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. - * @page: page from which memcg should be extracted. - * - * Obtain a reference on page->memcg and returns it if successful. Otherwise - * root_mem_cgroup is returned. - */ -struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (mem_cgroup_disabled()) - return NULL; - - rcu_read_lock(); - /* Page should not get uncharged and freed memcg under us. */ - if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - return memcg; -} -EXPORT_SYMBOL(get_mem_cgroup_from_page); - static __always_inline struct mem_cgroup *active_memcg(void) { if (in_interrupt()) -- cgit v1.2.3 From 027b37b552f326aa94ef06c7ea77088b16c41e6e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:46 -0800 Subject: kasan: move _RET_IP_ to inline wrappers Generic mm functions that call KASAN annotations that might report a bug pass _RET_IP_ to them as an argument. This allows KASAN to include the name of the function that called the mm function in its report's header. Now that KASAN has inline wrappers for all of its annotations, move _RET_IP_ to those wrappers to simplify annotation call sites. 
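The effect of capturing the call site in an always_inline wrapper can be sketched in userspace with the GCC/Clang builtin that backs the kernel's _RET_IP_; the names below are made up for illustration:

#include <stdio.h>

static void report_free(void *ptr, unsigned long ip)
{
        printf("object %p freed, called from %#lx\n", ptr, ip);
}

/* Because the wrapper is always inlined, __builtin_return_address(0)
 * evaluates in the caller's frame and records who called my_free(),
 * not the wrapper itself. */
static inline __attribute__((always_inline)) void my_free(void *ptr)
{
        report_free(ptr, (unsigned long)__builtin_return_address(0));
}

int main(void)
{
        int object;

        my_free(&object);
        return 0;
}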
Link: https://linux-review.googlesource.com/id/I8fb3c06d49671305ee184175a39591bc26647a67 Link: https://lkml.kernel.org/r/5c1490eddf20b436b8c4eeea83fce47687d5e4a4.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 20 +++++++++----------- mm/mempool.c | 2 +- mm/slab.c | 2 +- mm/slub.c | 4 ++-- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0aea9e2a2a01..a7254186558a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -185,19 +185,18 @@ static __always_inline void * __must_check kasan_init_slab_obj( } bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); -static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip) +static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) { if (kasan_enabled()) - return __kasan_slab_free(s, object, ip); + return __kasan_slab_free(s, object, _RET_IP_); return false; } void __kasan_slab_free_mempool(void *ptr, unsigned long ip); -static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) +static __always_inline void kasan_slab_free_mempool(void *ptr) { if (kasan_enabled()) - __kasan_slab_free_mempool(ptr, ip); + __kasan_slab_free_mempool(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, @@ -241,10 +240,10 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, } void __kasan_kfree_large(void *ptr, unsigned long ip); -static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip) +static __always_inline void kasan_kfree_large(void *ptr) { if (kasan_enabled()) - __kasan_kfree_large(ptr, ip); + __kasan_kfree_large(ptr, _RET_IP_); } bool kasan_save_enable_multi_shot(void); @@ -277,12 +276,11 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object) { return false; } -static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {} +static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { @@ -302,7 +300,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} +static inline void kasan_kfree_large(void *ptr) {} #endif /* CONFIG_KASAN */ diff --git a/mm/mempool.c b/mm/mempool.c index 624ed51b060f..79959fac27d7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_slab_free_mempool(element, _RET_IP_); + kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/slab.c b/mm/slab.c index 9c9e0c255c4b..35c68d99d460 
100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3420,7 +3420,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, memset(objp, 0, cachep->object_size); /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp, _RET_IP_)) + if (kasan_slab_free(cachep, objp)) return; /* Use KCSAN to help debug racy use-after-free. */ diff --git a/mm/slub.c b/mm/slub.c index 046d7194acaf..b2833ce85c92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1528,7 +1528,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) static __always_inline void kfree_hook(void *x) { kmemleak_free(x); - kasan_kfree_large(x, _RET_IP_); + kasan_kfree_large(x); } static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) @@ -1558,7 +1558,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x, _RET_IP_); + return kasan_slab_free(s, x); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, -- cgit v1.2.3 From 611806b4bf8dd97a4f3d73f5cf3c2c7730c51eb2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:50 -0800 Subject: kasan: fix bug detection via ksize for HW_TAGS mode The currently existing kasan_check_read/write() annotations are intended to be used for kernel modules that have KASAN compiler instrumentation disabled. Thus, they are only relevant for the software KASAN modes that rely on compiler instrumentation. However there's another use case for these annotations: ksize() checks that the object passed to it is indeed accessible before unpoisoning the whole object. This is currently done via __kasan_check_read(), which is compiled away for the hardware tag-based mode that doesn't rely on compiler instrumentation. This leads to KASAN missing detecting some memory corruptions. Provide another annotation called kasan_check_byte() that is available for all KASAN modes. As the implementation rename and reuse kasan_check_invalid_free(). Use this new annotation in ksize(). To avoid having ksize() as the top frame in the reported stack trace pass _RET_IP_ to __kasan_check_byte(). Also add a new ksize_uaf() test that checks that a use-after-free is detected via ksize() itself, and via plain accesses that happen later. 
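A toy model of the accessibility predicate introduced for the generic mode, where a byte counts as accessible when its shadow value is non-negative and below the granule size. This only illustrates the boolean flip from 'invalid free' to 'accessible' and is not real shadow-memory handling:

#include <stdbool.h>
#include <stdio.h>

#define GRANULE_SIZE 8          /* stand-in for KASAN_GRANULE_SIZE */

/* Accessible rather than "invalid free": the negated form of the old check. */
static bool byte_accessible(signed char shadow)
{
        return shadow >= 0 && shadow < GRANULE_SIZE;
}

int main(void)
{
        signed char samples[] = { 0, 3, -1, -5 };
        unsigned int i;

        for (i = 0; i < sizeof(samples); i++)
                printf("shadow %4d -> %s\n", samples[i],
                       byte_accessible(samples[i]) ? "accessible" : "poisoned");
        return 0;
}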
Link: https://linux-review.googlesource.com/id/Iaabf771881d0f9ce1b969f2a62938e99d3308ec5 Link: https://lkml.kernel.org/r/f32ad74a60b28d8402482a38476f02bb7600f620.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 6 ++++++ include/linux/kasan.h | 17 +++++++++++++++++ lib/test_kasan.c | 20 ++++++++++++++++++++ mm/kasan/common.c | 11 ++++++++++- mm/kasan/generic.c | 4 ++-- mm/kasan/kasan.h | 10 +++++----- mm/kasan/sw_tags.c | 6 +++--- mm/slab_common.c | 16 +++++++++------- 8 files changed, 72 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index ca5e89fb10d3..3d6d22a25bdc 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -4,6 +4,12 @@ #include +/* + * The annotations present in this file are only relevant for the software + * KASAN modes that rely on compiler instrumentation, and will be optimized + * away for the hardware tag-based KASAN mode. Use kasan_check_byte() instead. + */ + /* * __kasan_check_*: Always available when KASAN is enabled. This may be used * even in compilation units that selectively disable KASAN, but must use KASAN diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a7254186558a..7eaf2d9effb4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -246,6 +246,19 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } +/* + * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for + * the hardware tag-based mode that doesn't rely on compiler instrumentation. + */ +bool __kasan_check_byte(const void *addr, unsigned long ip); +static __always_inline bool kasan_check_byte(const void *addr) +{ + if (kasan_enabled()) + return __kasan_check_byte(addr, _RET_IP_); + return true; +} + + bool kasan_save_enable_multi_shot(void); void kasan_restore_multi_shot(bool enabled); @@ -301,6 +314,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, return (void *)object; } static inline void kasan_kfree_large(void *ptr) {} +static inline bool kasan_check_byte(const void *address) +{ + return true; +} #endif /* CONFIG_KASAN */ diff --git a/lib/test_kasan.c b/lib/test_kasan.c index e59f185b8075..3f771fabd0ec 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -496,6 +496,7 @@ static void kasan_global_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } +/* Check that ksize() makes the whole object accessible. */ static void ksize_unpoisons_memory(struct kunit *test) { char *ptr; @@ -514,6 +515,24 @@ static void ksize_unpoisons_memory(struct kunit *test) kfree(ptr); } +/* + * Check that a use-after-free is detected by ksize() and via normal accesses + * after it. 
+ */ +static void ksize_uaf(struct kunit *test) +{ + char *ptr; + int size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size)); +} + static void kasan_stack_oob(struct kunit *test) { char stack_array[10]; @@ -907,6 +926,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_alloca_oob_left), KUNIT_CASE(kasan_alloca_oob_right), KUNIT_CASE(ksize_unpoisons_memory), + KUNIT_CASE(ksize_uaf), KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), KUNIT_CASE(kasan_memchr), diff --git a/mm/kasan/common.c b/mm/kasan/common.c index eedc3e0fe365..b18189ef3a92 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -345,7 +345,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; - if (kasan_check_invalid_free(tagged_object)) { + if (!kasan_byte_accessible(tagged_object)) { kasan_report_invalid_free(tagged_object, ip); return true; } @@ -490,3 +490,12 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); /* The object will be poisoned by kasan_free_pages(). */ } + +bool __kasan_check_byte(const void *address, unsigned long ip) +{ + if (!kasan_byte_accessible(address)) { + kasan_report((unsigned long)address, 1, false, ip); + return false; + } + return true; +} diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index acab8862dc67..3f17a1218055 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -185,11 +185,11 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return check_region_inline(addr, size, write, ret_ip); } -bool kasan_check_invalid_free(void *addr) +bool kasan_byte_accessible(const void *addr) { s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); - return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE; + return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; } void kasan_cache_shrink(struct kmem_cache *cache) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1298b79f9518..cc14b6e6c14c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -341,20 +341,20 @@ static inline void kasan_unpoison(const void *address, size_t size) round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); } -static inline bool kasan_check_invalid_free(void *addr) +static inline bool kasan_byte_accessible(const void *addr) { u8 ptr_tag = get_tag(addr); - u8 mem_tag = hw_get_mem_tag(addr); + u8 mem_tag = hw_get_mem_tag((void *)addr); - return (mem_tag == KASAN_TAG_INVALID) || - (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag); + return (mem_tag != KASAN_TAG_INVALID) && + (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag); } #else /* CONFIG_KASAN_HW_TAGS */ void kasan_poison(const void *address, size_t size, u8 value); void kasan_unpoison(const void *address, size_t size); -bool kasan_check_invalid_free(void *addr); +bool kasan_byte_accessible(const void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index cc271fceb5d5..94c2d33be333 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -118,13 +118,13 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return true; } -bool kasan_check_invalid_free(void *addr) +bool kasan_byte_accessible(const void *addr) { u8 tag = get_tag(addr); u8 
shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); - return (shadow_byte == KASAN_TAG_INVALID) || - (tag != KASAN_TAG_KERNEL && tag != shadow_byte); + return (shadow_byte != KASAN_TAG_INVALID) && + (tag == KASAN_TAG_KERNEL || tag == shadow_byte); } #define DEFINE_HWASAN_LOAD_STORE(size) \ diff --git a/mm/slab_common.c b/mm/slab_common.c index 5be7825ad3ce..7c8298c17145 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1218,19 +1218,21 @@ size_t ksize(const void *objp) size_t size; /* - * We need to check that the pointed to object is valid, and only then - * unpoison the shadow memory below. We use __kasan_check_read(), to - * generate a more useful report at the time ksize() is called (rather - * than later where behaviour is undefined due to potential - * use-after-free or double-free). + * We need to first check that the pointer to the object is valid, and + * only then unpoison the memory. The report printed from ksize() is + * more useful, then when it's printed later when the behaviour could + * be undefined due to a potential use-after-free or double-free. * - * If the pointed to memory is invalid we return 0, to avoid users of + * We use kasan_check_byte(), which is supported for the hardware + * tag-based KASAN mode, unlike kasan_check_read/write(). + * + * If the pointed to memory is invalid, we return 0 to avoid users of * ksize() writing to and potentially corrupting the memory region. * * We want to perform the check before __ksize(), to avoid potentially * crashing in __ksize() due to accessing invalid metadata. */ - if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1)) + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; size = __ksize(objp); -- cgit v1.2.3 From 93f503c3fcd168a43e4a6c875fe2cfafaf8439dc Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:10 -0800 Subject: mm: fix prototype warning from kernel test robot Patch series "mm: clean up names and parameters of memmap_init_xxxx functions", v5. This patchset corrects inappropriate function names of memmap_init_xxx, and simplify parameters of functions in the code flow. And also fix a prototype warning reported by lkp. This patch (of 5); Kernel test robot calling make with 'W=1' is triggering warning like below for memmap_init_zone() function. mm/page_alloc.c:6259:23: warning: no previous prototype for 'memmap_init_zone' [-Wmissing-prototypes] 6259 | void __meminit __weak memmap_init_zone(unsigned long size, int nid, | ^~~~~~~~~~~~~~~~ Fix it by adding the function declaration in include/linux/mm.h. Since memmap_init_zone() has a generic version with '__weak', the declaratoin in ia64 header file can be simply removed. 
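Both halves of the fix, a declaration that is visible before the definition and so silences -Wmissing-prototypes, plus a __weak generic definition that an architecture may override, can be sketched in one hypothetical userspace file (the declaration would normally live in a shared header):

#include <stdio.h>

/* Having this prototype in scope before the definition is what keeps
 * -Wmissing-prototypes quiet; in real code it would sit in a header. */
void memmap_init_demo(unsigned long size, int nid);

/* Weak generic version; a file defining the same symbol without the
 * attribute would take precedence at link time. */
__attribute__((weak)) void memmap_init_demo(unsigned long size, int nid)
{
        printf("generic memmap init: size=%lu nid=%d\n", size, nid);
}

int main(void)
{
        memmap_init_demo(256, 0);
        return 0;
}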
Link: https://lkml.kernel.org/r/20210122135956.5946-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20210122135956.5946-2-bhe@redhat.com Signed-off-by: Baoquan He Reported-by: kernel test robot Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgtable.h | 6 ------ include/linux/mm.h | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 779b6972aa84..9b4efe89e62d 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -517,12 +517,6 @@ extern struct page *zero_page_memmap_ptr; __changed; \ }) #endif - -# ifdef CONFIG_VIRTUAL_MEM_MAP - /* arch mem_map init routine is needed due to holes in a virtual mem_map */ - extern void memmap_init (unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn); -# endif /* CONFIG_VIRTUAL_MEM_MAP */ # endif /* !__ASSEMBLY__ */ /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 7deb7168104d..1c1eabdf112c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2408,6 +2408,8 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); +extern void memmap_init(unsigned long size, int nid, + unsigned long zone, unsigned long range_start_pfn); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); -- cgit v1.2.3 From ab28cb6e1e5e59eb8bf3ad399133617414301d3a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:14 -0800 Subject: mm: rename memmap_init() and memmap_init_zone() The current memmap_init_zone() only handles memory region inside one zone, actually memmap_init() does the memmap init of one zone. So rename both of them accordingly. 
Link: https://lkml.kernel.org/r/20210122135956.5946-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 6 +++--- include/linux/mm.h | 4 ++-- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b19f47a5a305..39b782f62d7d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -536,18 +536,18 @@ virtual_memmap_init(u64 start, u64 end, void *arg) / sizeof(struct page)); if (map_start < map_end) - memmap_init_zone((unsigned long)(map_end - map_start), + memmap_init_range((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end), MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } void __meminit -memmap_init (unsigned long size, int nid, unsigned long zone, +memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size, + memmap_init_range(size, nid, zone, start_pfn, start_pfn + size, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; diff --git a/include/linux/mm.h b/include/linux/mm.h index 1c1eabdf112c..56d0eeae0ce3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2405,10 +2405,10 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, +extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); -extern void memmap_init(unsigned long size, int nid, +extern void memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long range_start_pfn); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9d57b9be8c7..ddcb1cd24c60 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, + memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 069561aadc7b..519cf52963eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6121,7 +6121,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. 
*/ -void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, struct vmem_altmap *altmap, int migratetype) @@ -6258,7 +6258,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -void __meminit __weak memmap_init(unsigned long size, int nid, +void __meminit __weak memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long range_start_pfn) { @@ -6272,7 +6272,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; - memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn, + memmap_init_range(size, nid, zone, start_pfn, range_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } @@ -6982,7 +6982,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); init_currently_empty_zone(zone, zone_start_pfn, size); - memmap_init(size, nid, j, zone_start_pfn); + memmap_init_zone(size, nid, j, zone_start_pfn); } } -- cgit v1.2.3 From 3256ff83c566235e812498ee1dc806c45a5d5af7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:17 -0800 Subject: mm: simplify parater of function memmap_init_zone() As David suggested, simply passing 'struct zone *zone' is enough. We can get all needed information from 'struct zone*' easily. Link: https://lkml.kernel.org/r/20210122135956.5946-4-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: David Hildenbrand Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 12 +++++++----- include/linux/mm.h | 3 +-- mm/page_alloc.c | 24 +++++++++++------------- 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39b782f62d7d..16d0d7d22657 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -542,12 +542,14 @@ virtual_memmap_init(u64 start, u64 end, void *arg) return 0; } -void __meminit -memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn) +void __meminit memmap_init_zone(struct zone *zone) { + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long size = zone->spanned_pages; + if (!vmem_map) { - memmap_init_range(size, nid, zone, start_pfn, start_pfn + size, + memmap_init_range(size, nid, zone_id, start_pfn, start_pfn + size, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; @@ -557,7 +559,7 @@ memmap_init_zone(unsigned long size, int nid, unsigned long zone, args.start = start; args.end = start + size; args.nid = nid; - args.zone = zone; + args.zone = zone_id; efi_memmap_walk(virtual_memmap_init, &args); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 56d0eeae0ce3..2601a5cce0ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2408,8 +2408,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); -extern void memmap_init_zone(unsigned long size, int nid, - unsigned long zone, unsigned long range_start_pfn); +extern void memmap_init_zone(struct zone *zone); extern void setup_per_zone_wmarks(void); 
extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 519cf52963eb..aa04c54ad47c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6258,23 +6258,21 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -void __meminit __weak memmap_init_zone(unsigned long size, int nid, - unsigned long zone, - unsigned long range_start_pfn) +void __meminit __weak memmap_init_zone(struct zone *zone) { + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; + int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); unsigned long start_pfn, end_pfn; - unsigned long range_end_pfn = range_start_pfn + size; - int i; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); - end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); - if (end_pfn > start_pfn) { - size = end_pfn - start_pfn; - memmap_init_range(size, nid, zone, start_pfn, range_end_pfn, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); - } + if (end_pfn > start_pfn) + memmap_init_range(end_pfn - start_pfn, nid, + zone_id, start_pfn, zone_end_pfn, + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } @@ -6982,7 +6980,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); init_currently_empty_zone(zone, zone_start_pfn, size); - memmap_init_zone(size, nid, j, zone_start_pfn); + memmap_init_zone(zone); } } -- cgit v1.2.3 From a0cd7a7c4bc004587d1f4785a320f58e72d880eb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 24 Feb 2021 12:06:32 -0800 Subject: mm: simplify free_highmem_page() and free_reserved_page() adjust_managed_page_count() as called by free_reserved_page() properly handles pages in a highmem zone, so we can reuse it for free_highmem_page(). We can now get rid of totalhigh_pages_inc() and simplify free_reserved_page(). Link: https://lkml.kernel.org/r/20210126182113.19892-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Cc: "Gustavo A. R. 
Silva" Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem-internal.h | 5 ----- include/linux/mm.h | 16 ++-------------- mm/page_alloc.c | 11 ----------- 3 files changed, 2 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 1bbe96dc8be6..7902c7d8b55f 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -127,11 +127,6 @@ static inline unsigned long totalhigh_pages(void) return (unsigned long)atomic_long_read(&_totalhigh_pages); } -static inline void totalhigh_pages_inc(void) -{ - atomic_long_inc(&_totalhigh_pages); -} - static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2601a5cce0ef..76cab132c295 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2310,32 +2310,20 @@ extern void free_initmem(void); extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); -#ifdef CONFIG_HIGHMEM -/* - * Free a highmem page into the buddy system, adjusting totalhigh_pages - * and totalram_pages. - */ -extern void free_highmem_page(struct page *page); -#endif - extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ -static inline void __free_reserved_page(struct page *page) +static inline void free_reserved_page(struct page *page) { ClearPageReserved(page); init_page_count(page); __free_page(page); -} - -static inline void free_reserved_page(struct page *page) -{ - __free_reserved_page(page); adjust_managed_page_count(page, 1); } +#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 35e230cadb94..ddccc59f2f72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7691,17 +7691,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -#ifdef CONFIG_HIGHMEM -void free_highmem_page(struct page *page) -{ - __free_reserved_page(page); - totalram_pages_inc(); - atomic_long_inc(&page_zone(page)->managed_pages); - totalhigh_pages_inc(); -} -#endif - - void __init mem_init_print_info(const char *str) { unsigned long physpages, codesize, datasize, rosize, bss_size; -- cgit v1.2.3 From 3b2ebeaf98a028d5dd4ec63095855ef507920276 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:06:35 -0800 Subject: mm/gfp: add kernel-doc for gfp_t The generated html will link to the definition of the gfp_t automatically once we define it. Move the one-paragraph overview of GFP flags from the documentation directory into gfp.h and pull gfp.h into the documentation. This generates warnings with clang (https://lkml.kernel.org/r/20210219195509.GA59987@24bbad8f3778), so use a #if 0 to hide it from the compiler for now. 
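The '#if 0' trick can be shown in miniature: a documentation extractor that scans the source text picks up the comment and the duplicate typedef, while the preprocessor drops them before the compiler could complain about a redefinition. The names below are toy stand-ins, not the real gfp.h:

#include <stdio.h>

typedef unsigned int demo_flags_t;      /* the real typedef lives elsewhere */

#if 0
/**
 * typedef demo_flags_t - Allocation flags (documentation-only copy).
 *
 * This block exists purely so a kernel-doc-style extractor has
 * something to attach the description to; the compiler never sees it.
 */
typedef unsigned int demo_flags_t;
#endif

int main(void)
{
        demo_flags_t flags = 0;

        printf("flags=%u\n", flags);
        return 0;
}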
Link: https://lkml.kernel.org/r/20210215204909.3824509-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210220003049.GZ2858050@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Cc: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 7 ++----- include/linux/gfp.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 2adffb3f7914..201b5423303b 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -19,11 +19,8 @@ User Space Memory Access Memory Allocation Controls ========================== -Functions which need to allocate memory often use GFP flags to express -how that memory should be allocated. The GFP acronym stands for "get -free pages", the underlying memory allocation function. Not every GFP -flag is allowed to every function which may allocate memory. Most -users will want to use a plain ``GFP_KERNEL``. +.. kernel-doc:: include/linux/gfp.h + :internal: .. kernel-doc:: include/linux/gfp.h :doc: Page mobility and placement hints diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 80544d5c08e7..220cd553a9e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,20 @@ #include #include +/* The typedef is in types.h but we want the documentation here */ +#if 0 +/** + * typedef gfp_t - Memory allocation flags. + * + * GFP flags are commonly used throughout Linux to indicate how memory + * should be allocated. The GFP acronym stands for get_free_pages(), + * the underlying memory allocation function. Not every GFP flag is + * supported by every function which may allocate memory. Most users + * will want to use a plain ``GFP_KERNEL``. + */ +typedef unsigned int __bitwise gfp_t; +#endif + struct vm_area_struct; /* -- cgit v1.2.3 From 0fa5bc4023c188082024833b3deffd5543b93bc9 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 24 Feb 2021 12:07:12 -0800 Subject: mm/hugetlb: grab head page refcount once for group of subpages Patch series "mm/hugetlb: follow_hugetlb_page() improvements", v2. While looking at ZONE_DEVICE struct page reuse particularly the last patch[0], I found two possible improvements for follow_hugetlb_page() which is solely used for get_user_pages()/pin_user_pages(). The first patch batches page refcount updates while the second tidies up storing the subpages/vmas. Both together bring the cost of slow variant of gup() cost from ~87.6k usecs to ~5.8k usecs. libhugetlbfs tests seem to pass as well gup_test benchmarks with hugetlbfs vmas. This patch (of 2): follow_hugetlb_page() once it locks the pmd/pud, checks all its N subpages in a huge page and grabs a reference for each one. Similar to gup-fast, have follow_hugetlb_page() grab the head page refcount only after counting all its subpages that are part of the just faulted huge page. 
Consequently we reduce the number of atomics necessary to pin said huge page, which improves non-fast gup() considerably: - 16G with 1G huge page size gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w PIN_LONGTERM_BENCHMARK: ~87.6k us -> ~12.8k us Link: https://lkml.kernel.org/r/20210128182632.24562-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20210128182632.24562-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ mm/gup.c | 5 ++--- mm/hugetlb.c | 43 ++++++++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 76cab132c295..77e64e3eac80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1187,6 +1187,9 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); +__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); + static inline __must_check bool try_get_page(struct page *page) { diff --git a/mm/gup.c b/mm/gup.c index e4c224cd9661..e40579624f10 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -static __maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, - unsigned int flags) +__maybe_unused struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6920c71d7e0d..d4fc5db6bc32 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4796,7 +4796,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - int err = -EFAULT; + int err = -EFAULT, refs; while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -4916,26 +4916,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, continue; } + refs = 0; + same_page: - if (pages) { + if (pages) pages[i] = mem_map_offset(page, pfn_offset); - /* - * try_grab_page() should always succeed here, because: - * a) we hold the ptl lock, and b) we've just checked - * that the huge page is present in the page tables. If - * the huge page is present, then the tail pages must - * also be present. The ptl prevents the head page and - * tail pages from being rearranged in any way. So this - * page must be available at this point, unless the page - * refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { - spin_unlock(ptl); - remainder = 0; - err = -ENOMEM; - break; - } - } if (vmas) vmas[i] = vma; @@ -4944,6 +4929,7 @@ same_page: ++pfn_offset; --remainder; ++i; + ++refs; if (vaddr < vma->vm_end && remainder && pfn_offset < pages_per_huge_page(h)) { /* @@ -4951,6 +4937,25 @@ same_page: * of this compound page. */ goto same_page; + } else if (pages) { + /* + * try_grab_compound_head() should always succeed here, + * because: a) we hold the ptl lock, and b) we've just + * checked that the huge page is present in the page + * tables. If the huge page is present, then the tail + * pages must also be present. 
The ptl prevents the + * head page and tail pages from being rearranged in + * any way. So this page must be available at this + * point, unless the page refcount overflowed: + */ + if (WARN_ON_ONCE(!try_grab_compound_head(pages[i-1], + refs, + flags))) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } } spin_unlock(ptl); } -- cgit v1.2.3 From 6c26d3108393211ecfd44d89404cfb744027bafd Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:19 -0800 Subject: mm/hugetlb: fix some comment typos Fix typos sasitfy to satisfy, reservtion to reservation, hugegpage to hugepage and uniprocesor to uniprocessor in comments. Link: https://lkml.kernel.org/r/20210128112028.64831-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Souptick Joarder Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b5807f23caf8..ac3cb239a7ec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -37,7 +37,7 @@ struct hugepage_subpool { struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ - /* sasitfy minimum size. */ + /* satisfy minimum size. */ }; struct resv_map { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf6653879bb4..fc895e25920c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1434,7 +1434,7 @@ static void __free_huge_page(struct page *page) * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the - * reservtion, do not call hugepage_subpool_put_pages() as this will + * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { @@ -3707,7 +3707,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the - * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get + * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) @@ -4491,7 +4491,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) } #else /* - * For uniprocesor systems we always use a single mutex, so just + * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) -- cgit v1.2.3 From bae84953815793f68ddd8edeadd3f4e32676a2c8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Feb 2021 12:07:32 -0800 Subject: mm/pmem: avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled Differentiate between hardware not supporting hugepages and user disabling THP via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' For the devdax namespace, the kernel handles the above via the supported_alignment attribute and failing to initialize the namespace if the namespace align value is not supported on the platform. For the fsdax namespace, the kernel will continue to initialize the namespace. 
This can result in the kernel creating a huge pte entry even though the hardware don't support the same. We do want hugepage support with pmem even if the end-user disabled THP via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled). Hence differentiate between hardware/firmware lacking support vs user-controlled disable of THP and prevent a huge fault if the hardware lacks hugepage support. Link: https://lkml.kernel.org/r/20210205023956.417587-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Dan Williams Cc: "Kirill A . Shutemov" Cc: Jan Kara Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 15 +++++++++------ mm/huge_memory.c | 6 +++++- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6a19f35f836b..ba973efcd369 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -78,6 +78,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, } enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, @@ -123,6 +124,13 @@ extern unsigned long transparent_hugepage_flags; */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { + + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) + return false; + if (vma->vm_flags & VM_NOHUGEPAGE) return false; @@ -134,12 +142,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; - /* - * For dax vmas, try to always use hugepage mappings. If the kernel does - * not support hugepages, fsdax mappings will fallback to PAGE_SIZE - * mappings, and device-dax namespaces, that try to guarantee a given - * mapping size, will fail to enable - */ + if (vma_is_dax(vma)) return true; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4cdcc4da36e8..d77605c30f2e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -386,7 +386,11 @@ static int __init hugepage_init(void) struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; + /* + * Hardware doesn't support hugepages, hence disable + * DAX PMD support. + */ + transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX; return -EINVAL; } -- cgit v1.2.3 From c2135f7c570bc274035834848d9bf46ea89ba763 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 24 Feb 2021 12:08:01 -0800 Subject: mm/vmscan: __isolate_lru_page_prepare() cleanup The function just returns 2 results, so using a 'switch' to deal with its result is unnecessary. Also simplify it to a bool func as Vlastimil suggested. Also remove 'goto' by reusing list_move(), and take Matthew Wilcox's suggestion to update comments in function. 
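A small sketch of the cleanup style used here: a check that can only succeed or fail reads more naturally as a bool than as 0/-EBUSY dispatched through a switch (illustrative types, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct demo_page {
        bool on_lru;
        bool writeback;
};

/* Before: returned 0 or -EBUSY; after: a plain bool. */
static bool can_isolate(const struct demo_page *page)
{
        if (!page->on_lru)
                return false;
        if (page->writeback)
                return false;
        return true;
}

int main(void)
{
        struct demo_page a = { .on_lru = true, .writeback = false };
        struct demo_page b = { .on_lru = true, .writeback = true };

        printf("a: %s\n", can_isolate(&a) ? "isolate" : "skip");
        printf("b: %s\n", can_isolate(&b) ? "isolate" : "skip");
        return 0;
}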
Link: https://lkml.kernel.org/r/728874d7-2d93-4049-68c1-dcc3b2d52ccd@linux.alibaba.com Signed-off-by: Alex Shi Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Yu Zhao Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/compaction.c | 2 +- mm/vmscan.c | 68 ++++++++++++++++++++++++---------------------------- 3 files changed, 33 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0d64a2bc7e00..32f665b1ee85 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -356,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); -extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode); +extern bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, diff --git a/mm/compaction.c b/mm/compaction.c index 190ccdaa6c19..8f52532675a9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -988,7 +988,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(!get_page_unless_zero(page))) goto isolate_fail; - if (__isolate_lru_page_prepare(page, isolate_mode) != 0) + if (!__isolate_lru_page_prepare(page, isolate_mode)) goto isolate_fail_put; /* Try isolate the page */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b1b574ad199d..04509994aed4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1539,19 +1539,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * page: page to consider * mode: one of the LRU isolation modes defined above * - * returns 0 on success, -ve errno on failure. + * returns true on success, false on failure. */ -int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) +bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) { - int ret = -EBUSY; - /* Only take pages on the LRU. */ if (!PageLRU(page)) - return ret; + return false; /* Compaction should not handle unevictable pages but CMA can do so */ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) - return ret; + return false; /* * To minimise LRU disruption, the caller can indicate that it only @@ -1564,7 +1562,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) - return ret; + return false; if (PageDirty(page)) { struct address_space *mapping; @@ -1580,20 +1578,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) * from the page cache. */ if (!trylock_page(page)) - return ret; + return false; mapping = page_mapping(page); migrate_dirty = !mapping || mapping->a_ops->migratepage; unlock_page(page); if (!migrate_dirty) - return ret; + return false; } } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) - return ret; + return false; - return 0; + return true; } /* @@ -1677,35 +1675,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * only when the page is being freed somewhere else. 
*/ scan += nr_pages; - switch (__isolate_lru_page_prepare(page, mode)) { - case 0: - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - if (unlikely(!get_page_unless_zero(page))) - goto busy; - - if (!TestClearPageLRU(page)) { - /* - * This page may in other isolation path, - * but we still hold lru_lock. - */ - put_page(page); - goto busy; - } - - nr_taken += nr_pages; - nr_zone_taken[page_zonenum(page)] += nr_pages; - list_move(&page->lru, dst); - break; + if (!__isolate_lru_page_prepare(page, mode)) { + /* It is being freed elsewhere */ + list_move(&page->lru, src); + continue; + } + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) { + list_move(&page->lru, src); + continue; + } - default: -busy: - /* else it is being freed elsewhere */ + if (!TestClearPageLRU(page)) { + /* Another thread is already isolating this page */ + put_page(page); list_move(&page->lru, src); + continue; } + + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; + list_move(&page->lru, dst); } /* -- cgit v1.2.3 From f90d8191ac864df33b1898bc7edc54eaa24e22bc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:13 -0800 Subject: include/linux/mm_inline.h: shuffle lru list addition and deletion functions These functions will call page_lru() in the following patches. Move them below page_lru() to avoid the forward declaration. Link: https://lore.kernel.org/linux-mm/20201207220949.830352-3-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-3-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8fc71e9d7bb0..2889741f450a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -45,27 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, #endif } -static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) -{ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); -} - -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec, enum lru_list lru) -{ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); -} - -static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) -{ - list_del(&page->lru); - update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); -} - /** * page_lru_base_type - which LRU list type should a page be on? 
* @page: the page to test @@ -125,4 +104,25 @@ static __always_inline enum lru_list page_lru(struct page *page) } return lru; } + +static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); +} + +static __always_inline void add_page_to_lru_list_tail(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); +} + +static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + list_del(&page->lru); + update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); +} #endif -- cgit v1.2.3 From 3a9c9788a3149d9745b7eb2eae811e57ef3b127c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:17 -0800 Subject: mm: don't pass "enum lru_list" to lru list addition functions The "enum lru_list" parameter to add_page_to_lru_list() and add_page_to_lru_list_tail() is redundant in the sense that it can be extracted from the "struct page" parameter by page_lru(). A caveat is that we need to make sure PageActive() or PageUnevictable() is correctly set or cleared before calling these two functions. And they are indeed. Link: https://lore.kernel.org/linux-mm/20201207220949.830352-4-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-4-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 8 ++++++-- mm/swap.c | 15 +++++++-------- mm/vmscan.c | 6 ++---- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2889741f450a..130ba3201d3f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -106,15 +106,19 @@ static __always_inline enum lru_list page_lru(struct page *page) } static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) + struct lruvec *lruvec) { + enum lru_list lru = page_lru(page); + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec, enum lru_list lru) + struct lruvec *lruvec) { + enum lru_list lru = page_lru(page); + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); } diff --git a/mm/swap.c b/mm/swap.c index 2cca7141470c..22fad34b9c18 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -231,7 +231,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) if (!PageUnevictable(page)) { del_page_from_lru_list(page, lruvec, page_lru(page)); ClearPageActive(page); - add_page_to_lru_list_tail(page, lruvec, page_lru(page)); + add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, thp_nr_pages(page)); } } @@ -313,8 +313,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec) del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); 
trace_mm_lru_activate(page); __count_vm_events(PGACTIVATE, nr_pages); @@ -543,14 +542,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); SetPageReclaim(page); } else { /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - add_page_to_lru_list_tail(page, lruvec, lru); + add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, nr_pages); } @@ -570,7 +569,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, @@ -595,7 +594,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) * anonymous pages */ ClearPageSwapBacked(page); - add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); + add_page_to_lru_list(page, lruvec); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, @@ -1005,7 +1004,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); trace_mm_lru_insertion(page, lru); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 19875660e8f8..09e4f97488c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1867,7 +1867,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * inhibits memcg migration). */ VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); - add_page_to_lru_list(page, lruvec, page_lru(page)); + add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); nr_moved += nr_pages; if (PageActive(page)) @@ -4282,12 +4282,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - VM_BUG_ON_PAGE(PageActive(page), page); ClearPageUnevictable(page); del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); pgrescued += nr_pages; } SetPageLRU(page); -- cgit v1.2.3 From 46ae6b2cc2a47904a368d238425531ea91f3a2a5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:25 -0800 Subject: mm/swap.c: don't pass "enum lru_list" to del_page_from_lru_list() The parameter is redundant in the sense that it can be potentially extracted from the "struct page" parameter by page_lru(). We need to make sure that existing PageActive() or PageUnevictable() remains until the function returns. A few places don't conform, and simple reordering fixes them. This patch may have left page_off_lru() seemingly odd, and we'll take care of it in the next patch. 
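A condensed sketch of the idea behind dropping the parameter (simplified types and invented names, not the kernel API): the deletion helper derives the list index from the item's own flags, which is only safe if those flags are still intact when the helper runs; hence the simple reordering mentioned above.

#include <stdbool.h>

struct item {
        bool active;
        bool unevictable;
};

enum list_id { INACTIVE, ACTIVE, UNEVICTABLE, NR_LISTS };

static long nr[NR_LISTS];               /* stands in for the per-list size counters */

static enum list_id item_list(const struct item *it)
{
        if (it->unevictable)
                return UNEVICTABLE;
        return it->active ? ACTIVE : INACTIVE;
}

/* The list index is no longer a parameter: it is recomputed from the item,
 * so callers must clear 'active'/'unevictable' only after deleting. */
static void del_item(struct item *it)
{
        nr[item_list(it)]--;
}

int main(void)
{
        struct item it = { .active = true };

        nr[item_list(&it)]++;           /* add to the ACTIVE list */
        del_item(&it);                  /* delete first ... */
        it.active = false;              /* ... then clear the flag */
        return 0;
}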
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-6-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-6-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 5 +++-- mm/compaction.c | 2 +- mm/mlock.c | 3 +-- mm/swap.c | 26 ++++++++++---------------- mm/vmscan.c | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 130ba3201d3f..ffacc6273678 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -124,9 +124,10 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, } static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) + struct lruvec *lruvec) { list_del(&page->lru); - update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); } #endif diff --git a/mm/compaction.c b/mm/compaction.c index 8f52532675a9..3aef10c61d0e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1034,7 +1034,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, low_pfn += compound_nr(page) - 1; /* Successfully isolated */ - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), thp_nr_pages(page)); diff --git a/mm/mlock.c b/mm/mlock.c index 55b3b3672977..73960bb3464d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) */ if (TestClearPageLRU(page)) { lruvec = relock_page_lruvec_irq(page, lruvec); - del_page_from_lru_list(page, lruvec, - page_lru(page)); + del_page_from_lru_list(page, lruvec); continue; } else __munlock_isolation_failed(page); diff --git a/mm/swap.c b/mm/swap.c index a26e3a4fe17b..ff910f636df2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -85,7 +85,8 @@ static void __page_cache_release(struct page *page) lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_off_lru(page)); + del_page_from_lru_list(page, lruvec); + page_off_lru(page); unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); @@ -229,7 +230,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { if (!PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, thp_nr_pages(page)); @@ -308,10 +309,9 @@ void lru_note_cost_page(struct page *page) static void __activate_page(struct page *page, struct lruvec *lruvec) { if (!PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru); + del_page_from_lru_list(page, lruvec); SetPageActive(page); add_page_to_lru_list(page, lruvec); trace_mm_lru_activate(page); @@ -518,8 +518,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, */ static void 
lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) { - int lru; - bool active; + bool active = PageActive(page); int nr_pages = thp_nr_pages(page); if (PageUnevictable(page)) @@ -529,10 +528,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) if (page_mapped(page)) return; - active = PageActive(page); - lru = page_lru_base_type(page); - - del_page_from_lru_list(page, lruvec, lru + active); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); @@ -563,10 +559,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { if (PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(page, lruvec); @@ -581,11 +576,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) { if (PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - bool active = PageActive(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, - LRU_INACTIVE_ANON + active); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); /* @@ -919,7 +912,8 @@ void release_pages(struct page **pages, int nr) VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_off_lru(page)); + del_page_from_lru_list(page, lruvec); + page_off_lru(page); } __ClearPageWaiters(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 09e4f97488c9..7c65d47e6612 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1766,7 +1766,7 @@ int isolate_lru_page(struct page *page) get_page(page); lruvec = lock_page_lruvec_irq(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -4283,8 +4283,8 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { VM_BUG_ON_PAGE(PageActive(page), page); + del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); - del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec); pgrescued += nr_pages; } -- cgit v1.2.3 From 875601796267214f286d3581fe74f2805d060fe8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:28 -0800 Subject: mm: add __clear_page_lru_flags() to replace page_off_lru() Similar to page_off_lru(), the new function does non-atomic clearing of PageLRU() in addition to PageActive() and PageUnevictable(), on a page that has no references left. If PageActive() and PageUnevictable() are both set, refuse to clear either and leave them to bad_page(). This is a behavior change that is meant to help debug. 
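A minimal sketch of the intended behaviour (plain C with invented names, not the kernel implementation): on an object nothing references any more, the flags can be cleared non-atomically, and the impossible active-plus-unevictable combination is deliberately left in place so a later sanity check can report it.

#include <stdbool.h>

struct item {
        bool on_list;
        bool active;
        bool unevictable;
};

static void clear_list_flags(struct item *it)
{
        it->on_list = false;

        /* Both set should never happen; keep the flags for the error report. */
        if (it->active && it->unevictable)
                return;

        it->active = false;
        it->unevictable = false;
}

int main(void)
{
        struct item it = { .on_list = true, .active = true };

        clear_list_flags(&it);
        return !it.on_list && !it.active ? 0 : 1;
}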
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-7-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-7-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 28 ++++++++++------------------ mm/swap.c | 6 ++---- mm/vmscan.c | 3 +-- 3 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ffacc6273678..ef3fd79222e5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -61,27 +61,19 @@ static inline enum lru_list page_lru_base_type(struct page *page) } /** - * page_off_lru - which LRU list was page on? clearing its lru flags. - * @page: the page to test - * - * Returns the LRU list a page was on, as an index into the array of LRU - * lists; and clears its Unevictable or Active flags, ready for freeing. + * __clear_page_lru_flags - clear page lru flags before releasing a page + * @page: the page that was on lru and now has a zero reference */ -static __always_inline enum lru_list page_off_lru(struct page *page) +static __always_inline void __clear_page_lru_flags(struct page *page) { - enum lru_list lru; + __ClearPageLRU(page); - if (PageUnevictable(page)) { - __ClearPageUnevictable(page); - lru = LRU_UNEVICTABLE; - } else { - lru = page_lru_base_type(page); - if (PageActive(page)) { - __ClearPageActive(page); - lru += LRU_ACTIVE; - } - } - return lru; + /* this shouldn't happen, so leave the flags to bad_page() */ + if (PageActive(page) && PageUnevictable(page)) + return; + + __ClearPageActive(page); + __ClearPageUnevictable(page); } /** diff --git a/mm/swap.c b/mm/swap.c index ff910f636df2..1455fa56a0ee 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -84,9 +84,8 @@ static void __page_cache_release(struct page *page) lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); - __ClearPageLRU(page); del_page_from_lru_list(page, lruvec); - page_off_lru(page); + __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); @@ -911,9 +910,8 @@ void release_pages(struct page **pages, int nr) lock_batch = 0; VM_BUG_ON_PAGE(!PageLRU(page), page); - __ClearPageLRU(page); del_page_from_lru_list(page, lruvec); - page_off_lru(page); + __clear_page_lru_flags(page); } __ClearPageWaiters(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7c65d47e6612..452dd3818aa3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1849,8 +1849,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); if (unlikely(put_page_testzero(page))) { - __ClearPageLRU(page); - __ClearPageActive(page); + __clear_page_lru_flags(page); if (unlikely(PageCompound(page))) { spin_unlock_irq(&lruvec->lru_lock); -- cgit v1.2.3 From bc7112719e1e80e4208eef3fc9bd8d2b6c263e7d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:32 -0800 Subject: mm: VM_BUG_ON lru page flags Move scattered VM_BUG_ONs to two essential places that cover all lru list additions and deletions. 
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-8-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-8-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 4 ++++ mm/swap.c | 2 -- mm/vmscan.c | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ef3fd79222e5..6d907a4dd6ad 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -66,6 +66,8 @@ static inline enum lru_list page_lru_base_type(struct page *page) */ static __always_inline void __clear_page_lru_flags(struct page *page) { + VM_BUG_ON_PAGE(!PageLRU(page), page); + __ClearPageLRU(page); /* this shouldn't happen, so leave the flags to bad_page() */ @@ -87,6 +89,8 @@ static __always_inline enum lru_list page_lru(struct page *page) { enum lru_list lru; + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + if (PageUnevictable(page)) lru = LRU_UNEVICTABLE; else { diff --git a/mm/swap.c b/mm/swap.c index 1455fa56a0ee..ab3258afcbeb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -83,7 +83,6 @@ static void __page_cache_release(struct page *page) unsigned long flags; lruvec = lock_page_lruvec_irqsave(page, &flags); - VM_BUG_ON_PAGE(!PageLRU(page), page); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); @@ -909,7 +908,6 @@ void release_pages(struct page **pages, int nr) if (prev_lruvec != lruvec) lock_batch = 0; - VM_BUG_ON_PAGE(!PageLRU(page), page); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 452dd3818aa3..348a90096550 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4281,7 +4281,6 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { - VM_BUG_ON_PAGE(PageActive(page), page); del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); add_page_to_lru_list(page, lruvec); -- cgit v1.2.3 From c1770e34f3e7640887d8129fc05d13fe17101301 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:36 -0800 Subject: include/linux/mm_inline.h: fold page_lru_base_type() into its sole caller We've removed all other references to this function. Link: https://lore.kernel.org/linux-mm/20201207220949.830352-9-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-9-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6d907a4dd6ad..7183c7a03f09 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -45,21 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, #endif } -/** - * page_lru_base_type - which LRU list type should a page be on? - * @page: the page to test - * - * Used for LRU list index arithmetic. 
- * - * Returns the base LRU type - file or anon - @page should be on. - */ -static inline enum lru_list page_lru_base_type(struct page *page) -{ - if (page_is_file_lru(page)) - return LRU_INACTIVE_FILE; - return LRU_INACTIVE_ANON; -} - /** * __clear_page_lru_flags - clear page lru flags before releasing a page * @page: the page that was on lru and now has a zero reference @@ -92,12 +77,12 @@ static __always_inline enum lru_list page_lru(struct page *page) VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); if (PageUnevictable(page)) - lru = LRU_UNEVICTABLE; - else { - lru = page_lru_base_type(page); - if (PageActive(page)) - lru += LRU_ACTIVE; - } + return LRU_UNEVICTABLE; + + lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; + if (PageActive(page)) + lru += LRU_ACTIVE; + return lru; } -- cgit v1.2.3 From 289ccba18af436f2b65ec69b2be1b086ec9f24a4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:40 -0800 Subject: include/linux/mm_inline.h: fold __update_lru_size() into its sole caller All other references to the function were removed after commit a892cb6b977f ("mm/vmscan.c: use update_lru_size() in update_lru_sizes()"). Link: https://lore.kernel.org/linux-mm/20201207220949.830352-10-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-10-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7183c7a03f09..355ea1ee32bd 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -24,7 +24,7 @@ static inline int page_is_file_lru(struct page *page) return !PageSwapBacked(page); } -static __always_inline void __update_lru_size(struct lruvec *lruvec, +static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, int nr_pages) { @@ -33,13 +33,6 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); -} - -static __always_inline void update_lru_size(struct lruvec *lruvec, - enum lru_list lru, enum zone_type zid, - int nr_pages) -{ - __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif -- cgit v1.2.3 From 2091339d59e7808e9b39a79f48e3d17ef7389b97 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:44 -0800 Subject: mm/vmscan.c: make lruvec_lru_size() static All other references to the function were removed after commit b910718a948a ("mm: vmscan: detect file thrashing at the reclaim root"). 
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-11-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-11-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 -- mm/vmscan.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc99e9241846..9198b7ade85f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -892,8 +892,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); - #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else diff --git a/mm/vmscan.c b/mm/vmscan.c index 348a90096550..fea6b43bc1f9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) */ -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) +static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) { unsigned long size = 0; int zid; -- cgit v1.2.3 From d6995da311221a05c8aef3bda2629e5cb14c7302 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:08:51 -0800 Subject: hugetlb: use page.private for hugetlb specific page flags Patch series "create hugetlb flags to consolidate state", v3. While discussing a series of hugetlb fixes in [1], it became evident that the hugetlb specific page state information is stored in a somewhat haphazard manner. Code dealing with state information would be easier to read, understand and maintain if this information was stored in a consistent manner. This series uses page.private of the hugetlb head page for storing a set of hugetlb specific page flags. Routines are priovided for test, set and clear of the flags. [1] https://lore.kernel.org/r/20210106084739.63318-1-songmuchun@bytedance.com This patch (of 4): As hugetlbfs evolved, state information about hugetlb pages was added. One 'convenient' way of doing this was to use available fields in tail pages. Over time, it has become difficult to know the meaning or contents of fields simply by looking at a small bit of code. Sometimes, the naming is just confusing. For example: The PagePrivate flag indicates a huge page reservation was consumed and needs to be restored if an error is encountered and the page is freed before it is instantiated. The page.private field contains the pointer to a subpool if the page is associated with one. In an effort to make the code more readable, use page.private to contain hugetlb specific page flags. These flags will have test, set and clear functions similar to those used for 'normal' page flags. More importantly, an enum of flag values will be created with names that actually reflect their purpose. In this patch, - Create infrastructure for hugetlb specific page flag functions - Move subpool pointer to page[1].private to make way for flags Create routines with meaningful names to modify subpool field - Use new HPageRestoreReserve flag instead of PagePrivate Conversion of other state information will happen in subsequent patches. 
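A rough userspace model of the flag scheme (invented names, plain bit operations instead of the kernel's atomic test_bit/set_bit, no claim to match the real page layout): one unsigned long per head page holds named flag bits, and a single macro stamps out the test/set/clear helpers for each flag, which is what keeps the real header so compact.

#include <stdbool.h>

enum hflag { HF_RESTORE_RESERVE, HF_MIGRATABLE, NR_HFLAGS };

struct hpage {
        unsigned long priv;     /* stands in for page.private of the head page */
};

#define HFLAG(name, bit)                                                \
static bool Has##name(const struct hpage *p)                            \
        { return p->priv & (1UL << (bit)); }                            \
static void Set##name(struct hpage *p)                                  \
        { p->priv |= 1UL << (bit); }                                    \
static void Clear##name(struct hpage *p)                                \
        { p->priv &= ~(1UL << (bit)); }

HFLAG(RestoreReserve, HF_RESTORE_RESERVE)
HFLAG(Migratable, HF_MIGRATABLE)

int main(void)
{
        struct hpage p = { 0 };

        SetRestoreReserve(&p);
        SetMigratable(&p);
        if (HasRestoreReserve(&p))
                ClearRestoreReserve(&p);
        if (HasMigratable(&p))
                ClearMigratable(&p);
        return p.priv == 0 ? 0 : 1;
}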
Link: https://lkml.kernel.org/r/20210122195231.324857-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20210122195231.324857-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 +++------ include/linux/hugetlb.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 48 +++++++++++++++++----------------- 3 files changed, 96 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b7a72f577aab..907f6405e805 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -973,15 +973,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - /* - * page_private is subpool pointer in hugetlb pages. Transfer to - * new page. PagePrivate is not associated with page_private for - * hugetlb pages and can not be set here as only page_huge_active - * pages can be migrated. - */ - if (page_private(page)) { - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); + if (hugetlb_page_subpool(page)) { + hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); + hugetlb_set_page_subpool(page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac3cb239a7ec..249c65e1d8ca 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -472,6 +472,60 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +/* + * huegtlb page specific state flags. These flags are located in page.private + * of the hugetlb head page. Functions created via the below macros should be + * used to manipulate these flags. + * + * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at + * allocation time. Cleared when page is fully instantiated. Free + * routine checks flag to restore a reservation on error paths. + */ +enum hugetlb_page_flags { + HPG_restore_reserve = 0, + __NR_HPAGEFLAGS, +}; + +/* + * Macros to create test, set and clear function definitions for + * hugetlb specific page flags. 
+ */ +#ifdef CONFIG_HUGETLB_PAGE +#define TESTHPAGEFLAG(uname, flname) \ +static inline int HPage##uname(struct page *page) \ + { return test_bit(HPG_##flname, &(page->private)); } + +#define SETHPAGEFLAG(uname, flname) \ +static inline void SetHPage##uname(struct page *page) \ + { set_bit(HPG_##flname, &(page->private)); } + +#define CLEARHPAGEFLAG(uname, flname) \ +static inline void ClearHPage##uname(struct page *page) \ + { clear_bit(HPG_##flname, &(page->private)); } +#else +#define TESTHPAGEFLAG(uname, flname) \ +static inline int HPage##uname(struct page *page) \ + { return 0; } + +#define SETHPAGEFLAG(uname, flname) \ +static inline void SetHPage##uname(struct page *page) \ + { } + +#define CLEARHPAGEFLAG(uname, flname) \ +static inline void ClearHPage##uname(struct page *page) \ + { } +#endif + +#define HPAGEFLAG(uname, flname) \ + TESTHPAGEFLAG(uname, flname) \ + SETHPAGEFLAG(uname, flname) \ + CLEARHPAGEFLAG(uname, flname) \ + +/* + * Create functions associated with hugetlb page flags + */ +HPAGEFLAG(RestoreReserve, restore_reserve) + #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 @@ -531,6 +585,20 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) +/* + * hugetlb page subpool pointer located in hpage[1].private + */ +static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) +{ + return (struct hugepage_subpool *)(hpage+1)->private; +} + +static inline void hugetlb_set_page_subpool(struct page *hpage, + struct hugepage_subpool *subpool) +{ + set_page_private(hpage+1, (unsigned long)subpool); +} + static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8a1d1f85e21e..f5e85dabb7a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1143,7 +1143,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetPagePrivate(page); + SetHPageRestoreReserve(page); h->resv_huge_pages--; } @@ -1418,20 +1418,19 @@ static void __free_huge_page(struct page *page) */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct hugepage_subpool *spool = - (struct hugepage_subpool *)page_private(page); + struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); - set_page_private(page, 0); + hugetlb_set_page_subpool(page, NULL); page->mapping = NULL; - restore_reserve = PagePrivate(page); - ClearPagePrivate(page); + restore_reserve = HPageRestoreReserve(page); + ClearHPageRestoreReserve(page); /* - * If PagePrivate() was set on page, page allocation consumed a + * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the @@ -2263,24 +2262,24 @@ static long vma_add_reservation(struct hstate *h, * This routine is called to restore a reservation on error paths. In the * specific error paths, a huge page was allocated (via alloc_huge_page) * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set PagePrivate - * in the newly allocated page. 
When the page is freed via free_huge_page, - * the global reservation count will be incremented if PagePrivate is set. - * However, free_huge_page can not adjust the reserve map. Adjust the - * reserve map here to be consistent with global reserve count adjustments - * to be made by free_huge_page. + * alloc_huge_page would have consumed the reservation and set + * HPageRestoreReserve in the newly allocated page. When the page is freed + * via free_huge_page, the global reservation count will be incremented if + * HPageRestoreReserve is set. However, free_huge_page can not adjust the + * reserve map. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. */ static void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { - if (unlikely(PagePrivate(page))) { + if (unlikely(HPageRestoreReserve(page))) { long rc = vma_needs_reservation(h, vma, address); if (unlikely(rc < 0)) { /* * Rare out of memory condition in reserve map - * manipulation. Clear PagePrivate so that + * manipulation. Clear HPageRestoreReserve so that * global reserve count will not be incremented * by free_huge_page. This will make it appear * as though the reservation for this page was @@ -2289,7 +2288,7 @@ static void restore_reserve_on_error(struct hstate *h, * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); } else if (rc) { rc = vma_add_reservation(h, vma, address); if (unlikely(rc < 0)) @@ -2297,7 +2296,7 @@ static void restore_reserve_on_error(struct hstate *h, * See above comment about rare out of * memory condition. */ - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); } else vma_end_reservation(h, vma, address); } @@ -2378,7 +2377,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetPagePrivate(page); + SetHPageRestoreReserve(page); h->resv_huge_pages--; } spin_lock(&hugetlb_lock); @@ -2396,7 +2395,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); - set_page_private(page, (unsigned long)spool); + hugetlb_set_page_subpool(page, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3170,6 +3169,9 @@ static int __init hugetlb_init(void) { int i; + BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < + __NR_HPAGEFLAGS); + if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); @@ -4207,7 +4209,7 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearPagePrivate(new_page); + ClearHPageRestoreReserve(new_page); /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); @@ -4274,7 +4276,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, if (err) return err; - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); /* * set page dirty so that it will not be removed from cache/file @@ -4436,7 +4438,7 @@ retry: goto backout; if (anon_rmap) { - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); @@ -4750,7 +4752,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct 
*dst_mm, if (vm_shared) { page_dup_rmap(page, true); } else { - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } -- cgit v1.2.3 From 8f251a3d5ce3bdea73bd045ed35db64f32e0d0d9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:08:56 -0800 Subject: hugetlb: convert page_huge_active() HPageMigratable flag Use the new hugetlb page specific flag HPageMigratable to replace the page_huge_active interfaces. By it's name, page_huge_active implied that a huge page was on the active list. However, that is not really what code checking the flag wanted to know. It really wanted to determine if the huge page could be migrated. This happens when the page is actually added to the page cache and/or task page table. This is the reasoning behind the name change. The VM_BUG_ON_PAGE() calls in the *_huge_active() interfaces are not really necessary as we KNOW the page is a hugetlb page. Therefore, they are removed. The routine page_huge_active checked for PageHeadHuge before testing the active bit. This is unnecessary in the case where we hold a reference or lock and know it is a hugetlb head page. page_huge_active is also called without holding a reference or lock (scan_movable_pages), and can race with code freeing the page. The extra check in page_huge_active shortened the race window, but did not prevent the race. Offline code calling scan_movable_pages already deals with these races, so removing the check is acceptable. Add comment to racy code. [songmuchun@bytedance.com: remove set_page_huge_active() declaration from include/linux/hugetlb.h] Link: https://lkml.kernel.org/r/CAMZfGtUda+KoAZscU0718TN61cSFwp4zy=y2oZ=+6Z2TAZZwng@mail.gmail.com Link: https://lkml.kernel.org/r/20210122195231.324857-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 7 +++++-- include/linux/page-flags.h | 6 ------ mm/hugetlb.c | 45 +++++++++++---------------------------------- mm/memory_hotplug.c | 9 ++++++++- 5 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 907f6405e805..5a8ed6bd4f87 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -735,7 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); - set_page_huge_active(page); + SetHPageMigratable(page); /* * unlock_page because locked by add_to_page_cache() * put_page() due to reference from alloc_huge_page() diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 249c65e1d8ca..77f2a032fe32 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -480,9 +480,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. + * HPG_migratable - Set after a newly allocated page is added to the page + * cache and/or page tables. Indicates the page is a candidate for + * migration. 
*/ enum hugetlb_page_flags { HPG_restore_reserve = 0, + HPG_migratable, __NR_HPAGEFLAGS, }; @@ -525,6 +529,7 @@ static inline void ClearHPage##uname(struct page *page) \ * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) +HPAGEFLAG(Migratable, migratable) #ifdef CONFIG_HUGETLB_PAGE @@ -838,8 +843,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif -void set_page_huge_active(struct page *page); - #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ec5d0290e0ee..db914477057b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -592,15 +592,9 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); -bool page_huge_active(struct page *page); #else TESTPAGEFLAG_FALSE(Huge) TESTPAGEFLAG_FALSE(HeadHuge) - -static inline bool page_huge_active(struct page *page) -{ - return 0; -} #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f5e85dabb7a3..727c09713627 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1364,30 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -/* - * Test to determine whether the hugepage is "active/in-use" (i.e. being linked - * to hstate->hugepage_activelist.) - * - * This function can be called for tail pages, but never returns true for them. - */ -bool page_huge_active(struct page *page) -{ - return PageHeadHuge(page) && PagePrivate(&page[1]); -} - -/* never called for tail page */ -void set_page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHeadHuge(page), page); - SetPagePrivate(&page[1]); -} - -static void clear_page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHeadHuge(page), page); - ClearPagePrivate(&page[1]); -} - /* * Internal hugetlb specific page flag. Do not use outside of the hugetlb * code @@ -1449,7 +1425,7 @@ static void __free_huge_page(struct page *page) } spin_lock(&hugetlb_lock); - clear_page_huge_active(page); + ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), @@ -4218,7 +4194,7 @@ retry_avoidcopy: make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); - set_page_huge_active(new_page); + SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; } @@ -4455,12 +4431,12 @@ retry: spin_unlock(ptl); /* - * Only make newly allocated pages active. Existing pages found - * in the pagecache could be !page_huge_active() if they have been - * isolated for migration. + * Only set HPageMigratable in newly allocated pages. Existing pages + * found in the pagecache may not have HPageMigratableset if they have + * been isolated for migration. 
*/ if (new_page) - set_page_huge_active(page); + SetHPageMigratable(page); unlock_page(page); out: @@ -4771,7 +4747,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - set_page_huge_active(page); + SetHPageMigratable(page); if (vm_shared) unlock_page(page); ret = 0; @@ -5610,12 +5586,13 @@ bool isolate_huge_page(struct page *page, struct list_head *list) bool ret = true; spin_lock(&hugetlb_lock); - if (!PageHeadHuge(page) || !page_huge_active(page) || + if (!PageHeadHuge(page) || + !HPageMigratable(page) || !get_page_unless_zero(page)) { ret = false; goto unlock; } - clear_page_huge_active(page); + ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: spin_unlock(&hugetlb_lock); @@ -5625,7 +5602,7 @@ unlock: void putback_active_hugepage(struct page *page) { spin_lock(&hugetlb_lock); - set_page_huge_active(page); + SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); put_page(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ddcb1cd24c60..abe43c1ae920 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1260,7 +1260,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end, if (!PageHuge(page)) continue; head = compound_head(page); - if (page_huge_active(head)) + /* + * This test is racy as we hold no reference or lock. The + * hugetlb page could have been free'ed and head is no longer + * a hugetlb page before the following check. In such unlikely + * cases false positives and negatives are possible. Calling + * code must deal with these scenarios. + */ + if (HPageMigratable(head)) goto found; skip = compound_nr(head) - (page - head); pfn += skip - 1; -- cgit v1.2.3 From 9157c31186c358c5750dea50ac5705d61d7fc917 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:00 -0800 Subject: hugetlb: convert PageHugeTemporary() to HPageTemporary flag Use new hugetlb specific HPageTemporary flag to replace the PageHugeTemporary() interfaces. PageHugeTemporary does contain a PageHuge() check. However, this interface is only used within hugetlb code where we know we are dealing with a hugetlb page. Therefore, the check can be eliminated. Link: https://lkml.kernel.org/r/20210122195231.324857-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++++ mm/hugetlb.c | 36 +++++++----------------------------- 2 files changed, 13 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 77f2a032fe32..e97498a21010 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -483,10 +483,15 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. + * HPG_temporary - - Set on a page that is temporarily allocated from the buddy + * allocator. Typically used for migration target pages when no pages + * are available in the pool. The hugetlb free page path will + * immediately free pages with this flag set to the buddy allocator. 
*/ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, + HPG_temporary, __NR_HPAGEFLAGS, }; @@ -530,6 +535,7 @@ static inline void ClearHPage##uname(struct page *page) \ */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) +HPAGEFLAG(Temporary, temporary) #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 727c09713627..4d7d4535a313 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1364,28 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -/* - * Internal hugetlb specific page flag. Do not use outside of the hugetlb - * code - */ -static inline bool PageHugeTemporary(struct page *page) -{ - if (!PageHuge(page)) - return false; - - return (unsigned long)page[2].mapping == -1U; -} - -static inline void SetPageHugeTemporary(struct page *page) -{ - page[2].mapping = (void *)-1U; -} - -static inline void ClearPageHugeTemporary(struct page *page) -{ - page[2].mapping = NULL; -} - static void __free_huge_page(struct page *page) { /* @@ -1433,9 +1411,9 @@ static void __free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (PageHugeTemporary(page)) { + if (HPageTemporary(page)) { list_del(&page->lru); - ClearPageHugeTemporary(page); + ClearHPageTemporary(page); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ @@ -1869,7 +1847,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetPageHugeTemporary(page); + SetHPageTemporary(page); spin_unlock(&hugetlb_lock); put_page(page); return NULL; @@ -1900,7 +1878,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetPageHugeTemporary(page); + SetHPageTemporary(page); return page; } @@ -5625,12 +5603,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. */ - if (PageHugeTemporary(newpage)) { + if (HPageTemporary(newpage)) { int old_nid = page_to_nid(oldpage); int new_nid = page_to_nid(newpage); - SetPageHugeTemporary(oldpage); - ClearPageHugeTemporary(newpage); + SetHPageTemporary(oldpage); + ClearHPageTemporary(newpage); spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { -- cgit v1.2.3 From 6c037149014027d50175da5be4ae4531374dcbe0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:04 -0800 Subject: hugetlb: convert PageHugeFreed to HPageFreed flag Use new hugetlb specific HPageFreed flag to replace the PageHugeFreed interfaces. 
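Sketch of the before/after, as a simplified userspace model (made-up names and field layout): the old code recorded "freed" by writing a -1 sentinel into a field borrowed from a tail page, while the new code is just one more named bit in the same per-page flags word the other HPG_* flags live in.

#include <stdbool.h>

struct hpage {
        unsigned long priv;             /* head page flags word */
        unsigned long tail_priv[4];     /* stands in for fields in the tail pages */
};

/* Old style: a magic sentinel stored in a tail page field. */
static bool old_is_freed(const struct hpage *p)
{
        return p->tail_priv[3] == (unsigned long)-1;
}

/* New style: an ordinary, named bit in the shared flags word. */
#define HF_FREED (1UL << 3)

static bool new_is_freed(const struct hpage *p)
{
        return p->priv & HF_FREED;
}

int main(void)
{
        struct hpage p = { .priv = HF_FREED };

        return new_is_freed(&p) && !old_is_freed(&p) ? 0 : 1;
}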
Link: https://lkml.kernel.org/r/20210122195231.324857-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 23 ++++------------------- 2 files changed, 7 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e97498a21010..eb2682fd97b6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -487,11 +487,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. + * HPG_freed - Set when page is on the free lists. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, + HPG_freed, __NR_HPAGEFLAGS, }; @@ -536,6 +538,7 @@ static inline void ClearHPage##uname(struct page *page) \ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) +HPAGEFLAG(Freed, freed) #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4d7d4535a313..5377e1dad044 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -79,21 +79,6 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; -static inline bool PageHugeFreed(struct page *head) -{ - return page_private(head + 4) == -1UL; -} - -static inline void SetPageHugeFreed(struct page *head) -{ - set_page_private(head + 4, -1UL); -} - -static inline void ClearPageHugeFreed(struct page *head) -{ - set_page_private(head + 4, 0); -} - /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -1053,7 +1038,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; - SetPageHugeFreed(page); + SetHPageFreed(page); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1070,7 +1055,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); - ClearPageHugeFreed(page); + ClearHPageFreed(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; @@ -1485,7 +1470,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_lock(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; - ClearPageHugeFreed(page); + ClearHPageFreed(page); spin_unlock(&hugetlb_lock); } @@ -1756,7 +1741,7 @@ retry: * We should make sure that the page is already on the free list * when it is dissolved. */ - if (unlikely(!PageHugeFreed(head))) { + if (unlikely(!HPageFreed(head))) { spin_unlock(&hugetlb_lock); cond_resched(); -- cgit v1.2.3 From d95c0337774b1dc74d271e7475a96fe8838332ea Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:08 -0800 Subject: include/linux/hugetlb.h: add synchronization information for new hugetlb specific flags Add comments, no functional change. 
Link: https://lkml.kernel.org/r/62a80585-2a73-10cc-4a2d-5721540d4ad2@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index eb2682fd97b6..3a541c6cf5c3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -480,14 +480,24 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. + * Synchronization: Examined or modified by code that knows it has + * the only reference to page. i.e. After allocation but before use + * or when the page is being freed. * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. + * Synchronization: Initially set after new page allocation with no + * locking. When examined and modified during migration processing + * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. + * Synchronization: Can be set after huge page allocation from buddy when + * code knows it has only reference. All other examinations and + * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. + * Synchronization: hugetlb_lock held for examination and modification. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, -- cgit v1.2.3 From 33b8f84a4ee78491a8f4f9e4c5520c9da4a10983 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:54 -0800 Subject: mm/hugetlb: change hugetlb_reserve_pages() to type bool While reviewing a bug in hugetlb_reserve_pages, it was noticed that all callers ignore the return value. Any failure is considered an ENOMEM error by the callers. Change the function to be of type bool. The function will return true if the reservation was successful, false otherwise. Callers currently assume a zero return code indicates success. Change the callers to look for true to indicate success. No functional change, only code cleanup. 
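A tiny sketch of the conversion and the caller-side inversion it forces (generic names, not the hugetlbfs code): when every caller maps any nonzero return to the same -ENOMEM, the distinct error codes carry no information, so the function can simply report success or failure, and callers flip from 'if (ret)' meaning failure to 'if (!ok)'.

#include <stdbool.h>
#include <stdio.h>

/* Before: 0 on success, some -errno on failure (callers never looked at which). */
static int reserve_old(long from, long to)
{
        return from <= to ? 0 : -22;    /* -EINVAL-style */
}

/* After: true on success, false on failure. */
static bool reserve_new(long from, long to)
{
        return from <= to;
}

int main(void)
{
        /* Note the inverted test at each call site. */
        if (reserve_old(2, 1))
                printf("old: reservation failed\n");
        if (!reserve_new(2, 1))
                printf("new: reservation failed\n");
        return 0;
}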
From 33b8f84a4ee78491a8f4f9e4c5520c9da4a10983 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Wed, 24 Feb 2021 12:09:54 -0800
Subject: mm/hugetlb: change hugetlb_reserve_pages() to type bool

While reviewing a bug in hugetlb_reserve_pages, it was noticed that all
callers ignore the return value. Any failure is considered an ENOMEM error
by the callers.

Change the function to be of type bool. The function will return true if
the reservation was successful, false otherwise. Callers currently assume
a zero return code indicates success. Change the callers to look for true
to indicate success. No functional change, only code cleanup.

Link: https://lkml.kernel.org/r/20201221192542.15732-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz
Reviewed-by: Matthew Wilcox (Oracle)
Cc: David Hildenbrand
Cc: Dan Carpenter
Cc: Michal Hocko
Cc: Davidlohr Bueso
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c    |  4 ++--
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            | 37 ++++++++++++++-----------------------
 3 files changed, 17 insertions(+), 26 deletions(-)
(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5a8ed6bd4f87..3eca85a4d940 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
	file_accessed(file);

	ret = -ENOMEM;
-	if (hugetlb_reserve_pages(inode,
+	if (!hugetlb_reserve_pages(inode,
				vma->vm_pgoff >> huge_page_order(h),
				len >> huge_page_shift(h), vma,
				vma->vm_flags))
@@ -1493,7 +1493,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
	inode->i_size = size;
	clear_nlink(inode);

-	if (hugetlb_reserve_pages(inode, 0,
+	if (!hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		file = ERR_PTR(-ENOMEM);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3a541c6cf5c3..cccd1aab69dd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
				unsigned long dst_addr,
				unsigned long src_addr,
				struct page **pagep);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
			struct vm_area_struct *vma,
			vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8e5f494291c6..8fb42c6dd74b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5016,12 +5016,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
	return pages << h->order;
 }

-int hugetlb_reserve_pages(struct inode *inode,
+/* Return true if reservation was successful, false otherwise. */
+bool hugetlb_reserve_pages(struct inode *inode,
					long from, long to,
					struct vm_area_struct *vma,
					vm_flags_t vm_flags)
 {
-	long ret, chg, add = -1;
+	long chg, add = -1;
	struct hstate *h = hstate_inode(inode);
	struct hugepage_subpool *spool = subpool_inode(inode);
	struct resv_map *resv_map;
@@ -5031,7 +5032,7 @@ int hugetlb_reserve_pages(struct inode *inode,
	/* This should never happen */
	if (from > to) {
		VM_WARN(1, "%s called with a negative range\n", __func__);
-		return -EINVAL;
+		return false;
	}

	/*
@@ -5040,7 +5041,7 @@
	 * without using reserves
	 */
	if (vm_flags & VM_NORESERVE)
-		return 0;
+		return true;

	/*
	 * Shared mappings base their reservation on the number of pages that
@@ -5062,7 +5063,7 @@
		/* Private mapping. */
		resv_map = resv_map_alloc();
		if (!resv_map)
-			return -ENOMEM;
+			return false;

		chg = to - from;

@@ -5070,18 +5071,12 @@
		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
	}

-	if (chg < 0) {
-		ret = chg;
+	if (chg < 0)
		goto out_err;
-	}

-	ret = hugetlb_cgroup_charge_cgroup_rsvd(
-		hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
-
-	if (ret < 0) {
-		ret = -ENOMEM;
+	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
+				chg * pages_per_huge_page(h), &h_cg) < 0)
		goto out_err;
-	}

	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
		/* For private mappings, the hugetlb_cgroup uncharge info hangs
@@ -5096,19 +5091,15 @@
	 * reservations already in place (gbl_reserve).
	 */
	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
-	if (gbl_reserve < 0) {
-		ret = -ENOSPC;
+	if (gbl_reserve < 0)
		goto out_uncharge_cgroup;
-	}

	/*
	 * Check enough hugepages are available for the reservation.
	 * Hand the pages back to the subpool if there are not
	 */
-	ret = hugetlb_acct_memory(h, gbl_reserve);
-	if (ret < 0) {
+	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
		goto out_put_pages;
-	}

	/*
	 * Account for the reservations made. Shared mappings record regions
@@ -5126,7 +5117,6 @@

		if (unlikely(add < 0)) {
			hugetlb_acct_memory(h, -gbl_reserve);
-			ret = add;
			goto out_put_pages;
		} else if (unlikely(chg > add)) {
			/*
@@ -5147,7 +5137,8 @@
			hugetlb_acct_memory(h, -rsv_adjust);
		}
	}
-	return 0;
+	return true;
+
 out_put_pages:
	/* put back original number of pages, chg */
	(void)hugepage_subpool_put_pages(spool, chg);
@@ -5163,7 +5154,7 @@ out_err:
	region_abort(resv_map, from, to, regions_needed);
	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
		kref_put(&resv_map->refs, resv_map_release);
-	return ret;
+	return false;
 }

 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
-- cgit v1.2.3
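Since every caller already collapses any failure to ENOMEM, the specific negative error code carries no information, which is why the int return can become a bool. The sketch below is not the kernel function; demo_reserve_pages and its limits are invented to show only the calling-convention change (return true on success, callers test with !):

/* Build: cc -Wall -o reserve_bool_demo reserve_bool_demo.c */
#include <stdbool.h>
#include <stdio.h>

/* Before the change this would have returned 0 / -EINVAL / -ENOMEM /
 * -ENOSPC; afterwards the caller only learns success or failure. */
static bool demo_reserve_pages(long from, long to)
{
	if (from > to)			/* invalid range */
		return false;
	if (to - from > 1024)		/* pretend the pool is exhausted */
		return false;
	return true;			/* reservation succeeded */
}

int main(void)
{
	/* Caller pattern after the conversion: test for false, not for a
	 * non-zero error code. */
	if (!demo_reserve_pages(0, 16)) {
		fprintf(stderr, "reservation failed: treat as ENOMEM\n");
		return 1;
	}
	puts("reserved");
	return 0;
}
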
From a553e3cd2053501b658feec2be9a3b662eb1b22b Mon Sep 17 00:00:00 2001
From: Chengyang Fan
Date: Wed, 24 Feb 2021 12:10:28 -0800
Subject: mm/migrate: remove unneeded semicolons

Remove superfluous semicolons after function definitions.

Link: https://lkml.kernel.org/r/20210115110131.2359683-1-cy.fan@huawei.com
Signed-off-by: Chengyang Fan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/migrate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838a0f7c..3a389633b68f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -89,7 +89,7 @@ extern int PageMovable(struct page *page);
 extern void __SetPageMovable(struct page *page, struct address_space *mapping);
 extern void __ClearPageMovable(struct page *page);
 #else
-static inline int PageMovable(struct page *page) { return 0; };
+static inline int PageMovable(struct page *page) { return 0; }
 static inline void __SetPageMovable(struct page *page,
				struct address_space *mapping)
 {
-- cgit v1.2.3
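For reference, the trailing ';' being removed is an empty declaration at file scope, which ISO C does not permit; pedantic builds and some static checkers complain about it, which is one practical reason such cleanups get sent. A two-line illustration with invented names (compile with something like cc -Wall -Wpedantic -c to see the diagnostic):

/* demo_semicolon.c: the first stub carries a superfluous ';', the second
 * is the cleaned-up form matching the patch above. */
static inline int demo_page_movable_bad(void) { return 0; };	/* extra ';' flagged by -Wpedantic */
static inline int demo_page_movable_ok(void)  { return 0; }	/* clean */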