From 76470ccd62f18bfa0954bec10f2329339f793914 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 13 Aug 2019 15:37:04 -0700 Subject: mm: document zone device struct page field usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/hmm: fixes for device private page migration", v3. Testing the latest linux git tree turned up a few bugs with page migration to and from ZONE_DEVICE private and anonymous pages. Hopefully it clarifies how ZONE_DEVICE private struct page uses the same mapping and index fields from the source anonymous page mapping. This patch (of 3): Struct page for ZONE_DEVICE private pages uses the page->mapping and and page->index fields while the source anonymous pages are migrated to device private memory. This is so rmap_walk() can find the page when migrating the ZONE_DEVICE private page back to system memory. ZONE_DEVICE pmem backed fsdax pages also use the page->mapping and page->index fields when files are mapped into a process address space. Add comments to struct page and remove the unused "_zd_pad_1" field to make this more clear. Link: http://lkml.kernel.org/r/20190724232700.23327-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a37a89eb7a7..6a7a1083b6fb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,7 +159,16 @@ struct page { /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; void *zone_device_data; - unsigned long _zd_pad_1; /* uses mapping */ + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. + * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ }; /** @rcu_head: You can use this to free a page by RCU. */ -- cgit v1.2.3 From ec9f02384f6053f2a5417e82b65078edc5364a8d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 13 Aug 2019 15:37:41 -0700 Subject: mm: workingset: fix vmstat counters for shadow nodes Memcg counters for shadow nodes are broken because the memcg pointer is obtained in a wrong way. The following approach is used: virt_to_page(xa_node)->mem_cgroup Since commit 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") page->mem_cgroup pointer isn't set for slab pages, so memcg_from_slab_page() should be used instead. Also I doubt that it ever worked correctly: virt_to_head_page() should be used instead of virt_to_page(). Otherwise objects residing on tail pages are not accounted, because only the head page contains a valid mem_cgroup pointer. That was a case since the introduction of these counters by the commit 68d48e6a2df5 ("mm: workingset: add vmstat counter for shadow nodes"). Link: http://lkml.kernel.org/r/20190801233532.138743-1-guro@fb.com Fixes: 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Vladimir Davydov Cc: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++++++++++ mm/memcontrol.c | 20 ++++++++++++++++++++ mm/workingset.c | 10 ++++------ 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44c41462be33..2cd4359cb38c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -668,6 +668,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) @@ -1072,6 +1073,14 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } +static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + struct page *page = virt_to_head_page(p); + + __mod_node_page_state(page_pgdat(page), idx, val); +} + static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1159,6 +1168,16 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +{ + __mod_lruvec_slab_state(p, idx, 1); +} + +static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +{ + __mod_lruvec_slab_state(p, idx, -1); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_state(struct mem_cgroup *memcg, int idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e405e058eda..6f5c0c517c49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -768,6 +768,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +{ + struct page *page = virt_to_head_page(p); + pg_data_t *pgdat = page_pgdat(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = memcg_from_slab_page(page); + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!memcg || memcg == root_mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + } else { + lruvec = mem_cgroup_lruvec(pgdat, memcg); + __mod_lruvec_state(lruvec, idx, val); + } + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup diff --git a/mm/workingset.c b/mm/workingset.c index e0b4edcb88c8..c963831d354f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -380,14 +380,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_page_state(virt_to_page(node), - WORKINGSET_NODES); + __inc_lruvec_slab_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_page_state(virt_to_page(node), - WORKINGSET_NODES); + __dec_lruvec_slab_state(node, WORKINGSET_NODES); } } } @@ -480,7 +478,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + __dec_lruvec_slab_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -503,7 +501,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * shadow entries we were tracking ... */ xas_store(&xas, NULL); - __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3 From 0cfaee2af3a04c0be5f056cebe5f804dedc59a43 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 13 Aug 2019 15:37:47 -0700 Subject: include/asm-generic/5level-fixup.h: fix variable 'p4d' set but not used A compiler throws a warning on an arm64 system since commit 9849a5697d3d ("arch, mm: convert all architectures to use 5level-fixup.h"), mm/kasan/init.c: In function 'kasan_free_p4d': mm/kasan/init.c:344:9: warning: variable 'p4d' set but not used [-Wunused-but-set-variable] p4d_t *p4d; ^~~ because p4d_none() in "5level-fixup.h" is compiled away while it is a static inline function in "pgtable-nopud.h". However, if converted p4d_none() to a static inline there, powerpc would be unhappy as it reads those in assembler language in "arch/powerpc/include/asm/book3s/64/pgtable.h", so it needs to skip assembly include for the static inline C function. While at it, converted a few similar functions to be consistent with the ones in "pgtable-nopud.h". Link: http://lkml.kernel.org/r/20190806232917.881-1-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Arnd Bergmann Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index bb6cb347018c..f6947da70d71 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -19,9 +19,24 @@ #define p4d_alloc(mm, pgd, address) (pgd) #define p4d_offset(pgd, start) (pgd) -#define p4d_none(p4d) 0 -#define p4d_bad(p4d) 0 -#define p4d_present(p4d) 1 + +#ifndef __ASSEMBLY__ +static inline int p4d_none(p4d_t p4d) +{ + return 0; +} + +static inline int p4d_bad(p4d_t p4d) +{ + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + return 1; +} +#endif + #define p4d_ERROR(p4d) do { } while (0) #define p4d_clear(p4d) pgd_clear(p4d) #define p4d_val(p4d) pgd_val(p4d) -- cgit v1.2.3 From 92717d429b38e4f9f934eed7e605cc42858f1839 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 13 Aug 2019 15:37:50 -0700 Subject: Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"" Patch series "reapply: relax __GFP_THISNODE for MADV_HUGEPAGE mappings". The fixes for what was originally reported as "pathological THP behavior" we rightfully reverted to be sure not to introduced regressions at end of a merge window after a severe regression report from the kernel bot. We can safely re-apply them now that we had time to analyze the problem. The mm process worked fine, because the good fixes were eventually committed upstream without excessive delay. The regression reported by the kernel bot however forced us to revert the good fixes to be sure not to introduce regressions and to give us the time to analyze the issue further. The silver lining is that this extra time allowed to think more at this issue and also plan for a future direction to improve things further in terms of THP NUMA locality. This patch (of 2): This reverts commit 356ff8a9a78fb35d ("Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). So it reapplies 89c83fb539f954 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). Consolidation of the THP allocation flags at the same place was meant to be a clean up to easier handle otherwise scattered code which is imposing a maintenance burden. There were no real problems observed with the gfp mask consolidation but the reversion was rushed through without a larger consensus regardless. This patch brings the consolidation back because this should make the long term maintainability easier as well as it should allow future changes to be less error prone. [mhocko@kernel.org: changelog additions] Link: http://lkml.kernel.org/r/20190503223146.2312-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Rientjes Cc: Zi Yan Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++-------- mm/huge_memory.c | 27 ++++++++++++++------------- mm/mempolicy.c | 32 +++----------------------------- mm/shmem.c | 2 +- 4 files changed, 22 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fb07b503dc45..f33881688f42 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ - alloc_pages(gfp_mask, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) + alloc_pages_vma(gfp_mask, 0, vma, addr, node) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1334ede667a8..f7e388b8662d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -644,30 +644,30 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | __GFP_THISNODE | + (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + return gfp_mask | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | - (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | - (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - return GFP_TRANSHUGE_LIGHT; + return gfp_mask; } /* Caller must hold page table lock. */ @@ -739,8 +739,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1347,8 +1347,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, + haddr, numa_node_id()); } else new_page = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 547cd403ed02..9c9877a43d58 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1180,8 +1180,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, - HPAGE_PMD_ORDER); + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, + address, numa_node_id()); if (!thp) return NULL; prep_transhuge_page(thp); @@ -2083,7 +2083,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. @@ -2094,7 +2093,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, int node) { struct mempolicy *pol; struct page *page; @@ -2112,31 +2111,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; - - /* - * For hugepage allocation and non-interleave policy which - * allows the current node (or other explicitly preferred - * node) we only try to allocate from the current/preferred - * node and don't fall back to other nodes, as the cost of - * remote accesses would likely offset THP benefits. - * - * If the policy is interleave, or does not allow the current - * node in its nodemask, we allocate the standard way. - */ - if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); - page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); - goto out; - } - } - nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/shmem.c b/mm/shmem.c index 626d8c74b973..2bed4761f279 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1466,7 +1466,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); -- cgit v1.2.3 From a8282608c88e08b1782141026eab61204c1e533f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 13 Aug 2019 15:37:53 -0700 Subject: Revert "mm, thp: restore node-local hugepage allocations" This reverts commit 2f0799a0ffc033b ("mm, thp: restore node-local hugepage allocations"). commit 2f0799a0ffc033b was rightfully applied to avoid the risk of a severe regression that was reported by the kernel test robot at the end of the merge window. Now we understood the regression was a false positive and was caused by a significant increase in fairness during a swap trashing benchmark. So it's safe to re-apply the fix and continue improving the code from there. The benchmark that reported the regression is very useful, but it provides a meaningful result only when there is no significant alteration in fairness during the workload. The removal of __GFP_THISNODE increased fairness. __GFP_THISNODE cannot be used in the generic page faults path for new memory allocations under the MPOL_DEFAULT mempolicy, or the allocation behavior significantly deviates from what the MPOL_DEFAULT semantics are supposed to be for THP and 4k allocations alike. Setting THP defrag to "always" or using MADV_HUGEPAGE (with THP defrag set to "madvise") has never meant to provide an implicit MPOL_BIND on the "current" node the task is running on, causing swap storms and providing a much more aggressive behavior than even zone_reclaim_node = 3. Any workload who could have benefited from __GFP_THISNODE has now to enable zone_reclaim_mode=1||2||3. __GFP_THISNODE implicitly provided the zone_reclaim_mode behavior, but it only did so if THP was enabled: if THP was disabled, there would have been no chance to get any 4k page from the current node if the current node was full of pagecache, which further shows how this __GFP_THISNODE was misplaced in MADV_HUGEPAGE. MADV_HUGEPAGE has never been intended to provide any zone_reclaim_mode semantics, in fact the two are orthogonal, zone_reclaim_mode = 1|2|3 must work exactly the same with MADV_HUGEPAGE set or not. The performance characteristic of memory depends on the hardware details. The numbers below are obtained on Naples/EPYC architecture and the N/A projection extends them to show what we should aim for in the future as a good THP NUMA locality default. The benchmark used exercises random memory seeks (note: the cost of the page faults is not part of the measurement). D0 THP | D0 4k | D1 THP | D1 4k | D2 THP | D2 4k | D3 THP | D3 4k | ... 0% | +43% | +45% | +106% | +131% | +224% | N/A | N/A D0 means distance zero (i.e. local memory), D1 means distance one (i.e. intra socket memory), D2 means distance two (i.e. inter socket memory), etc... For the guest physical memory allocated by qemu and for guest mode kernel the performance characteristic of RAM is more complex and an ideal default could be: D0 THP | D1 THP | D0 4k | D2 THP | D1 4k | D3 THP | D2 4k | D3 4k | ... 0% | +58% | +101% | N/A | +222% | N/A | N/A | N/A NOTE: the N/A are projections and haven't been measured yet, the measurement in this case is done on a 1950x with only two NUMA nodes. The THP case here means THP was used both in the host and in the guest. After applying this commit the THP NUMA locality order that we'll get out of MADV_HUGEPAGE is this: D0 THP | D1 THP | D2 THP | D3 THP | ... | D0 4k | D1 4k | D2 4k | D3 4k | ... Before this commit it was: D0 THP | D0 4k | D1 4k | D2 4k | D3 4k | ... Even if we ignore the breakage of large workloads that can't fit in a single node that the __GFP_THISNODE implicit "current node" mbind caused, the THP NUMA locality order provided by __GFP_THISNODE was still not the one we shall aim for in the long term (i.e. the first one at the top). After this commit is applied, we can introduce a new allocator multi order API and to replace those two alloc_pages_vmas calls in the page fault path, with a single multi order call: unsigned int order = (1 << HPAGE_PMD_ORDER) | (1 << 0); page = alloc_pages_multi_order(..., &order); if (!page) goto out; if (!(order & (1 << 0))) { VM_WARN_ON(order != 1 << HPAGE_PMD_ORDER); /* THP fault */ } else { VM_WARN_ON(order != 1 << 0); /* 4k fallback */ } The page allocator logic has to be altered so that when it fails on any zone with order 9, it has to try again with a order 0 before falling back to the next zone in the zonelist. After that we need to do more measurements and evaluate if adding an opt-in feature for guest mode is worth it, to swap "DN 4k | DN+1 THP" with "DN+1 THP | DN 4k" at every NUMA distance crossing. Link: http://lkml.kernel.org/r/20190503223146.2312-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Rientjes Cc: Zi Yan Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 ++ mm/huge_memory.c | 42 ++++++++++++++++++++++++++---------------- mm/mempolicy.c | 2 +- 3 files changed, 29 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5228c62af416..bac395f1d00a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7e388b8662d..738065f765ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -647,27 +647,37 @@ release: static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; + gfp_t this_node = 0; - /* Always do synchronous compaction */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | __GFP_THISNODE | - (vma_madvised ? 0 : __GFP_NORETRY); +#ifdef CONFIG_NUMA + struct mempolicy *pol; + /* + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not + * specified, to express a general desire to stay on the current + * node for optimistic allocation attempts. If the defrag mode + * and/or madvise hint requires the direct reclaim then we prefer + * to fallback to other node rather than node reclaim because that + * can lead to excessive reclaim even though there is free memory + * on other nodes. We expect that NUMA preferences are specified + * by memory policies. + */ + pol = get_vma_policy(vma, addr); + if (pol->mode != MPOL_BIND) + this_node = __GFP_THISNODE; + mpol_cond_put(pol); +#endif - /* Kick kcompactd and fail quickly */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return gfp_mask | __GFP_KSWAPD_RECLAIM; - - /* Synchronous compaction if madvised, otherwise kick kcompactd */ + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); - - /* Only do synchronous compaction if madvised */ + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM | this_node); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - - return gfp_mask; + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + this_node); + return GFP_TRANSHUGE_LIGHT | this_node; } /* Caller must hold page table lock. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9c9877a43d58..65e0874fce17 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1734,7 +1734,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); -- cgit v1.2.3