From 7e96fb5710a806fd41d2eb6b41bac14a26a094e2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 6 Apr 2020 20:03:43 -0700 Subject: mm/vma: add missing VMA flag readable name for VM_SYNC Patch series "mm/vma: Use all available wrappers when possible", v2. Apart from adding a VMA flag readable name for tracing purposes, this series does some open encoding replacements with available VMA specific wrappers. This skips the VM_HUGETLB check in vma_migratable() as it's already being done with another patch (https://patchwork.kernel.org/patch/11347831/) which is yet to be merged. This patch (of 4): This just adds the missing readable name for VM_SYNC. Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Cc: Vlastimil Babka Cc: Steven Rostedt Cc: Ingo Molnar Cc: Alexander Viro Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Dave Hansen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Mel Gorman Cc: Michael Ellerman Cc: Nick Piggin Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1582520593-30704-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- include/trace/events/mmflags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index a1675d43777e..5fb752034386 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -154,6 +154,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) {VM_ACCOUNT, "account" }, \ {VM_NORESERVE, "noreserve" }, \ {VM_HUGETLB, "hugetlb" }, \ + {VM_SYNC, "sync" }, \ __VM_ARCH_SPECIFIC_1 , \ {VM_WIPEONFORK, "wipeonfork" }, \ {VM_DONTDUMP, "dontdump" }, \ -- cgit v1.3 From 3122e80efc0faf4a2accba7a46c7ed795edbfded Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 6 Apr 2020 20:03:47 -0700 Subject: mm/vma: make vma_is_accessible() available for general use Let's move the vma_is_accessible() helper to include/linux/mm.h, which makes it available for general use. While here, this replaces all remaining open encodings for VMA access checks with vma_is_accessible().
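For reference, the wrapper this series standardizes on is a one-line test of the three access bits. Below is a minimal sketch: the helper body matches the include/linux/mm.h hunk in this patch, while the fault-handler fragment is an illustrative before/after in the same diff style as the patches, not a quote from any one architecture.

static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
}

/* typical call-site conversion in an arch page fault handler */
-	if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+	if (!vma_is_accessible(vma))
		goto bad_area;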
Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Acked-by: Guo Ren Acked-by: Vlastimil Babka Cc: Guo Ren Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Cc: Mel Gorman Cc: Alexander Viro Cc: "Aneesh Kumar K.V" Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Nick Piggin Cc: Paul Mackerras Cc: Will Deacon Link: http://lkml.kernel.org/r/1582520593-30704-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/csky/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/x86/mm/fault.c | 2 +- include/linux/mm.h | 6 ++++++ kernel/sched/fair.c | 2 +- mm/gup.c | 2 +- mm/memory.c | 5 ----- mm/mempolicy.c | 3 +-- mm/mmap.c | 5 ++--- 12 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index d3c61b83e195..a6e8230b6fbf 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -141,7 +141,7 @@ good_area: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } else { - if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + if (!vma_is_accessible(vma)) goto bad_area; } diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index f7afb9897966..0c4a21a685d5 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -125,7 +125,7 @@ good_area: case 1: /* read, present */ goto acc_err; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) goto acc_err; } diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 4a0eafe3d932..fb048ba2b91d 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -142,7 +142,7 @@ good_area: goto bad_area; } } else { - if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + if (!vma_is_accessible(vma)) goto bad_area; } } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d15f0f0ee806..84af6c8eecf7 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -314,7 +314,7 @@ static bool access_error(bool is_write, bool is_exec, return false; } - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + if (unlikely(!vma_is_accessible(vma))) return true; /* * We should ideally do the vma pkey access check here. 
But in the diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 13ee4d20e622..5f23d7907597 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -355,7 +355,7 @@ static inline int access_error(int error_code, struct vm_area_struct *vma) return 1; /* read, not present: */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + if (unlikely(!vma_is_accessible(vma))) return 1; return 0; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 859519f5b342..a51df516b87b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1222,7 +1222,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 1; /* read, not present: */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + if (unlikely(!vma_is_accessible(vma))) return 1; return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7dd5c4ccbf85..be49e371e4b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -629,6 +629,12 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC); +} + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7fb20adabeb..1ea3dddafe69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2799,7 +2799,7 @@ static void task_numa_work(struct callback_head *work) * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) continue; do { diff --git a/mm/gup.c b/mm/gup.c index da3e03185144..4d505c994623 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1416,7 +1416,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. 
*/ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; /* diff --git a/mm/memory.c b/mm/memory.c index 586271f3efc6..d2a353c345ad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3964,11 +3964,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) return VM_FAULT_FALLBACK; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); -} - static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5fb427aed612..b36926ba02e2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -678,8 +678,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && - (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; diff --git a/mm/mmap.c b/mm/mmap.c index 94ae18398c59..aa09429d5888 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2358,8 +2358,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr && - (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2440,7 +2439,7 @@ int expand_downwards(struct vm_area_struct *vma, prev = vma->vm_prev; /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + vma_is_accessible(prev)) { if (address - prev->vm_end < stack_guard_gap) return -ENOMEM; } -- cgit v1.3 From 03911132aafd6727e59408e497c049402a5a11fa Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 6 Apr 2020 20:03:51 -0700 Subject: mm/vma: replace all remaining open encodings with is_vm_hugetlb_page() This replaces all remaining open encodings with is_vm_hugetlb_page(). 
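For context, the wrapper being substituted is itself a thin flag test. A sketch along the lines of include/linux/hugetlb_inline.h (reproduced from memory, so treat it as illustrative rather than authoritative):

#ifdef CONFIG_HUGETLB_PAGE
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return !!(vma->vm_flags & VM_HUGETLB);
}
#else
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return false;
}
#endif

Beyond readability, the wrapper lets !CONFIG_HUGETLB_PAGE builds constant-fold the check away, which the open-coded vm_flags test cannot.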
Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Alexander Viro Cc: Will Deacon Cc: "Aneesh Kumar K.V" Cc: Nick Piggin Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: Dave Hansen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Mel Gorman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Rich Felker Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1582520593-30704-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/e500_mmu_host.c | 2 +- fs/binfmt_elf.c | 3 ++- include/asm-generic/tlb.h | 3 ++- kernel/events/core.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 425d13806645..df9989cf7ba3 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -422,7 +422,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, break; } } else if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_HUGETLB)) { + is_vm_hugetlb_page(vma)) { unsigned long psize = vma_kernel_pagesize(vma); tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f4713ea76e82..1eb63867e266 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1317,7 +1318,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Hugetlb memory check */ - if (vma->vm_flags & VM_HUGETLB) { + if (is_vm_hugetlb_page(vma)) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f391f6b500b4..3f1649a8cf55 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -398,7 +399,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ - tlb->vma_huge = !!(vma->vm_flags & VM_HUGETLB); + tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 81e6d80cb219..55e44417f66d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -7973,7 +7974,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - if (vma->vm_flags & VM_HUGETLB) + if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { -- cgit v1.3 From 29fd1897070125ab49634524a20f146fb4240a51 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 6 Apr 2020 20:04:06 -0700 Subject: mm: make it clear that gfp reclaim modifiers are valid only for sleepable allocations While it might be really clear to MM developers that gfp reclaim modifiers are applicable only to sleepable allocations (those with __GFP_DIRECT_RECLAIM) it seems that actual users of the API are not always sure. 
Make it explicit that they are not applicable for GFP_NOWAIT or GFP_ATOMIC allocations which are the most commonly used non-sleepable allocation masks. Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Reviewed-by: Joel Fernandes (Google) Acked-by: Paul E. McKenney Acked-by: David Rientjes Cc: Neil Brown Link: http://lkml.kernel.org/r/20200403083543.11552-3-mhocko@kernel.org Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index be2754841369..4aba4c86c626 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -124,6 +124,8 @@ struct vm_area_struct; * * Reclaim modifiers * ~~~~~~~~~~~~~~~~~ + * Please note that all the following flags are only applicable to sleepable + * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them). * * %__GFP_IO can start physical IO. * -- cgit v1.3 From dcdf11ee144133328664d90836e712d840d047d9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 6 Apr 2020 20:04:25 -0700 Subject: mm, shmem: add vmstat for hugepage fallback The existing thp_fault_fallback indicates when thp attempts to allocate a hugepage but fails, or if the hugepage cannot be charged to the mem cgroup hierarchy. Extend this to shmem as well. Adds a new thp_file_fallback to complement thp_file_alloc that gets incremented when a hugepage is attempted to be allocated but fails, or if it cannot be charged to the mem cgroup hierarchy. Additionally, remove the check for CONFIG_TRANSPARENT_HUGE_PAGECACHE from shmem_alloc_hugepage() since it is only called with this configuration option. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Reviewed-by: Yang Shi Acked-by: Kirill A. Shutemov Cc: Mike Rapoport Cc: Jeremy Cline Cc: Andrea Arcangeli Cc: Mike Kravetz Cc: Michal Hocko Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/alpine.DEB.2.21.2003061421240.7412@chino.kir.corp.google.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/transhuge.rst | 4 ++++ include/linux/vm_event_item.h | 2 ++ mm/shmem.c | 10 ++++++---- mm/vmstat.c | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index bd5714547cee..f79ebbcd6725 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -319,6 +319,10 @@ thp_file_alloc is incremented every time a file huge page is successfully allocated. +thp_file_fallback + is incremented if a file huge page is attempted to be allocated + but fails and instead falls back to using small pages. + thp_file_mapped is incremented every time a file huge page is mapped into user address space. 
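The new counter shows up in /proc/vmstat next to thp_file_alloc (see the mm/vmstat.c hunk below). As a quick way to watch for fallbacks, a minimal userspace sketch; the program and its filtering are hypothetical conveniences, not part of the patch:

#include <stdio.h>
#include <string.h>

/* dump the thp_file_* counters from /proc/vmstat */
int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "thp_file_", 9))
			fputs(line, stdout);
	fclose(f);
	return 0;
}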
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 47a3441cf4c4..41a4c7568748 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -76,6 +76,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, THP_FILE_ALLOC, + THP_FILE_FALLBACK, THP_FILE_MAPPED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, @@ -115,6 +116,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define THP_FILE_ALLOC ({ BUILD_BUG(); 0; }) +#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; }) #define THP_FILE_MAPPED ({ BUILD_BUG(); 0; }) #endif diff --git a/mm/shmem.c b/mm/shmem.c index f47347cb30f6..8160d0762bf5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1472,9 +1472,6 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, pgoff_t hindex; struct page *page; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return NULL; - hindex = round_down(index, HPAGE_PMD_NR); if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, XA_PRESENT)) @@ -1486,6 +1483,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); + else + count_vm_event(THP_FILE_FALLBACK); return page; } @@ -1871,8 +1870,11 @@ alloc_nohuge: error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); - if (error) + if (error) { + if (PageTransHuge(page)) + count_vm_event(THP_FILE_FALLBACK); goto unacct; + } error = shmem_add_to_page_cache(page, mapping, hindex, NULL, gfp & GFP_RECLAIM_MASK); if (error) { diff --git a/mm/vmstat.c b/mm/vmstat.c index c9c0d71f917f..ebc1dcaa0539 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1259,6 +1259,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc", "thp_collapse_alloc_failed", "thp_file_alloc", + "thp_file_fallback", "thp_file_mapped", "thp_split_page", "thp_split_page_failed", -- cgit v1.3 From 85b9f46e8ea451633ccd60a7d8cacbfff9f34047 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 6 Apr 2020 20:04:28 -0700 Subject: mm, thp: track fallbacks due to failed memcg charges separately The thp_fault_fallback and thp_file_fallback vmstats are incremented if either the hugepage allocation fails through the page allocator or the hugepage charge fails through mem cgroup. This patch leaves this field untouched but adds two new fields, thp_{fault,file}_fallback_charge, which is incremented only when the mem cgroup charge fails. This distinguishes between attempted hugepage allocations that fail due to fragmentation (or low memory conditions) and those that fail due to mem cgroup limits. That can be used to determine the impact of fragmentation on the system by excluding faults that failed due to memcg usage. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Reviewed-by: Yang Shi Acked-by: Kirill A. 
Shutemov Cc: Mike Rapoport Cc: Jeremy Cline Cc: Andrea Arcangeli Cc: Mike Kravetz Cc: Michal Hocko Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/alpine.DEB.2.21.2003061422070.7412@chino.kir.corp.google.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/transhuge.rst | 10 ++++++++++ include/linux/vm_event_item.h | 3 +++ mm/huge_memory.c | 2 ++ mm/shmem.c | 4 +++- mm/vmstat.c | 2 ++ 5 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index f79ebbcd6725..2f31de8f7c74 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -310,6 +310,11 @@ thp_fault_fallback is incremented if a page fault fails to allocate a huge page and instead falls back to using small pages. +thp_fault_fallback_charge + is incremented if a page fault fails to charge a huge page and + instead falls back to using small pages even though the + allocation was successful. + thp_collapse_alloc_failed is incremented if khugepaged found a range of pages that should be collapsed into one huge page but failed @@ -323,6 +328,11 @@ thp_file_fallback is incremented if a file huge page is attempted to be allocated but fails and instead falls back to using small pages. +thp_file_fallback_charge + is incremented if a file huge page cannot be charged and instead + falls back to using small pages even though the allocation was + successful. + thp_file_mapped is incremented every time a file huge page is mapped into user address space. diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 41a4c7568748..ffef0f279747 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -73,10 +73,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, THP_FAULT_FALLBACK, + THP_FAULT_FALLBACK_CHARGE, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, THP_FILE_ALLOC, THP_FILE_FALLBACK, + THP_FILE_FALLBACK_CHARGE, THP_FILE_MAPPED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, @@ -117,6 +119,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define THP_FILE_ALLOC ({ BUILD_BUG(); 0; }) #define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; }) +#define THP_FILE_FALLBACK_CHARGE ({ BUILD_BUG(); 0; }) #define THP_FILE_MAPPED ({ BUILD_BUG(); 0; }) #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0f9389f9d1f8..0080e8df18ef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } @@ -1446,6 +1447,7 @@ alloc: put_page(page); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); goto out; } diff --git a/mm/shmem.c b/mm/shmem.c index 8160d0762bf5..b48ac3806f8f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1871,8 +1871,10 @@ alloc_nohuge: error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); if (error) { - if (PageTransHuge(page)) + if (PageTransHuge(page)) { count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } goto unacct; } error = shmem_add_to_page_cache(page, mapping, hindex, diff --git a/mm/vmstat.c b/mm/vmstat.c index 
ebc1dcaa0539..96d21a792b57 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1256,10 +1256,12 @@ const char * const vmstat_text[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", "thp_fault_fallback", + "thp_fault_fallback_charge", "thp_collapse_alloc", "thp_collapse_alloc_failed", "thp_file_alloc", "thp_file_fallback", + "thp_file_fallback_charge", "thp_file_mapped", "thp_split_page", "thp_split_page_failed", -- cgit v1.3 From a0650604a707b1926cbc32feb2f006ebb74ef47b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Apr 2020 20:04:31 -0700 Subject: include/linux/pagemap.h: optimise find_subpage for !THP If THP is disabled, find_subpage() can become a no-op by using hpage_nr_pages() instead of compound_nr(). hpage_nr_pages() embeds a check for PageTail, so we can drop the check here. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Acked-by: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Pankaj Gupta Link: http://lkml.kernel.org/r/20200318140253.6141-5-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f56282491a48..a8f7bd8ea1c6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -341,9 +341,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) if (PageHuge(head)) return head; - VM_BUG_ON_PAGE(PageTail(head), head); - - return head + (index & (compound_nr(head) - 1)); + return head + (index & (hpage_nr_pages(head) - 1)); } struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); -- cgit v1.3 From 396bcc5299c281e9cf1737ad0efcd97be9f83845 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Apr 2020 20:04:35 -0700 Subject: mm: remove CONFIG_TRANSPARENT_HUGE_PAGECACHE Commit e496cf3d7821 ("thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE") notes that it should be reverted when the PowerPC problem was fixed. The commit fixing the PowerPC problem (953c66c2b22a) did not revert the commit; instead setting CONFIG_TRANSPARENT_HUGE_PAGECACHE to the same as CONFIG_TRANSPARENT_HUGEPAGE. Checking with Kirill and Aneesh, this was an oversight, so remove the Kconfig symbol and undo the work of commit e496cf3d7821. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Acked-by: Kirill A. 
Shutemov Cc: Aneesh Kumar K.V Cc: Christoph Hellwig Cc: Pankaj Gupta Link: http://lkml.kernel.org/r/20200318140253.6141-6-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 10 +--------- mm/Kconfig | 6 +----- mm/huge_memory.c | 2 +- mm/khugepaged.c | 12 ++++-------- mm/memory.c | 5 ++--- mm/rmap.c | 2 +- mm/shmem.c | 34 +++++++++++++++++----------------- 7 files changed, 27 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d56fefef8905..7a35a6901221 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -78,6 +78,7 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); +extern bool shmem_huge_enabled(struct vm_area_struct *vma); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -114,15 +115,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE -extern bool shmem_huge_enabled(struct vm_area_struct *vma); -#else -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - return false; -} -#endif - #ifdef CONFIG_SHMEM extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, diff --git a/mm/Kconfig b/mm/Kconfig index ab80933be65f..211a70e8d5cf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -420,10 +420,6 @@ config THP_SWAP For selection by architectures with reasonable THP sizes. -config TRANSPARENT_HUGE_PAGECACHE - def_bool y - depends on TRANSPARENT_HUGEPAGE - # # UP and nommu archs use km based percpu allocator # @@ -714,7 +710,7 @@ config GUP_GET_PTE_LOW_HIGH config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM + depends on TRANSPARENT_HUGEPAGE && SHMEM help Allow khugepaged to put read-only file-backed pages in THP. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0080e8df18ef..c1e7c71db1e6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -326,7 +326,7 @@ static struct attribute *hugepage_attr[] = { &defrag_attr.attr, &use_zero_page_attr.attr, &hpage_pmd_size_attr.attr, -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif #ifdef CONFIG_DEBUG_VM diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c659c68728bc..b1d9a8e189b8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -414,8 +414,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && (vm_flags & VM_DENYWRITE))) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } @@ -1258,7 +1256,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) } } -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. 
@@ -1973,6 +1971,8 @@ skip: if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) + goto skip; while (khugepaged_scan.address < hend) { int ret; @@ -1984,14 +1984,10 @@ skip: khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { - struct file *file; + struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - if (shmem_file(vma->vm_file) - && !shmem_huge_enabled(vma)) - goto skip; - file = get_file(vma->vm_file); up_read(&mm->mmap_sem); ret = 1; khugepaged_scan_file(mm, file, pgoff, hpage); diff --git a/mm/memory.c b/mm/memory.c index d2a353c345ad..d527e0ec29c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3373,7 +3373,7 @@ map_pte: return 0; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3475,8 +3475,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, pte_t entry; vm_fault_t ret; - if (pmd_none(*vmf->pmd) && PageTransCompound(page) && - IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); diff --git a/mm/rmap.c b/mm/rmap.c index 68fe0472c803..374a9bfdbffa 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -933,7 +933,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw.pmd; pmd_t entry; diff --git a/mm/shmem.c b/mm/shmem.c index b48ac3806f8f..2c255f383608 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -410,7 +410,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly; @@ -580,7 +580,7 @@ static long shmem_unused_huge_count(struct super_block *sb, struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } -#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY @@ -589,11 +589,11 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, { return 0; } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) { - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && shmem_huge != SHMEM_HUGE_DENY) return true; @@ -1059,7 +1059,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) * Part of the huge page can be beyond i_size: subject * to shrink under memory pressure. 
*/ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to @@ -1510,7 +1510,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, int nr; int err = -ENOSPC; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; @@ -2093,7 +2093,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, get_area = current->mm->get_unmapped_area; addr = get_area(file, uaddr, len, pgoff, flags); - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; @@ -2232,7 +2232,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); @@ -3459,7 +3459,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && - !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; @@ -3605,7 +3605,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); @@ -3850,7 +3850,7 @@ static const struct super_operations shmem_ops = { .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif @@ -3912,7 +3912,7 @@ int __init shmem_init(void) goto out1; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else @@ -3928,7 +3928,7 @@ out2: return error; } -#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3980,9 +3980,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -4017,7 +4017,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ @@ -4186,7 
+4186,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); -- cgit v1.3 From 9de4f22a60f731943f050f4448bf2933ed3fa70b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 6 Apr 2020 20:04:41 -0700 Subject: mm: code cleanup for MADV_FREE Some comments for MADV_FREE are revised and added to help people understand the MADV_FREE code, especially the page flag, PG_swapbacked. This makes page_is_file_cache() inconsistent with its comments. So the function is renamed to page_is_file_lru() to make them consistent again. All these are put in one patch as one logical change. Suggested-by: David Hildenbrand Suggested-by: Johannes Weiner Suggested-by: David Rientjes Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Pankaj Gupta Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Mel Gorman Cc: Minchan Kim Cc: Hugh Dickins Cc: Rik van Riel Link: http://lkml.kernel.org/r/20200317100342.2730705-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 15 ++++++++------- include/linux/page-flags.h | 5 +++++ include/trace/events/vmscan.h | 2 +- mm/compaction.c | 2 +- mm/gup.c | 2 +- mm/khugepaged.c | 4 ++-- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 16 ++++++++-------- mm/mprotect.c | 2 +- mm/swap.c | 16 ++++++++-------- mm/vmscan.c | 12 ++++++------ 13 files changed, 44 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6f2fef7b0784..219bef41d87c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,19 +6,20 @@ #include /** - * page_is_file_cache - should the page be on a file LRU or anon LRU? + * page_is_file_lru - should the page be on a file LRU or anon LRU? * @page: the page to test * - * Returns 1 if @page is page cache page backed by a regular filesystem, - * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed. - * Used by functions that manipulate the LRU lists, to sort a page - * onto the right LRU list. + * Returns 1 if @page is a regular filesystem backed page cache page or a lazily + * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal + * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by + * functions that manipulate the LRU lists, to sort a page onto the right LRU + * list. * * We would like to get this info without a page flag, but the state * needs to survive until the page is last deleted from the LRU, which * could be as far down as __page_cache_release.
*/ -static inline int page_is_file_cache(struct page *page) +static inline int page_is_file_lru(struct page *page) { return !PageSwapBacked(page); } @@ -75,7 +76,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, */ static inline enum lru_list page_lru_base_type(struct page *page) { - if (page_is_file_cache(page)) + if (page_is_file_lru(page)) return LRU_INACTIVE_FILE; return LRU_INACTIVE_ANON; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 77de28bfefb0..acf7988fd640 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -63,6 +63,11 @@ * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * + * PG_swapbacked is set when a page uses swap as a backing storage. This are + * usually PageAnon or shmem pages but please note that even anonymous pages + * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as + * a result of MADV_FREE). + * * PG_uptodate tells whether the page's contents is valid. When a read * completes, the page becomes uptodate, unless a disk I/O error happened. * diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index a5ab2973e8dc..74bb594ccb25 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -323,7 +323,7 @@ TRACE_EVENT(mm_vmscan_writepage, TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->reclaim_flags = trace_reclaim_flags( - page_is_file_cache(page)); + page_is_file_lru(page)); ), TP_printk("page=%p pfn=%lu flags=%s", diff --git a/mm/compaction.c b/mm/compaction.c index df3da2f76fdc..46af63eb8212 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -989,7 +989,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_cache(page), + NR_ISOLATED_ANON + page_is_file_lru(page), hpage_nr_pages(page)); isolate_success: diff --git a/mm/gup.c b/mm/gup.c index 96af7e08db4b..b185377c38b7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1677,7 +1677,7 @@ check_again: list_add_tail(&head->lru, &cma_page_list); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + - page_is_file_cache(head), + page_is_file_lru(head), hpage_nr_pages(head)); } } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b1d9a8e189b8..3afc1e2d7a55 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,7 +511,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); unlock_page(page); putback_lru_page(page); } @@ -611,7 +611,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_cache(page)); + NR_ISOLATED_ANON + page_is_file_lru(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1c961cd26c0b..a96364be8ab4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1810,7 +1810,7 @@ static int __soft_offline_page(struct page *page, int flags) */ if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, 
MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 19389cdc16a5..005eab3411e5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1317,7 +1317,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); } else { pr_warn("failed to isolate pfn %lx\n", pfn); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b36926ba02e2..037e5f548118 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1022,7 +1022,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist, if (!isolate_lru_page(head)) { list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), + NR_ISOLATED_ANON + page_is_file_lru(head), hpage_nr_pages(head)); } else if (flags & MPOL_MF_STRICT) { /* diff --git a/mm/migrate.c b/mm/migrate.c index 1a205503be3f..c1412e04975e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -hpage_nr_pages(page)); putback_lru_page(page); } } @@ -1219,7 +1219,7 @@ out: */ if (likely(!__PageMovable(page))) mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -hpage_nr_pages(page)); } /* @@ -1592,7 +1592,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), + NR_ISOLATED_ANON + page_is_file_lru(head), hpage_nr_pages(head)); } out_putpage: @@ -1955,7 +1955,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; } - page_lru = page_is_file_cache(page); + page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, hpage_nr_pages(page)); @@ -1991,7 +1991,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1 && page_is_file_cache(page) && + if (page_mapcount(page) != 1 && page_is_file_lru(page) && (vma->vm_flags & VM_EXEC)) goto out; @@ -1999,7 +1999,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Also do not migrate dirty pages as not all filesystems can move * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 
*/ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) goto out; isolated = numamigrate_isolate_page(pgdat, page); @@ -2014,7 +2014,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (!list_empty(&migratepages)) { list_del(&page->lru); dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); putback_lru_page(page); } isolated = 0; @@ -2044,7 +2044,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - int page_lru = page_is_file_cache(page); + int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, diff --git a/mm/mprotect.c b/mm/mprotect.c index 311c0dadf71c..0fee14b39416 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -98,7 +98,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) continue; /* diff --git a/mm/swap.c b/mm/swap.c index a4af8c999963..18505990c3b1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -276,7 +276,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); + int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru); @@ -394,7 +394,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_cache(page)) + if (page_is_file_lru(page)) workingset_activation(page); } if (page_is_idle(page)) @@ -515,7 +515,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_cache(page); + file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -548,7 +548,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); + int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); @@ -573,9 +573,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, ClearPageActive(page); ClearPageReferenced(page); /* - * lazyfree pages are clean anonymous pages. They have - * SwapBacked flag cleared to distinguish normal anonymous - * pages + * Lazyfree pages are clean anonymous pages. 
They have + * PG_swapbacked flag cleared, to distinguish them from normal + * anonymous pages */ ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); @@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_cache(page), + update_page_reclaim_stat(lruvec, page_is_file_lru(page), PageActive(page)); if (was_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e8e690d2813..b06868fc4926 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -919,7 +919,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * exceptional entries and shadow exceptional entries in the * same address_space. */ - if (reclaimed && page_is_file_cache(page) && + if (reclaimed && page_is_file_lru(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); @@ -1043,7 +1043,7 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page) || + if (!page_is_file_lru(page) || (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; @@ -1315,7 +1315,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * the rest of the LRU for clean pages and see * the same dirty pages again (PageReclaim). */ - if (page_is_file_cache(page) && + if (page_is_file_lru(page) && (!current_is_kswapd() || !PageReclaim(page) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* @@ -1459,7 +1459,7 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { - int type = page_is_file_cache(page); + int type = page_is_file_lru(page); SetPageActive(page); stat->nr_activate[type] += nr_pages; count_memcg_page_event(page, PGACTIVATE); @@ -1497,7 +1497,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page) && + if (page_is_file_lru(page) && !PageDirty(page) && !__PageMovable(page) && !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); @@ -2053,7 +2053,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { list_add(&page->lru, &l_active); continue; } -- cgit v1.3 From a2129f24798a993abde9b4bf8b3713b52d56c121 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Apr 2020 20:04:45 -0700 Subject: mm: adjust shuffle code to allow for future coalescing Patch series "mm / virtio: Provide support for free page reporting", v17. This series provides an asynchronous means of reporting free guest pages to a hypervisor so that the memory associated with those pages can be dropped and reused by other processes and/or guests on the host. Using this it is possible to avoid unnecessary I/O to disk and greatly improve performance in the case of memory overcommit on the host. When enabled we will be performing a scan of free memory every 2 seconds while pages of sufficiently high order are being freed. 
In each pass at least one sixteenth of each free list will be reported. By doing this we avoid racing against other threads that may be causing a high amount of memory churn. The lowest page order currently scanned when reporting pages is pageblock_order so that this feature will not interfere with the use of Transparent Huge Pages in the case of virtualization. Currently this is only in use by virtio-balloon, however there is the hope that at some point in the future other hypervisors might be able to make use of it. In the virtio-balloon/QEMU implementation the hypervisor is currently using MADV_DONTNEED to indicate to the host kernel that the page is currently free. It will be zeroed and faulted back into the guest the next time the page is accessed. To track if a page is reported or not, the Uptodate flag was repurposed and used as a Reported flag for Buddy pages. We walk through the free list isolating pages and adding them to the scatterlist until we either encounter the end of the list or have processed at least one sixteenth of the pages that were listed in nr_free prior to us starting. If we fill the scatterlist before we reach the end of the list we rotate the list so that the first unreported page we encounter is moved to the head of the list as that is where we will resume after we have freed the reported pages back into the tail of the list. Below are the results from various benchmarks. I primarily focused on two tests. The first is the will-it-scale/page_fault2 test, and the other is a modified version of will-it-scale/page_fault1 that was enabled to use THP. I did this as it allows for better visibility into different parts of the memory subsystem. The guest is running with 32G for RAM on one node of an E5-2630 v3. The host has had some features such as CPU turbo disabled in the BIOS.

Test                    page_fault1 (THP)     page_fault2
Name             tasks  Process Iter   STDEV  Process Iter  STDEV
Baseline             1  1012402.50     0.14%   361855.25    0.81%
                    16  8827457.25     0.09%  3282347.00    0.34%
Patches Applied      1  1007897.00     0.23%   361887.00    0.26%
                    16  8784741.75     0.39%  3240669.25    0.48%
Patches Enabled      1  1010227.50     0.39%   359749.25    0.56%
                    16  8756219.00     0.24%  3226608.75    0.97%
Patches Enabled      1  1050982.00     4.26%   357966.25    0.14%
 page shuffle       16  8672601.25     0.49%  3223177.75    0.40%
Patches enabled      1  1003238.00     0.22%   360211.00    0.22%
 shuffle w/ RFC     16  8767010.50     0.32%  3199874.00    0.71%

The results above are for a baseline with a linux-next-20191219 kernel, that kernel with this patch set applied but page reporting disabled in virtio-balloon, the patches applied and page reporting fully enabled, the patches enabled with page shuffling enabled, and the patches applied with page shuffling enabled and an RFC patch that makes use of MADV_FREE in QEMU. These results include the deviation seen between the average value reported here versus the high and/or low value. I observed that during the test memory usage for the first three tests never dropped whereas with the patches fully enabled the VM would drop to using only a few GB of the host's memory when switching from memhog to page fault tests. Any of the overhead visible with this patch set enabled seems due to page faults caused by accessing the reported pages and the host zeroing the page before giving it back to the guest. This overhead is much more visible when using THP than with standard 4K pages. In addition page shuffling seemed to increase the amount of faults generated due to an increase in memory churn.
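As background for the MADV_DONTNEED/MADV_FREE comparison in these results, here is a minimal sketch of how a VMM might hand a reported guest range back to the host; drop_guest_range() is a hypothetical helper, and the real virtio-balloon/QEMU path is considerably more involved:

#include <sys/mman.h>

/*
 * Return a reported guest page range to the host kernel.  With
 * MADV_DONTNEED the next guest access faults in a freshly zeroed page;
 * with MADV_FREE reclaim is lazy, so a guest write that lands before
 * the host reclaims the range reuses the old pages without the
 * fault-and-zero cycle.
 */
static int drop_guest_range(void *addr, size_t len, int lazy)
{
	return madvise(addr, len, lazy ? MADV_FREE : MADV_DONTNEED);
}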
The overhead is reduced when using MADV_FREE as we can avoid the extra zeroing of the pages when they are reintroduced to the host, as can be seen when the RFC is applied with shuffling enabled. The overall guest size is kept fairly small to only a few GB while the test is running. If the host memory were oversubscribed this patch set should result in a performance improvement as swapping memory in the host can be avoided. A brief history on the background of free page reporting can be found at: https://lore.kernel.org/lkml/29f43d5796feed0dec8e8bb98b187d9dac03b900.camel@linux.intel.com/ This patch (of 9): Move the head/tail adding logic out of the shuffle code and into the __free_one_page function since ultimately that is where it is really needed anyway. By doing this we should be able to reduce the overhead and can consolidate all of the list addition bits in one spot. Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Reviewed-by: Dan Williams Acked-by: Mel Gorman Acked-by: David Hildenbrand Cc: Yang Zhang Cc: Pankaj Gupta Cc: Konrad Rzeszutek Wilk Cc: Nitesh Narayan Lal Cc: Rik van Riel Cc: Matthew Wilcox Cc: Luiz Capitulino Cc: Dave Hansen Cc: Wei Wang Cc: Andrea Arcangeli Cc: Paolo Bonzini Cc: Michal Hocko Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Michael S. Tsirkin Cc: wei qi Link: http://lkml.kernel.org/r/20200211224602.29318.84523.stgit@localhost.localdomain Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 --------- mm/page_alloc.c | 71 +++++++++++++++++++++++++++++--------------------- mm/shuffle.c | 12 ++++----- mm/shuffle.h | 6 +++++ 4 files changed, 54 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e84d448988b6..c023d7968b14 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -116,18 +116,6 @@ static inline void add_to_free_area_tail(struct page *page, struct free_area *ar area->nr_free++; } -#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR -/* Used to preserve page allocation order entropy */ -void add_to_free_area_random(struct page *page, struct free_area *area, - int migratetype); -#else -static inline void add_to_free_area_random(struct page *page, - struct free_area *area, int migratetype) -{ - add_to_free_area(page, area, migratetype); -} -#endif - /* Used for pages which are on another list */ static inline void move_to_free_area(struct page *page, struct free_area *area, int migratetype) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5f76da8cd4e..f2b8cb8f995f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -864,6 +864,36 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ +/* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon.
In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ +static inline bool +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, + struct page *page, unsigned int order) +{ + struct page *higher_page, *higher_buddy; + unsigned long combined_pfn; + + if (order >= MAX_ORDER - 2) + return false; + + if (!pfn_valid_within(buddy_pfn)) + return false; + + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); + + return pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1); +} + /* * Freeing function for a buddy system allocator. * @@ -893,11 +923,13 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long combined_pfn; + struct capture_control *capc = task_capc(zone); unsigned long uninitialized_var(buddy_pfn); - struct page *buddy; + unsigned long combined_pfn; + struct free_area *area; unsigned int max_order; - struct capture_control *capc = task_capc(zone); + struct page *buddy; + bool to_tail; max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); @@ -966,35 +998,16 @@ continue_merging: done_merging: set_page_order(page, order); - /* - * If this is not the largest possible page, check if the buddy - * of the next-highest order is free. If it is, it's possible - * that pages are being freed that will coalesce soon. In case, - * that is happening, add the free page to the tail of the list - * so it's less likely to be used soon and more likely to be merged - * as a higher order page - */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) - && !is_shuffle_order(order)) { - struct page *higher_page, *higher_buddy; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); - if (pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1)) { - add_to_free_area_tail(page, &zone->free_area[order], - migratetype); - return; - } - } - + area = &zone->free_area[order]; if (is_shuffle_order(order)) - add_to_free_area_random(page, &zone->free_area[order], - migratetype); + to_tail = shuffle_pick_tail(); else - add_to_free_area(page, &zone->free_area[order], migratetype); + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + if (to_tail) + add_to_free_area_tail(page, area, migratetype); + else + add_to_free_area(page, area, migratetype); } /* diff --git a/mm/shuffle.c b/mm/shuffle.c index c716059cbd3c..44406d9977c7 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -183,11 +183,11 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat) shuffle_zone(z); } -void add_to_free_area_random(struct page *page, struct free_area *area, - int migratetype) +bool shuffle_pick_tail(void) { static u64 rand; static u8 rand_bits; + bool ret; /* * The lack of locking is deliberate. 
If 2 threads race to @@ -198,10 +198,10 @@ void add_to_free_area_random(struct page *page, struct free_area *area, rand = get_random_u64(); } - if (rand & 1) - add_to_free_area(page, area, migratetype); - else - add_to_free_area_tail(page, area, migratetype); + ret = rand & 1; + rand_bits--; rand >>= 1; + + return ret; } diff --git a/mm/shuffle.h b/mm/shuffle.h index 777a257a0d2f..4d79f03b6658 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -22,6 +22,7 @@ enum mm_shuffle_ctl { DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); extern void __shuffle_free_memory(pg_data_t *pgdat); +extern bool shuffle_pick_tail(void); static inline void shuffle_free_memory(pg_data_t *pgdat) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) @@ -44,6 +45,11 @@ static inline bool is_shuffle_order(int order) return order >= SHUFFLE_ORDER; } #else +static inline bool shuffle_pick_tail(void) +{ + return false; +} + static inline void shuffle_free_memory(pg_data_t *pgdat) { } -- cgit v1.3 From 6ab0136310961ebf4b5ecb565f0bf52c233dc093 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Apr 2020 20:04:49 -0700 Subject: mm: use zone and order instead of free area in free_list manipulators In order to enable the use of the zone from the list manipulator functions I will need access to the zone pointer. As it turns out most of the accessors were always just being directly passed &zone->free_area[order] anyway so it would make sense to just fold that into the function itself and pass the zone and order as arguments instead of the free area. In order to be able to reference the zone we need to move the declaration of the functions down so that we have the zone defined before we define the list manipulation functions. Since the functions are only used in the file mm/page_alloc.c we can just move them there to reduce noise in the header. Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Reviewed-by: Dan Williams Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Konrad Rzeszutek Wilk Cc: Luiz Capitulino Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Michal Hocko Cc: Nitesh Narayan Lal Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Rik van Riel Cc: Vlastimil Babka Cc: Wei Wang Cc: Yang Zhang Cc: wei qi Link: http://lkml.kernel.org/r/20200211224613.29318.43080.stgit@localhost.localdomain Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 32 ------------------------ mm/page_alloc.c | 67 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c023d7968b14..42b77d3b68e8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -100,29 +100,6 @@ struct free_area { unsigned long nr_free; }; -/* Used for pages not on another list */ -static inline void add_to_free_area(struct page *page, struct free_area *area, - int migratetype) -{ - list_add(&page->lru, &area->free_list[migratetype]); - area->nr_free++; -} - -/* Used for pages not on another list */ -static inline void add_to_free_area_tail(struct page *page, struct free_area *area, - int migratetype) -{ - list_add_tail(&page->lru, &area->free_list[migratetype]); - area->nr_free++; -} - -/* Used for pages which are on another list */ -static inline void move_to_free_area(struct page *page, struct free_area *area, - int migratetype) -{ - list_move(&page->lru, &area->free_list[migratetype]); -} - static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -130,15 +107,6 @@ static inline struct page *get_page_from_free_area(struct free_area *area, struct page, lru); } -static inline void del_page_from_free_area(struct page *page, - struct free_area *area) -{ - list_del(&page->lru); - __ClearPageBuddy(page); - set_page_private(page, 0); - area->nr_free--; -} - static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2b8cb8f995f..14bdf3608a6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -864,6 +864,44 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ +/* Used for pages not on another list */ +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages which are on another list */ +static inline void move_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_move(&page->lru, &area->free_list[migratetype]); +} + +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order) +{ + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + zone->free_area[order].nr_free--; +} + /* * If this is not the largest possible page, check if the buddy * of the next-highest order is free. 
If it is, it's possible @@ -926,7 +964,6 @@ static inline void __free_one_page(struct page *page, struct capture_control *capc = task_capc(zone); unsigned long uninitialized_var(buddy_pfn); unsigned long combined_pfn; - struct free_area *area; unsigned int max_order; struct page *buddy; bool to_tail; @@ -964,7 +1001,7 @@ continue_merging: if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); else - del_page_from_free_area(buddy, &zone->free_area[order]); + del_page_from_free_list(buddy, zone, order); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -998,16 +1035,15 @@ continue_merging: done_merging: set_page_order(page, order); - area = &zone->free_area[order]; if (is_shuffle_order(order)) to_tail = shuffle_pick_tail(); else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); if (to_tail) - add_to_free_area_tail(page, area, migratetype); + add_to_free_list_tail(page, zone, order, migratetype); else - add_to_free_area(page, area, migratetype); + add_to_free_list(page, zone, order, migratetype); } /* @@ -2021,13 +2057,11 @@ void __init init_cma_reserved_pageblock(struct page *page) * -- nyc */ static inline void expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area, - int migratetype) + int low, int high, int migratetype) { unsigned long size = 1 << high; while (high > low) { - area--; high--; size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); @@ -2041,7 +2075,7 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - add_to_free_area(&page[size], area, migratetype); + add_to_free_list(&page[size], zone, high, migratetype); set_page_order(&page[size], high); } } @@ -2199,8 +2233,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_area(page, area); - expand(zone, page, order, current_order, area, migratetype); + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); return page; } @@ -2274,7 +2308,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = page_order(page); - move_to_free_area(page, &zone->free_area[order], migratetype); + move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2390,7 +2424,6 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, int start_type, bool whole_block) { unsigned int current_order = page_order(page); - struct free_area *area; int free_pages, movable_pages, alike_pages; int old_block_type; @@ -2461,8 +2494,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, return; single_page: - area = &zone->free_area[current_order]; - move_to_free_area(page, area, start_type); + move_to_free_list(page, zone, current_order, start_type); } /* @@ -3133,7 +3165,6 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { - struct free_area *area = &page_zone(page)->free_area[order]; unsigned long watermark; struct zone *zone; int mt; @@ -3159,7 +3190,7 @@ int __isolate_free_page(struct page *page, unsigned int order) /* Remove page from free list */ - del_page_from_free_area(page, area); + del_page_from_free_list(page, zone, order); /* * Set the 
pageblock if the isolated page is at least half of a @@ -8726,7 +8757,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); offlined_pages += 1 << order; - del_page_from_free_area(page, &zone->free_area[order]); + del_page_from_free_list(page, zone, order); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); -- cgit v1.3
From 36e66c554b5c6a9d17a229faca7a61693527b0bd Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Apr 2020 20:04:56 -0700 Subject: mm: introduce Reported pages
In order to pave the way for free page reporting in virtualized environments we will need a way to get pages out of the free lists and identify those pages after they have been returned. To accomplish this, this patch adds the concept of a Reported Buddy, which is essentially meant to just be the Uptodate flag used in conjunction with the Buddy page type. To prevent the reported pages from leaking outside of the buddy lists I added a check to clear the PageReported bit in the del_page_from_free_list function. As a result any reported page that is split, merged, or allocated will have the flag cleared prior to the PageBuddy value being cleared.
The process for reporting pages is fairly simple. Once we free a page that meets the minimum order for page reporting we will schedule a worker thread to start 2s or more in the future. That worker thread will begin working from the lowest supported page reporting order up to MAX_ORDER - 1 pulling unreported pages from the free list and storing them in the scatterlist.
When processing each individual free list it is necessary for the worker thread to release the zone lock when it needs to stop and report the full scatterlist of pages. To reduce the work of the next iteration the worker thread will rotate the free list so that the first unreported page in the free list becomes the first entry in the list. It will then call a reporting function providing information on how many entries are in the scatterlist. Once the function completes it will return the pages to the free area from which they were allocated and start over pulling more pages from the free areas until there are no longer enough pages to report on to keep the worker busy, or we have processed as many pages as were contained in the free area when we started processing the list.
The worker thread will work in a round-robin fashion making its way through each zone requesting reporting, and through each reportable free list within that zone. Once all free areas within the zone have been processed it will check to see if there have been any requests for reporting while it was processing. If so it will reschedule the worker thread to start up again in roughly 2s and exit.
Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Konrad Rzeszutek Wilk Cc: Luiz Capitulino Cc: Matthew Wilcox Cc: Michael S.
Tsirkin Cc: Michal Hocko Cc: Nitesh Narayan Lal Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Paolo Bonzini Cc: Rik van Riel Cc: Vlastimil Babka Cc: Wei Wang Cc: Yang Zhang Cc: wei qi Link: http://lkml.kernel.org/r/20200211224635.29318.19750.stgit@localhost.localdomain Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 11 ++ include/linux/page_reporting.h | 25 ++++ mm/Kconfig | 11 ++ mm/Makefile | 1 + mm/page_alloc.c | 17 ++- mm/page_reporting.c | 319 +++++++++++++++++++++++++++++++++++++++++ mm/page_reporting.h | 54 +++++++ 7 files changed, 434 insertions(+), 4 deletions(-) create mode 100644 include/linux/page_reporting.h create mode 100644 mm/page_reporting.c create mode 100644 mm/page_reporting.h (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index acf7988fd640..222f6f7b2bb3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -168,6 +168,9 @@ enum pageflags { /* non-lru isolated movable page */ PG_isolated = PG_reclaim, + + /* Only valid for buddy pages. Used to track pages that are reported */ + PG_reported = PG_uptodate, }; #ifndef __GENERATING_BOUNDS_H @@ -436,6 +439,14 @@ TESTCLEARFLAG(Young, young, PF_ANY) PAGEFLAG(Idle, idle, PF_ANY) #endif +/* + * PageReported() is used to track reported free pages within the Buddy + * allocator. We can use the non-atomic version of the test and set + * operations as both should be shielded with the zone lock to prevent + * any possible races on the setting or clearing of the bit. + */ +__PAGEFLAG(Reported, reported, PF_NO_COMPOUND) + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h new file mode 100644 index 000000000000..32355486f572 --- /dev/null +++ b/include/linux/page_reporting.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGE_REPORTING_H +#define _LINUX_PAGE_REPORTING_H + +#include +#include + +#define PAGE_REPORTING_CAPACITY 32 + +struct page_reporting_dev_info { + /* function that alters pages to make them "reported" */ + int (*report)(struct page_reporting_dev_info *prdev, + struct scatterlist *sg, unsigned int nents); + + /* work struct for processing reports */ + struct delayed_work work; + + /* Current state of page reporting */ + atomic_t state; +}; + +/* Tear-down and bring-up for page reporting devices */ +void page_reporting_unregister(struct page_reporting_dev_info *prdev); +int page_reporting_register(struct page_reporting_dev_info *prdev); +#endif /*_LINUX_PAGE_REPORTING_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 211a70e8d5cf..d286dc54458c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -236,6 +236,17 @@ config COMPACTION it and then we would be really interested to hear about that at linux-mm@kvack.org. +# +# support for free page reporting +config PAGE_REPORTING + bool "Free page reporting" + def_bool n + help + Free page reporting allows for the incremental acquisition of + free pages from the buddy allocator for the purpose of reporting + those pages to another entity, such as a hypervisor, so that the + memory can be freed within the host for other uses. 
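For a feel of the consumer side of this interface, here is a minimal sketch of a reporting device; it is not part of this patch, and the module name and the pr_debug() handling of the reported ranges are illustrative assumptions only. A real consumer, such as the virtio-balloon driver later in this series, would hand the scatterlist to its hypervisor interface instead:

/*
 * Hypothetical consumer of the page reporting interface: registers a
 * report callback that just logs each free range handed to it. The
 * pages in the scatterlist are isolated from the buddy lists for the
 * duration of the callback and returned to the free lists afterwards.
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/page_reporting.h>
#include <linux/scatterlist.h>

static int demo_report(struct page_reporting_dev_info *prdev,
		       struct scatterlist *sgl, unsigned int nents)
{
	struct scatterlist *sg;
	unsigned int i;

	for_each_sg(sgl, sg, nents, i)
		pr_debug("reported pfn %lx len %u\n",
			 page_to_pfn(sg_page(sg)), sg->length);

	return 0;	/* a non-zero return aborts the reporting pass */
}

static struct page_reporting_dev_info demo_prdev = {
	.report = demo_report,
};

static int __init demo_init(void)
{
	/* state and work struct are initialized by the core on register */
	return page_reporting_register(&demo_prdev);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	page_reporting_unregister(&demo_prdev);
}
module_exit(demo_exit);
MODULE_LICENSE("GPL");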
+ # # support for page migration # diff --git a/mm/Makefile b/mm/Makefile index dbc8346d16ca..fccd3756b25f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -111,3 +111,4 @@ obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 448e439b75f2..114c56c3685d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -74,6 +74,7 @@ #include #include "internal.h" #include "shuffle.h" +#include "page_reporting.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -896,6 +897,10 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, static inline void del_page_from_free_list(struct page *page, struct zone *zone, unsigned int order) { + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); + list_del(&page->lru); __ClearPageBuddy(page); set_page_private(page, 0); @@ -959,7 +964,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype) + int migratetype, bool report) { struct capture_control *capc = task_capc(zone); unsigned long uninitialized_var(buddy_pfn); @@ -1044,6 +1049,10 @@ done_merging: add_to_free_list_tail(page, zone, order, migratetype); else add_to_free_list(page, zone, order, migratetype); + + /* Notify page reporting subsystem of freed page */ + if (report) + page_reporting_notify_free(order); } /* @@ -1360,7 +1369,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1376,7 +1385,7 @@ static void free_one_page(struct zone *zone, is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype, true); spin_unlock(&zone->lock); } @@ -3227,7 +3236,7 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) lockdep_assert_held(&zone->lock); /* Return isolated page to tail of freelist. 
*/ - __free_one_page(page, page_to_pfn(page), zone, order, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, false); } /* diff --git a/mm/page_reporting.c b/mm/page_reporting.c new file mode 100644 index 000000000000..1047c6872d4f --- /dev/null +++ b/mm/page_reporting.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#include "page_reporting.h" +#include "internal.h" + +#define PAGE_REPORTING_DELAY (2 * HZ) +static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; + +enum { + PAGE_REPORTING_IDLE = 0, + PAGE_REPORTING_REQUESTED, + PAGE_REPORTING_ACTIVE +}; + +/* request page reporting */ +static void +__page_reporting_request(struct page_reporting_dev_info *prdev) +{ + unsigned int state; + + /* Check to see if we are in desired state */ + state = atomic_read(&prdev->state); + if (state == PAGE_REPORTING_REQUESTED) + return; + + /* + * If reporting is already active there is nothing we need to do. + * Test against 0 as that represents PAGE_REPORTING_IDLE. + */ + state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); + if (state != PAGE_REPORTING_IDLE) + return; + + /* + * Delay the start of work to allow a sizable queue to build. For + * now we are limiting this to running no more than once every + * couple of seconds. + */ + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +/* notify prdev of free page reporting request */ +void __page_reporting_notify(void) +{ + struct page_reporting_dev_info *prdev; + + /* + * We use RCU to protect the pr_dev_info pointer. In almost all + * cases this should be present, however in the unlikely case of + * a shutdown this will be NULL and we should exit. + */ + rcu_read_lock(); + prdev = rcu_dereference(pr_dev_info); + if (likely(prdev)) + __page_reporting_request(prdev); + + rcu_read_unlock(); +} + +static void +page_reporting_drain(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, unsigned int nents, bool reported) +{ + struct scatterlist *sg = sgl; + + /* + * Drain the now reported pages back into their respective + * free lists/areas. We assume at least one page is populated. + */ + do { + struct page *page = sg_page(sg); + int mt = get_pageblock_migratetype(page); + unsigned int order = get_order(sg->length); + + __putback_isolated_page(page, order, mt); + + /* If the pages were not reported due to error skip flagging */ + if (!reported) + continue; + + /* + * If page was not comingled with another page we can + * consider the result to be "reported" since the page + * hasn't been modified, otherwise we will need to + * report on the new larger page when we make our way + * up to that higher order. + */ + if (PageBuddy(page) && page_order(page) == order) + __SetPageReported(page); + } while ((sg = sg_next(sg))); + + /* reinitialize scatterlist now that it is empty */ + sg_init_table(sgl, nents); +} + +/* + * The page reporting cycle consists of 4 stages, fill, report, drain, and + * idle. We will cycle through the first 3 stages until we cannot obtain a + * full scatterlist of pages, in that case we will switch to idle. 
+ */ +static int +page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, + unsigned int order, unsigned int mt, + struct scatterlist *sgl, unsigned int *offset) +{ + struct free_area *area = &zone->free_area[order]; + struct list_head *list = &area->free_list[mt]; + unsigned int page_len = PAGE_SIZE << order; + struct page *page, *next; + int err = 0; + + /* + * Perform early check, if free area is empty there is + * nothing to process so we can skip this free_list. + */ + if (list_empty(list)) + return err; + + spin_lock_irq(&zone->lock); + + /* loop through free list adding unreported pages to sg list */ + list_for_each_entry_safe(page, next, list, lru) { + /* We are going to skip over the reported pages. */ + if (PageReported(page)) + continue; + + /* Attempt to pull page from list */ + if (!__isolate_free_page(page, order)) + break; + + /* Add page to scatter list */ + --(*offset); + sg_set_page(&sgl[*offset], page, page_len, 0); + + /* If scatterlist isn't full grab more pages */ + if (*offset) + continue; + + /* release lock before waiting on report processing */ + spin_unlock_irq(&zone->lock); + + /* begin processing pages in local list */ + err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); + + /* reset offset since the full list was reported */ + *offset = PAGE_REPORTING_CAPACITY; + + /* reacquire zone lock and resume processing */ + spin_lock_irq(&zone->lock); + + /* flush reported pages from the sg list */ + page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err); + + /* + * Reset next to first entry, the old next isn't valid + * since we dropped the lock to report the pages + */ + next = list_first_entry(list, struct page, lru); + + /* exit on error */ + if (err) + break; + } + + spin_unlock_irq(&zone->lock); + + return err; +} + +static int +page_reporting_process_zone(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, struct zone *zone) +{ + unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY; + unsigned long watermark; + int err = 0; + + /* Generate minimum watermark to be able to guarantee progress */ + watermark = low_wmark_pages(zone) + + (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER); + + /* + * Cancel request if insufficient free memory or if we failed + * to allocate page reporting statistics for the zone. 
+ */ + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return err; + + /* Process each free list starting from lowest order/mt */ + for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) { + for (mt = 0; mt < MIGRATE_TYPES; mt++) { + /* We do not pull pages from the isolate free list */ + if (is_migrate_isolate(mt)) + continue; + + err = page_reporting_cycle(prdev, zone, order, mt, + sgl, &offset); + if (err) + return err; + } + } + + /* report the leftover pages before going idle */ + leftover = PAGE_REPORTING_CAPACITY - offset; + if (leftover) { + sgl = &sgl[offset]; + err = prdev->report(prdev, sgl, leftover); + + /* flush any remaining pages out from the last report */ + spin_lock_irq(&zone->lock); + page_reporting_drain(prdev, sgl, leftover, !err); + spin_unlock_irq(&zone->lock); + } + + return err; +} + +static void page_reporting_process(struct work_struct *work) +{ + struct delayed_work *d_work = to_delayed_work(work); + struct page_reporting_dev_info *prdev = + container_of(d_work, struct page_reporting_dev_info, work); + int err = 0, state = PAGE_REPORTING_ACTIVE; + struct scatterlist *sgl; + struct zone *zone; + + /* + * Change the state to "Active" so that we can track if there is + * anyone requests page reporting after we complete our pass. If + * the state is not altered by the end of the pass we will switch + * to idle and quit scheduling reporting runs. + */ + atomic_set(&prdev->state, state); + + /* allocate scatterlist to store pages being reported on */ + sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); + if (!sgl) + goto err_out; + + sg_init_table(sgl, PAGE_REPORTING_CAPACITY); + + for_each_zone(zone) { + err = page_reporting_process_zone(prdev, sgl, zone); + if (err) + break; + } + + kfree(sgl); +err_out: + /* + * If the state has reverted back to requested then there may be + * additional pages to be processed. We will defer for 2s to allow + * more pages to accumulate. 
+ */ + state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); + if (state == PAGE_REPORTING_REQUESTED) + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +static DEFINE_MUTEX(page_reporting_mutex); +DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); + +int page_reporting_register(struct page_reporting_dev_info *prdev) +{ + int err = 0; + + mutex_lock(&page_reporting_mutex); + + /* nothing to do if already in use */ + if (rcu_access_pointer(pr_dev_info)) { + err = -EBUSY; + goto err_out; + } + + /* initialize state and work structures */ + atomic_set(&prdev->state, PAGE_REPORTING_IDLE); + INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); + + /* Begin initial flush of zones */ + __page_reporting_request(prdev); + + /* Assign device to allow notifications */ + rcu_assign_pointer(pr_dev_info, prdev); + + /* enable page reporting notification */ + if (!static_key_enabled(&page_reporting_enabled)) { + static_branch_enable(&page_reporting_enabled); + pr_info("Free page reporting enabled\n"); + } +err_out: + mutex_unlock(&page_reporting_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(page_reporting_register); + +void page_reporting_unregister(struct page_reporting_dev_info *prdev) +{ + mutex_lock(&page_reporting_mutex); + + if (rcu_access_pointer(pr_dev_info) == prdev) { + /* Disable page reporting notification */ + RCU_INIT_POINTER(pr_dev_info, NULL); + synchronize_rcu(); + + /* Flush any existing work, and lock it out */ + cancel_delayed_work_sync(&prdev->work); + } + + mutex_unlock(&page_reporting_mutex); +} +EXPORT_SYMBOL_GPL(page_reporting_unregister); diff --git a/mm/page_reporting.h b/mm/page_reporting.h new file mode 100644 index 000000000000..aa6d37f4dc22 --- /dev/null +++ b/mm/page_reporting.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_PAGE_REPORTING_H +#define _MM_PAGE_REPORTING_H + +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_REPORTING_MIN_ORDER pageblock_order + +#ifdef CONFIG_PAGE_REPORTING +DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); +void __page_reporting_notify(void); + +static inline bool page_reported(struct page *page) +{ + return static_branch_unlikely(&page_reporting_enabled) && + PageReported(page); +} + +/** + * page_reporting_notify_free - Free page notification to start page processing + * + * This function is meant to act as a screener for __page_reporting_notify + * which will determine if a give zone has crossed over the high-water mark + * that will justify us beginning page treatment. If we have crossed that + * threshold then it will start the process of pulling some pages and + * placing them in the batch list for treatment. 
+ */ +static inline void page_reporting_notify_free(unsigned int order) +{ + /* Called from hot path in __free_one_page() */ + if (!static_branch_unlikely(&page_reporting_enabled)) + return; + + /* Determine if we have crossed reporting threshold */ + if (order < PAGE_REPORTING_MIN_ORDER) + return; + + /* This will add a few cycles, but should be called infrequently */ + __page_reporting_notify(); +} +#else /* CONFIG_PAGE_REPORTING */ +#define page_reported(_page) false + +static inline void page_reporting_notify_free(unsigned int order) +{ +} +#endif /* CONFIG_PAGE_REPORTING */ +#endif /*_MM_PAGE_REPORTING_H */ -- cgit v1.3
From b0c504f154718904ae49349147e3b7e6ae91ffdc Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Apr 2020 20:05:05 -0700 Subject: virtio-balloon: add support for providing free page reports to host
Add support for the page reporting feature provided by virtio-balloon. Reporting differs from the regular balloon functionality in that it is much less durable than a standard memory balloon. Instead of creating a list of pages that cannot be accessed, the pages are only inaccessible while they are being indicated to the virtio interface. Once the interface has acknowledged them, they are placed back into their respective free lists and are once again accessible by the guest system.
Unlike a standard balloon, we don't inflate and deflate the pages. Instead we perform the reporting, and once the reporting is completed it is assumed that the page has been dropped from the guest and will be faulted back in the next time the page is accessed.
Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Michael S. Tsirkin Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: Konrad Rzeszutek Wilk Cc: Luiz Capitulino Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Nitesh Narayan Lal Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Paolo Bonzini Cc: Rik van Riel Cc: Vlastimil Babka Cc: Wei Wang Cc: Yang Zhang Cc: wei qi Link: http://lkml.kernel.org/r/20200211224657.29318.68624.stgit@localhost.localdomain Signed-off-by: Linus Torvalds
--- drivers/virtio/Kconfig | 1 + drivers/virtio/virtio_balloon.c | 64 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_balloon.h | 1 + 3 files changed, 66 insertions(+) (limited to 'include')
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 078615cf2afc..4b2dd8259ff5 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -58,6 +58,7 @@ config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO select MEMORY_BALLOON + select PAGE_REPORTING ---help--- This driver supports increasing and decreasing the amount of memory within a KVM guest.
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index e41a62d84260..9e0177529bad 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -19,6 +19,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units.
So each page is pointed to by @@ -47,6 +48,7 @@ enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_DEFLATE, VIRTIO_BALLOON_VQ_STATS, VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_REPORTING, VIRTIO_BALLOON_VQ_MAX }; @@ -114,6 +116,10 @@ struct virtio_balloon { /* To register a shrinker to shrink memory upon memory pressure */ struct shrinker shrinker; + + /* Free page reporting device */ + struct virtqueue *reporting_vq; + struct page_reporting_dev_info pr_dev_info; }; static struct virtio_device_id id_table[] = { @@ -153,6 +159,33 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) } +int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info, + struct scatterlist *sg, unsigned int nents) +{ + struct virtio_balloon *vb = + container_of(pr_dev_info, struct virtio_balloon, pr_dev_info); + struct virtqueue *vq = vb->reporting_vq; + unsigned int unused, err; + + /* We should always be able to add these buffers to an empty queue. */ + err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); + + /* + * In the extremely unlikely case that something has occurred and we + * are able to trigger an error we will simply display a warning + * and exit without actually processing the pages. + */ + if (WARN_ON_ONCE(err)) + return err; + + virtqueue_kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); + + return 0; +} + static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { @@ -481,6 +514,7 @@ static int init_vqs(struct virtio_balloon *vb) names[VIRTIO_BALLOON_VQ_STATS] = NULL; callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; + names[VIRTIO_BALLOON_VQ_REPORTING] = NULL; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { names[VIRTIO_BALLOON_VQ_STATS] = "stats"; @@ -492,6 +526,11 @@ static int init_vqs(struct virtio_balloon *vb) callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; } + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { + names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq"; + callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack; + } + err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs, callbacks, names, NULL, NULL); if (err) @@ -524,6 +563,9 @@ static int init_vqs(struct virtio_balloon *vb) if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; + return 0; } @@ -953,12 +995,31 @@ static int virtballoon_probe(struct virtio_device *vdev) if (err) goto out_del_balloon_wq; } + + vb->pr_dev_info.report = virtballoon_free_page_report; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { + unsigned int capacity; + + capacity = virtqueue_get_vring_size(vb->reporting_vq); + if (capacity < PAGE_REPORTING_CAPACITY) { + err = -ENOSPC; + goto out_unregister_shrinker; + } + + err = page_reporting_register(&vb->pr_dev_info); + if (err) + goto out_unregister_shrinker; + } + virtio_device_ready(vdev); if (towards_target(vb)) virtballoon_changed(vdev); return 0; +out_unregister_shrinker: + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + virtio_balloon_unregister_shrinker(vb); out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); @@ -997,6 +1058,8 @@ static void virtballoon_remove(struct 
virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + page_reporting_unregister(&vb->pr_dev_info); if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) virtio_balloon_unregister_shrinker(vb); spin_lock_irq(&vb->stop_update_lock); @@ -1069,6 +1132,7 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_FREE_PAGE_HINT, VIRTIO_BALLOON_F_PAGE_POISON, + VIRTIO_BALLOON_F_REPORTING, }; static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index a1966cd7b677..19974392d324 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -36,6 +36,7 @@ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */ #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ +#define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 -- cgit v1.3
From 43b76f298f023d32273c2b9c25dd83ae02711019 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Apr 2020 20:05:14 -0700 Subject: mm/page_reporting: add budget limit on how many pages can be reported per pass
In order to keep ourselves from reporting pages that are just going to be reused again in the case of heavy churn we can put a limit on how many total pages we will process per pass. Doing this will allow the worker thread to go into idle much more quickly so that we avoid competing with other threads that might be allocating or freeing pages.
The logic added here will limit the worker thread to no more than one sixteenth of the total free pages in a given area per list. Once that limit is reached it will update the state so that at the end of the pass we will reschedule the worker to try again in 2 seconds when the memory churn has hopefully settled down.
Again this optimization doesn't show much of a benefit in the standard case as the memory churn is minimal. However with page allocator shuffling enabled the gain is quite noticeable. Below are the results with a THP enabled version of the will-it-scale page_fault1 test showing the improvement in iterations for 16 processes or threads.
Without:
tasks processes processes_idle threads threads_idle
16 8283274.75 0.17 5594261.00 38.15
With:
tasks processes processes_idle threads threads_idle
16 8767010.50 0.21 5791312.75 36.98
Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Konrad Rzeszutek Wilk Cc: Luiz Capitulino Cc: Matthew Wilcox Cc: Michael S.
Tsirkin Cc: Michal Hocko Cc: Nitesh Narayan Lal Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Paolo Bonzini Cc: Rik van Riel Cc: Vlastimil Babka Cc: Wei Wang Cc: Yang Zhang Cc: wei qi Link: http://lkml.kernel.org/r/20200211224719.29318.72113.stgit@localhost.localdomain Signed-off-by: Linus Torvalds --- include/linux/page_reporting.h | 1 + mm/page_reporting.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index 32355486f572..3b99e0ec24f2 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -5,6 +5,7 @@ #include #include +/* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 struct page_reporting_dev_info { diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 6885e74c2367..3bbd471cfc81 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -114,6 +114,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, struct list_head *list = &area->free_list[mt]; unsigned int page_len = PAGE_SIZE << order; struct page *page, *next; + long budget; int err = 0; /* @@ -125,12 +126,39 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, spin_lock_irq(&zone->lock); + /* + * Limit how many calls we will be making to the page reporting + * device for this list. By doing this we avoid processing any + * given list for too long. + * + * The current value used allows us enough calls to process over a + * sixteenth of the current list plus one additional call to handle + * any pages that may have already been present from the previous + * list processed. This should result in us reporting all pages on + * an idle system in about 30 seconds. + * + * The division here should be cheap since PAGE_REPORTING_CAPACITY + * should always be a power of 2. + */ + budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); + /* loop through free list adding unreported pages to sg list */ list_for_each_entry_safe(page, next, list, lru) { /* We are going to skip over the reported pages. */ if (PageReported(page)) continue; + /* + * If we fully consumed our budget then update our + * state to indicate that we are requesting additional + * processing and exit this list. + */ + if (budget < 0) { + atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); + next = page; + break; + } + /* Attempt to pull page from list and place in scatterlist */ if (*offset) { if (!__isolate_free_page(page, order)) { @@ -146,7 +174,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, } /* - * Make the first non-processed page in the free list + * Make the first non-reported page in the free list * the new head of the free list before we release the * zone lock. */ @@ -162,6 +190,9 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, /* reset offset since the full list was reported */ *offset = PAGE_REPORTING_CAPACITY; + /* update budget to reflect call to report function */ + budget--; + /* reacquire zone lock and resume processing */ spin_lock_irq(&zone->lock); -- cgit v1.3 From 1df319e0b4dee11436fe2ab1a0d536d3fad7cfef Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 6 Apr 2020 20:05:25 -0700 Subject: userfaultfd: wp: add helper for writeprotect check Patch series "userfaultfd: write protection support", v6. 
Overview
========
The uffd-wp work was initiated by Shaohua Li [1], and later continued by Andrea [2]. This series is based upon Andrea's latest userfaultfd tree, and it is a continuation of the work from both Shaohua and Andrea. Many of the follow up ideas come from Andrea too.
Besides the old MISSING register mode of userfaultfd, the new uffd-wp support provides another alternative register mode called UFFDIO_REGISTER_MODE_WP that can be used to listen to not only missing page faults but also write protection page faults; the two modes can even be registered together. At the same time, the new feature also provides a new userfaultfd ioctl called UFFDIO_WRITEPROTECT which allows userspace to write protect a range of memory or fix up the write permission of faulted pages. Please refer to the document patch "userfaultfd: wp: UFFDIO_REGISTER_MODE_WP documentation update" for more information on the new interface and what it can do.
The major workflow of an uffd-wp program should be:
1. Register a memory region with WP mode using UFFDIO_REGISTER_MODE_WP.
2. Write protect part of the whole registered region using UFFDIO_WRITEPROTECT, passing in UFFDIO_WRITEPROTECT_MODE_WP to show that we want to write protect the range.
3. Start a working thread that modifies the protected pages, meanwhile listening to UFFD messages.
4. When a write is detected upon the protected range, a page fault happens, and a UFFD message will be generated and reported to the page fault handling thread.
5. The page fault handler thread resolves the page fault using the new UFFDIO_WRITEPROTECT ioctl, but this time passing in !UFFDIO_WRITEPROTECT_MODE_WP instead, showing that we want to recover the write permission. Before this operation, the fault handler thread can do anything it wants, e.g., dump the page to persistent storage.
6. The worker thread will continue running with the correctly applied write permission from step 5.
Currently there are already two projects that are based on this new userfaultfd feature.
QEMU Live Snapshot: The project provides a way to allow the QEMU hypervisor to take snapshots of VMs without stopping the VM [3].
LLNL umap library: The project provides a mmap-like interface and "allow to have an application specific buffer of pages cached from a large file, i.e. out-of-core execution using memory map" [4][5].
Before posting the patchset, this series was smoke tested against QEMU live snapshot and the LLNL umap library (by doing parallel quicksort using 128 sorting threads + 80 uffd servicing threads). My sincere thanks to Marty McFadden and Denis Plotnikov for the help along the way.
TODO
====
- hugetlbfs/shmem support
- performance
- more architectures
- cooperate with mprotect()-allowed processes (???)
- ...
References
==========
[1] https://lwn.net/Articles/666187/
[2] https://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git/log/?h=userfault
[3] https://github.com/denis-plotnikov/qemu/commits/background-snapshot-kvm
[4] https://github.com/LLNL/umap
[5] https://llnl-umap.readthedocs.io/en/develop/
[6] https://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git/commit/?h=userfault&id=b245ecf6cf59156966f3da6e6b674f6695a5ffa5
[7] https://lkml.org/lkml/2018/11/21/370
[8] https://lkml.org/lkml/2018/12/30/64
This patch (of 19): Add helper for writeprotect check. Will use it later.
Signed-off-by: Shaohua Li Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Rik van Riel Cc: Kirill A.
Shutemov Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr. David Alan Gilbert" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mike Kravetz Cc: Pavel Emelyanov Link: http://lkml.kernel.org/r/20200220163112.11409-2-peterx@redhat.com Signed-off-by: Linus Torvalds
--- include/linux/userfaultfd_k.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include')
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac9d71e24b81..5dc247af0f2e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,11 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_MISSING; } +static inline bool userfaultfd_wp(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_WP; +} + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); @@ -96,6 +101,11 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp(struct vm_area_struct *vma) +{ + return false; +} + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; -- cgit v1.3
From 5a281062af1d43d3f3956a6b429c2d727bc92603 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 6 Apr 2020 20:05:33 -0700 Subject: userfaultfd: wp: add WP pagetable tracking to x86
Accurate userfaultfd WP tracking is possible by tracking exactly which virtual memory ranges were writeprotected by userland. We can't rely only on the RW bit of the mapped pagetable because that information is destroyed by fork() or KSM or swap. If we were to rely on that, we'd need to stay on the safe side and generate false positive wp faults for every swapped out page.
[peterx@redhat.com: append _PAGE_UFFD_WP to _PAGE_CHG_MASK] Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr. David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A.
Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-4-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 8 +++++- arch/x86/include/asm/pgtable_types.h | 12 ++++++++- include/asm-generic/pgtable.h | 1 + include/asm-generic/pgtable_uffd.h | 51 +++++++++++++++++++++++++++++++++++ init/Kconfig | 5 ++++ 7 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 include/asm-generic/pgtable_uffd.h (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1edf788d301c..8d078642b4be 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,6 +149,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_ASM_MODVERSIONS diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index afda66a6d325..c37e1649fb7e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -25,6 +25,7 @@ #include #include #include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -313,6 +314,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pte_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UFFD_WP; +} + +static inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_UFFD_WP); +} + +static inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline pte_t pte_mkclean(pte_t pte) { return pte_clear_flags(pte, _PAGE_DIRTY); @@ -392,6 +410,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pmd_uffd_wp(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_UFFD_WP; +} + +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_UFFD_WP); +} + +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline pmd_t pmd_mkold(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_ACCESSED); @@ -1374,6 +1409,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); +} + +static inline int pte_swp_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_UFFD_WP; +} + +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #define PKRU_AD_BIT 0x1 #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0b6c4042942a..df1373415f11 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * | 
... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * erratum where they can be incorrectly set by hardware on * non-present PTEs. * + * SD Bits 1-4 are not used in non-present format and available for + * special use described below: + * * SD (1) in swp entry is used to store soft dirty bit, which helps us * remember soft dirty over page migration * + * F (2) in swp entry is used to record when a pagetable is + * writeprotected by userfaultfd WP support. + * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 65c2ecd730c5..b6606fe6cfdf 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,6 +32,7 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -100,6 +101,14 @@ #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) +#define _PAGE_SWP_UFFD_WP _PAGE_USER +#else +#define _PAGE_UFFD_WP (_AT(pteval_t, 0)) +#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) @@ -118,7 +127,8 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ + _PAGE_UFFD_WP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index e2e2bef07dd2..329b8c8ca703 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -10,6 +10,7 @@ #include #include #include +#include #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h new file mode 100644 index 000000000000..643d1bf559c2 --- /dev/null +++ b/include/asm-generic/pgtable_uffd.h @@ -0,0 +1,51 @@ +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H +#define _ASM_GENERIC_PGTABLE_UFFD_H + +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static __always_inline int pte_uffd_wp(pte_t pte) +{ + return 0; +} + +static __always_inline int pmd_uffd_wp(pmd_t pmd) +{ + return 0; +} + +static __always_inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pmd; +} + +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pmd; +} + +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline int 
pte_swp_uffd_wp(pte_t pte) +{ + return 0; +} + +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return pte; +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
diff --git a/init/Kconfig b/init/Kconfig index 1c12059e0f7e..2bc0e47689d8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1556,6 +1556,11 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. +config HAVE_ARCH_USERFAULTFD_WP + bool + help + Arch has userfaultfd write protection support + config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y -- cgit v1.3
From 55adf4de30346e0ec6b988cb9b885a5dddc954af Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 6 Apr 2020 20:05:37 -0700 Subject: userfaultfd: wp: userfaultfd_pte/huge_pmd_wp() helpers
Implement helper methods to invoke userfaultfd wp faults more selectively: not whenever a wp fault triggers on a vma with VM_UFFD_WP set in vma->vm_flags, but only if the _PAGE_UFFD_WP bit is set in the pagetable too.
Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr. David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-5-peterx@redhat.com Signed-off-by: Linus Torvalds
--- include/linux/userfaultfd_k.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include')
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 5dc247af0f2e..7b91b76aac58 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -14,6 +14,8 @@ #include /* linux/include/uapi/linux/userfaultfd.h */ #include +#include +#include /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -57,6 +59,18 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_WP; } +static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, + pte_t pte) +{ + return userfaultfd_wp(vma) && pte_uffd_wp(pte); +} + +static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, + pmd_t pmd) +{ + return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); +} + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); @@ -106,6 +120,19 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, + pte_t pte) +{ + return false; +} + +static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, + pmd_t pmd) +{ + return false; +} + + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; -- cgit v1.3
From 72981e0e7b609c741d7764cc920c8fec00920bd5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 6 Apr 2020 20:05:41 -0700 Subject: userfaultfd: wp: add UFFDIO_COPY_MODE_WP
This allows UFFDIO_COPY to map pages write-protected.
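As an illustration of the new mode, not part of this patch, a userspace fault handler might resolve a missing fault with a write-protected copy roughly as sketched below; the helper name and the assumption that the range was registered with both UFFDIO_REGISTER_MODE_MISSING and UFFDIO_REGISTER_MODE_WP are illustrative only:

/*
 * Hypothetical helper for a uffd fault-handling thread: fill the
 * faulting page from a local buffer but leave it write-protected,
 * so the first write to it raises a uffd-wp event instead of
 * proceeding silently.
 */
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>

static int resolve_missing_fault_wp(int uffd, unsigned long dst,
				    unsigned long src, unsigned long len)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = dst;		/* page-aligned fault address */
	copy.src = src;		/* local buffer holding the contents */
	copy.len = len;
	copy.mode = UFFDIO_COPY_MODE_WP;	/* map write-protected */

	if (ioctl(uffd, UFFDIO_COPY, &copy))
		return -1;

	/* copy.copy reports the number of bytes actually copied */
	return copy.copy == (long long)len ? 0 : -1;
}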
[peterx@redhat.com: switch to VM_WARN_ON_ONCE in mfill_atomic_pte; add brackets around "dst_vma->vm_flags & VM_WRITE"; fix wordings in comments and commit messages] Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-6-peterx@redhat.com Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 5 +++-- include/linux/userfaultfd_k.h | 2 +- include/uapi/linux/userfaultfd.h | 11 ++++++----- mm/userfaultfd.c | 36 +++++++++++++++++++++++++----------- 4 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 703c1c3faa6e..c49bef505775 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1724,11 +1724,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing); + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 7b91b76aac58..dcd33172b728 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,7 +36,7 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing); + bool *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 48f1a7c2f1f0..340f23bc251d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -203,13 +203,14 @@ struct uffdio_copy { __u64 dst; __u64 src; __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) /* - * There will be a wrprotection flag later that allows to map - * pages wrprotected on the fly. And such a flag will be - * available if the wrprotection ioctl are implemented for the - * range according to the uffdio_register.ioctls. + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. 
*/ -#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) __u64 mode; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bd96855f3961..05dbbcafdcc0 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - struct page **pagep) + struct page **pagep, + bool wp_copy) { struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; @@ -99,9 +100,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); + if ((dst_vma->vm_flags & VM_WRITE) && !wp_copy) + _dst_pte = pte_mkwrite(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { @@ -415,7 +416,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage) + bool zeropage, + bool wp_copy) { ssize_t err; @@ -432,11 +434,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, page); + dst_addr, src_addr, page, + wp_copy); else err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); } else { + VM_WARN_ON_ONCE(wp_copy); if (!zeropage) err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, @@ -454,7 +458,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, bool zeropage, - bool *mmap_changing) + bool *mmap_changing, + __u64 mode) { struct vm_area_struct *dst_vma; ssize_t err; @@ -462,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; + bool wp_copy; /* * Sanitize the command parameters: @@ -507,6 +513,14 @@ retry: dst_vma->vm_flags & VM_SHARED)) goto out_unlock; + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. 
+ */ + wp_copy = mode & UFFDIO_COPY_MODE_WP; + if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + /* * If this is a HUGETLB vma, pass off to appropriate routine */ @@ -562,7 +576,7 @@ retry: BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage); + src_addr, &page, zeropage, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { @@ -609,14 +623,14 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing) + bool *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing); + mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); } -- cgit v1.3 From 58705444c45b3ca987b03bd9beb41bbbe41ae439 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 6 Apr 2020 20:05:45 -0700 Subject: mm: merge parameters for change_protection() change_protection() is used by both the NUMA balancing and the mprotect() code, and there is one parameter for each of the two callers (dirty_accountable and prot_numa). Further, these parameters are passed along the call chain: - change_protection_range() - change_p4d_range() - change_pud_range() - change_pmd_range() - ... Now we introduce a flag for change_protection() and all these helpers to replace these parameters. Then we can avoid passing multiple parameters multiple times along the way. More importantly, it'll greatly simplify the work if we want to introduce any new parameters to change_protection(). In the follow-up patches, a new parameter for userfaultfd write protection will be introduced. No functional change at all. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Cc: Andrea Arcangeli Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-7-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- include/linux/mm.h | 14 +++++++++++++- mm/huge_memory.c | 3 ++- mm/mempolicy.c | 2 +- mm/mprotect.c | 29 ++++++++++++++++------------- 5 files changed, 33 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f2df2247026a..cfbb0a87c5f0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -46,7 +46,7 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, - int prot_numa); + unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); diff --git a/include/linux/mm.h b/include/linux/mm.h index be49e371e4b5..c7d87ff5027b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1771,9 +1771,21 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks); + +/* + * Flags used by change_protection(). For now we make it a bitmap so + * that we can pass in multiple flags just like parameters. However, + * for now, all the callers use only one of the flags at a + * time. + */ +/* Whether we should allow dirty bit accounting */ +#define MM_CP_DIRTY_ACCT (1UL << 0) +/* Whether this protection change is for NUMA hints */ +#define MM_CP_PROT_NUMA (1UL << 1) + extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); + unsigned long cp_flags); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c1e7c71db1e6..dc12249af6df 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1979,13 +1979,14 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t entry; bool preserve_write; int ret; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 037e5f548118..145be04b7108 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -627,7 +627,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/mprotect.c b/mm/mprotect.c index 0fee14b39416..046e0889e65f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int 
dirty_accountable, int prot_numa) + unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; + bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; /* * Can be called with only the mmap_sem for reading by @@ -188,7 +190,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; @@ -229,7 +231,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, __split_huge_pmd(vma, pmd, addr, false, NULL); } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); + newprot, cp_flags); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { @@ -244,7 +246,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* fall through, the trans huge pmd just split */ } this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); pages += this_pages; next: cond_resched(); @@ -260,7 +262,7 @@ next: static inline unsigned long change_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; @@ -272,7 +274,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pud++, addr = next, addr != end); return pages; @@ -280,7 +282,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, static inline unsigned long change_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; @@ -292,7 +294,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(vma, p4d, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (p4d++, addr = next, addr != end); return pages; @@ -300,7 +302,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -317,7 +319,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(vma, pgd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -330,14 +332,15 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { unsigned long pages; if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else 
- pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + pages = change_protection_range(vma, start, end, newprot, + cp_flags); return pages; } @@ -459,7 +462,7 @@ success: vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable, 0); + dirty_accountable ? MM_CP_DIRTY_ACCT : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major -- cgit v1.3 From 292924b260247483a58916f6d3550d8c92f32f55 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 6 Apr 2020 20:05:49 -0700 Subject: userfaultfd: wp: apply _PAGE_UFFD_WP bit Firstly, introduce two new flags MM_CP_UFFD_WP[_RESOLVE] for change_protection() when used with uffd-wp and make sure the two new flags are exclusively used. Then, - For MM_CP_UFFD_WP: apply the _PAGE_UFFD_WP bit and remove _PAGE_RW when a range of memory is write protected by uffd - For MM_CP_UFFD_WP_RESOLVE: remove the _PAGE_UFFD_WP bit and recover _PAGE_RW when write protection is resolved from userspace And use this new interface in mwriteprotect_range() to replace the old MM_CP_DIRTY_ACCT. Do this change for both PTEs and huge PMDs. Then we can start to identify which PTE/PMD is write protected by general (e.g., COW or soft dirty tracking), and which is for userfaultfd-wp. Since we should keep the _PAGE_UFFD_WP when doing pte_modify(), add it into _PAGE_CHG_MASK as well. Meanwhile, since we have this new bit, we can be even more strict when detecting uffd-wp page faults in either do_wp_page() or wp_huge_pmd(). After we're with _PAGE_UFFD_WP, a special case is when a page is both protected by the general COW logic and also userfault-wp. Here the userfault-wp will have higher priority and will be handled first. Only after the uffd-wp bit is cleared on the PTE/PMD will we continue to handle the general COW. These are the steps on what will happen with such a page: 1. CPU accesses write protected shared page (so both protected by general COW and uffd-wp), blocked by uffd-wp first because in do_wp_page we'll handle uffd-wp first, so it has higher priority than general COW. 2. Uffd service thread receives the request, do UFFDIO_WRITEPROTECT to remove the uffd-wp bit upon the PTE/PMD. However here we still keep the write bit cleared. Notify the blocked CPU. 3. The blocked CPU resumes the page fault process with a fault retry, during retry it'll notice it was not with the uffd-wp bit this time but it is still write protected by general COW, then it'll go though the COW path in the fault handler, copy the page, apply write bit where necessary, and retry again. 4. The CPU will be able to access this page with write bit set. Suggested-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Brian Geffon Cc: Pavel Emelyanov Cc: Mike Kravetz Cc: David Hildenbrand Cc: Martin Cracauer Cc: Mel Gorman Cc: Bobby Powers Cc: Mike Rapoport Cc: "Kirill A . Shutemov" Cc: Maya Gokhale Cc: Johannes Weiner Cc: Marty McFadden Cc: Denis Plotnikov Cc: Hugh Dickins Cc: "Dr . 
David Alan Gilbert" Cc: Jerome Glisse Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-8-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++++ mm/huge_memory.c | 18 +++++++++++++++++- mm/memory.c | 4 ++-- mm/mprotect.c | 17 +++++++++++++++++ mm/userfaultfd.c | 8 ++++++-- 5 files changed, 47 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c7d87ff5027b..e2f938c5a9d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1782,6 +1782,11 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_DIRTY_ACCT (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) +/* Whether this change is for write protecting */ +#define MM_CP_UFFD_WP (1UL << 2) /* do wp */ +#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ +#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ + MM_CP_UFFD_WP_RESOLVE) extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dc12249af6df..425339491677 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1987,6 +1987,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, bool preserve_write; int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -2053,6 +2055,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); + if (uffd_wp) { + entry = pmd_wrprotect(entry); + entry = pmd_mkuffd_wp(entry); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled by PF interrupt + * handler, then things like COW could be properly + * handled. 
+ */ + entry = pmd_clear_uffd_wp(entry); + } ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); @@ -2201,7 +2214,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false; + bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; unsigned long addr; int i; @@ -2283,6 +2296,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); + uffd_wp = pmd_uffd_wp(old_pmd); } VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); @@ -2316,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkold(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); diff --git a/mm/memory.c b/mm/memory.c index 46aa79600ed8..f35821b43c1b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2752,7 +2752,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - if (userfaultfd_wp(vma)) { + if (userfaultfd_pte_wp(vma, *vmf->pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3954,7 +3954,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) { - if (userfaultfd_wp(vmf->vma)) + if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd)) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf, orig_pmd); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 046e0889e65f..e4fa41a24bec 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -45,6 +45,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, int target_node = NUMA_NO_NODE; bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * Can be called with only the mmap_sem for reading by @@ -116,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (preserve_write) ptent = pte_mk_savedwrite(ptent); + if (uffd_wp) { + ptent = pte_wrprotect(ptent); + ptent = pte_mkuffd_wp(ptent); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled + * by PF interrupt handler, then + * things like COW could be properly + * handled. 
+ */ + ptent = pte_clear_uffd_wp(ptent); + } + /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && (pte_soft_dirty(ptent) || @@ -336,6 +351,8 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, { unsigned long pages; + BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 05dbbcafdcc0..7d6ab05be019 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -101,8 +101,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); - if ((dst_vma->vm_flags & VM_WRITE) && !wp_copy) - _dst_pte = pte_mkwrite(_dst_pte); + if (dst_vma->vm_flags & VM_WRITE) { + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else + _dst_pte = pte_mkwrite(_dst_pte); + } dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { -- cgit v1.3 From 2e3d5dc508cf001c4fb2d15515ebe6f30df88f76 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 6 Apr 2020 20:05:57 -0700 Subject: userfaultfd: wp: add pmd_swp_*uffd_wp() helpers Adding these missing helpers for uffd-wp operations with pmd swap/migration entries. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Andrea Arcangeli Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-10-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ include/asm-generic/pgtable_uffd.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c37e1649fb7e..28838d790191 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1424,6 +1424,21 @@ static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); } + +static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP); +} + +static inline int pmd_swp_uffd_wp(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP; +} + +static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP); +} #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define PKRU_AD_BIT 0x1 diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h index 643d1bf559c2..828966d4c281 100644 --- a/include/asm-generic/pgtable_uffd.h +++ b/include/asm-generic/pgtable_uffd.h @@ -46,6 +46,21 @@ static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte) { return pte; } + +static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd) +{ + return pmd; +} + +static inline int pmd_swp_uffd_wp(pmd_t pmd) +{ + return 0; +} + +static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) +{ + return pmd; +} #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #endif /* _ASM_GENERIC_PGTABLE_UFFD_H */ -- cgit v1.3 From f45ec5ff16a75f96dac8c89862d75f1d8739efd4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 6 Apr 2020 20:06:01 -0700 Subject: userfaultfd: wp: support swap and page migration For 
both swap and page migration, we use bit 2 of the entry to identify whether the entry is uffd write-protected. It plays a similar role to the existing soft dirty bit in swap entries, but only for keeping the uffd-wp tracking for a specific PTE/PMD. Something special here is that when we want to recover the uffd-wp bit from a swap/migration entry to the PTE bit we'll also need to take care of the _PAGE_RW bit and make sure it's cleared, otherwise even with the _PAGE_UFFD_WP bit we can't trap it at all. Previously, change_pte_range() did nothing for uffd if the PTE is a swap entry. That could lead to a data mismatch if the page that we are going to write protect is swapped out when sending the UFFDIO_WRITEPROTECT. This patch also applies/removes the uffd-wp bit even for the swap entries. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Andrea Arcangeli Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-11-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 2 ++ mm/huge_memory.c | 3 +++ mm/memory.c | 8 ++++++++ mm/migrate.c | 6 ++++++ mm/mprotect.c | 28 +++++++++++++++++----------- mm/rmap.c | 6 ++++++ 6 files changed, 42 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 877fd239b6ff..9a6f06de183b 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); + if (pte_swp_uffd_wp(pte)) + pte = pte_swp_clear_uffd_wp(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8164787cd51f..6ecd1045113b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2297,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); if (pmd_dirty(old_pmd)) @@ -2329,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); diff --git a/mm/memory.c b/mm/memory.c index f8b1969669b7..8ac9af73e9d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { @@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow_mapping(vm_flags)) { make_device_private_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } 
@@ -3098,6 +3102,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); + if (pte_swp_uffd_wp(vmf->orig_pte)) { + pte = pte_mkuffd_wp(pte); + pte = pte_wrprotect(pte); + } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; diff --git a/mm/migrate.c b/mm/migrate.c index c1412e04975e..7160c1556f79 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, entry = pte_to_swp_entry(*pvmw.pte); if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + else if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); if (unlikely(is_zone_device_page(new))) { if (is_device_private_page(new)) { entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); } } @@ -2338,6 +2342,8 @@ again: swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, addr, ptep, swp_pte); /* diff --git a/mm/mprotect.c b/mm/mprotect.c index e4fa41a24bec..1d823b050329 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -139,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION)) { + } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; if (is_write_migration_entry(entry)) { - pte_t newpte; /* * A protection check is difficult so * just be safe and disable write @@ -152,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - set_pte_at(vma->vm_mm, addr, pte, newpte); - - pages++; - } - - if (is_write_device_private_entry(entry)) { - pte_t newpte; - + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_write_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_one_pte() for explanation. 
*/ make_device_private_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_pte_at(vma->vm_mm, addr, pte, newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else { + newpte = oldpte; + } + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } } diff --git a/mm/rmap.c b/mm/rmap.c index 374a9bfdbffa..ed8889bf4ede 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1502,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on @@ -1601,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on @@ -1667,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, -- cgit v1.3 From e1e267c7928fe387e5e1cffeafb0de2d0473663a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 6 Apr 2020 20:06:04 -0700 Subject: khugepaged: skip collapse if uffd-wp detected Don't collapse the huge PMD if there is any userfault write protected small PTEs. The problem is that the write protection is in small page granularity and there's no way to keep all these write protection information if the small pages are going to be merged into a huge PMD. The same thing needs to be considered for swap entries and migration entries. So do the check as well disregarding khugepaged_max_ptes_swap. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Andrea Arcangeli Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . 
Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-12-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/trace/events/huge_memory.h | 1 + mm/khugepaged.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index d82a0f4e824d..70e32ff096ec 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -13,6 +13,7 @@ EM( SCAN_PMD_NULL, "pmd_null") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ + EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3afc1e2d7a55..99d77ffb79c2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -29,6 +29,7 @@ enum scan_result { SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, SCAN_PTE_NON_PRESENT, + SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -1137,6 +1138,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pte_t pteval = *_pte; if (is_swap_pte(pteval)) { if (++unmapped <= khugepaged_max_ptes_swap) { + /* + * Always be strict with uffd-wp + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). + */ + if (pte_swp_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } continue; } else { result = SCAN_EXCEED_SWAP_PTE; @@ -1156,6 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_NON_PRESENT; goto out_unmap; } + if (pte_uffd_wp(pteval)) { + /* + * Don't collapse the page if any of the small + * PTEs are armed with uffd write protection. + * Here we can also mark the new huge pmd as + * write protected if any of the small ones is + * marked but that could bring uknown + * userfault messages that falls outside of + * the registered range. So, just be simple. + */ + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } if (pte_write(pteval)) writable = true; -- cgit v1.3 From ffd05793963a44bd119311df3c02b191982574ee Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 6 Apr 2020 20:06:09 -0700 Subject: userfaultfd: wp: support write protection for userfault vma range Add API to enable/disable writeprotect a vma range. Unlike mprotect, this doesn't split/merge vmas. [peterx@redhat.com: - use the helper to find VMA; - return -ENOENT if not found to match mcopy case; - use the new MM_CP_UFFD_WP* flags for change_protection - check against mmap_changing for failures - replace find_dst_vma with vma_find_uffd] Signed-off-by: Shaohua Li Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . 
David Alan Gilbert" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mike Kravetz Cc: Pavel Emelyanov Link: http://lkml.kernel.org/r/20200220163112.11409-13-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 3 +++ mm/userfaultfd.c | 54 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index dcd33172b728..a8e5f3ea9bb2 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -41,6 +41,9 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern int mwriteprotect_range(struct mm_struct *dst_mm, + unsigned long start, unsigned long len, + bool enable_wp, bool *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7d6ab05be019..512576e171ce 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -638,3 +638,57 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, { return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); } + +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool enable_wp, bool *mmap_changing) +{ + struct vm_area_struct *dst_vma; + pgprot_t newprot; + int err; + + /* + * Sanitize the command parameters: + */ + BUG_ON(start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(start + len <= start); + + down_read(&dst_mm->mmap_sem); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + + err = -ENOENT; + dst_vma = find_dst_vma(dst_mm, start, len); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (!userfaultfd_wp(dst_vma)) + goto out_unlock; + if (!vma_is_anonymous(dst_vma)) + goto out_unlock; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + change_protection(dst_vma, start, start + len, newprot, + enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + + err = 0; +out_unlock: + up_read(&dst_mm->mmap_sem); + return err; +} -- cgit v1.3 From 63b2d4174c4ad1f40b48d7138e71bcb564c1fe03 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 6 Apr 2020 20:06:12 -0700 Subject: userfaultfd: wp: add the writeprotect API to userfaultfd ioctl Introduce the new uffd-wp APIs for userspace. Firstly, we'll allow to do UFFDIO_REGISTER with write protection tracking using the new UFFDIO_REGISTER_MODE_WP flag. Note that this flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in which case the userspace program can not only resolve missing page faults, and at the same time tracking page data changes along the way. Secondly, we introduced the new UFFDIO_WRITEPROTECT API to do page level write protection tracking. Note that we will need to register the memory region with UFFDIO_REGISTER_MODE_WP before that. 
[peterx@redhat.com: write up the commit message] [peterx@redhat.com: remove useless block, write commit message, check against VM_MAYWRITE rather than VM_WRITE when register] Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-14-peterx@redhat.com Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 82 ++++++++++++++++++++++++++++++++-------- include/uapi/linux/userfaultfd.h | 23 +++++++++++ 2 files changed, 89 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c49bef505775..59e9e399fddb 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pmd_present(_pmd)) goto out; - if (pmd_trans_huge(_pmd)) + if (pmd_trans_huge(_pmd)) { + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) + ret = true; goto out; + } /* * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it @@ -328,6 +331,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, */ if (pte_none(*pte)) ret = true; + if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; pte_unmap(pte); out: @@ -1287,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma) +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + /* FIXME: add WP support to hugetlbfs and shmem */ + return vma_is_anonymous(vma) || + ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && + !(vm_flags & VM_UFFD_WP)); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1322,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; - /* - * FIXME: remove the below error constraint by - * implementing the wprotect tracking mode. 
- */ - ret = -EINVAL; - goto out; - } ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); @@ -1380,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, vm_flags)) goto out_unlock; /* @@ -1408,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (end & (vma_hpagesize - 1)) goto out_unlock; } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; /* * Check that this vma isn't already owned by a @@ -1437,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vm_flags)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1575,7 +1578,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, cur->vm_flags)) goto out_unlock; found = true; @@ -1589,7 +1592,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); /* * Nothing to do: this vma is already registered into this @@ -1802,6 +1805,50 @@ out: return ret; } +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + + if (READ_ONCE(ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, &uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) && + (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) + return -EINVAL; + + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, uffdio_wp.mode & + UFFDIO_WRITEPROTECT_MODE_WP, + &ctx->mmap_changing); + if (ret) + return ret; + + if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1883,6 +1930,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; } return ret; } diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 340f23bc251d..95c4a160e5f8 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -52,6 +52,7 @@ #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_WRITEPROTECT (0x06) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -68,6 +69,8 @@ struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ struct uffdio_zeropage) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, 
_UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) /* read() structure */ struct uffd_msg { @@ -232,4 +235,24 @@ struct uffdio_zeropage { __s64 zeropage; }; +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + #endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.3 From e06f1e1dd4998ffc9da37f580703b55a93fc4de4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 6 Apr 2020 20:06:16 -0700 Subject: userfaultfd: wp: enabled write protection in userfaultfd API Now it's safe to enable write protection in userfaultfd API Signed-off-by: Shaohua Li Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mike Kravetz Cc: Pavel Emelyanov Link: http://lkml.kernel.org/r/20200220163112.11409-15-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 95c4a160e5f8..e7e98bde221f 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,7 +19,8 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -34,7 +35,8 @@ #define UFFD_API_RANGE_IOCTLS \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_ZEROPAGE) + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) -- cgit v1.3 From 68c3a6ac65f675b4b783635787fa0ed896f5b3d5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:06:40 -0700 Subject: drivers/base/memory.c: drop section_count Patch series "mm: drop superfluous section checks when onlining/offlining". Let's drop some superfluous section checks on the onlining/offlining path. This patch (of 3): Since commit c5e79ef561b0 ("mm/memory_hotplug.c: don't allow to online/offline memory blocks with holes") we have a generic check in offline_pages() that disallows offlining memory blocks with holes. Memory blocks with missing sections are just another variant of these type of blocks. We can stop checking (and especially storing) present sections. A proper error message is now printed why offlining failed. 
section_count was initially introduced in commit 07681215975e ("Driver core: Add section count to memory_block struct") in order to detect when it is okay to remove a memory block. It was used in commit 26bbe7ef6d5c ("drivers/base/memory.c: prohibit offlining of memory blocks with missing sections") to disallow offlining memory blocks with missing sections. As we refactored creation/removal of memory devices and have a proper check for holes in place, we can drop the section_count. This also removes a leftover comment regarding the mem_sysfs_mutex, which was removed in commit 848e19ad3c33 ("drivers/base/memory.c: drop the mem_sysfs_mutex"). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Dan Williams Cc: Pavel Tatashin Cc: Anshuman Khandual Link: http://lkml.kernel.org/r/20200127110424.5757-2-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 17 +++-------------- include/linux/memory.h | 1 - 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 4086718f6876..086997212dbb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -267,10 +267,6 @@ static int memory_subsys_offline(struct device *dev) if (mem->state == MEM_OFFLINE) return 0; - /* Can't offline block with non-present sections */ - if (mem->section_count != sections_per_block) - return -EINVAL; - return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); } @@ -627,7 +623,7 @@ static int init_memory_block(struct memory_block **memory, static int add_memory_block(unsigned long base_section_nr) { - int ret, section_count = 0; + int section_count = 0; struct memory_block *mem; unsigned long nr; @@ -638,12 +634,8 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; - ret = init_memory_block(&mem, base_memory_block_id(base_section_nr), - MEM_ONLINE); - if (ret) - return ret; - mem->section_count = section_count; - return 0; + return init_memory_block(&mem, base_memory_block_id(base_section_nr), + MEM_ONLINE); } static void unregister_memory(struct memory_block *memory) @@ -679,7 +671,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size) ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) break; - mem->section_count = sections_per_block; } if (ret) { end_block_id = block_id; @@ -688,7 +679,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; - mem->section_count = 0; unregister_memory(mem); } } @@ -717,7 +707,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; - mem->section_count = 0; unregister_memory_block_under_nodes(mem); unregister_memory(mem); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 0b8d791b6669..439a89e758d8 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -26,7 +26,6 @@ struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ - int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ int phys_device; /* to which fru does this belong? 
*/ struct device dev; -- cgit v1.3 From 0a9f9f62316606ee827fa3318e95a1c489d9acf5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Apr 2020 20:07:06 -0700 Subject: mm/sparse.c: only use subsection map in VMEMMAP case Currently, to support subsection aligned memory region adding for pmem, subsection map is added to track which subsection is present. However, config ZONE_DEVICE depends on SPARSEMEM_VMEMMAP. It means subsection map only makes sense when SPARSEMEM_VMEMMAP enabled. For the classic sparse, it's meaningless. Even worse, it may confuse people when checking code related to the classic sparse. About the classic sparse which doesn't support subsection hotplug, Dan said it's more because the effort and maintenance burden outweighs the benefit. Besides, the current 64 bit ARCHes all enable SPARSEMEM_VMEMMAP_ENABLE by default. Combining the above reasons, no need to provide subsection map and the relevant handling for the classic sparse. Let's remove them. Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Cc: Dan Williams Cc: Michal Hocko Cc: Pankaj Gupta Cc: Wei Yang Link: http://lkml.kernel.org/r/20200312124414.439-4-bhe@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/sparse.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 42b77d3b68e8..f3f264826423 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1143,7 +1143,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { +#ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); +#endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; diff --git a/mm/sparse.c b/mm/sparse.c index 01204c3b4649..095ecf5bb6d3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -209,6 +209,7 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP static void subsection_mask_set(unsigned long *map, unsigned long pfn, unsigned long nr_pages) { @@ -243,6 +244,11 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) nr_pages -= pfns; } } +#else +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ +} +#endif /* Record a memory area against a node. 
*/ void __init memory_present(int nid, unsigned long start, unsigned long end) @@ -705,6 +711,7 @@ static void free_map_bootmem(struct page *memmap) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_SPARSEMEM_VMEMMAP static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) { DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; @@ -731,6 +738,17 @@ static bool is_subsection_map_empty(struct mem_section *ms) return bitmap_empty(&ms->usage->subsection_map[0], SUBSECTIONS_PER_SECTION); } +#else +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return true; +} +#endif static void section_deactivate(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) @@ -792,6 +810,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, ms->section_mem_map = (unsigned long)NULL; } +#ifdef CONFIG_SPARSEMEM_VMEMMAP static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) { struct mem_section *ms = __pfn_to_section(pfn); @@ -813,6 +832,12 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) return rc; } +#else +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} +#endif static struct page * __meminit section_activate(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) -- cgit v1.3 From 956f8b445061667c3545baa24778f890d1d522f4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:16 -0700 Subject: drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE Patch series "mm/memory_hotplug: allow to specify a default online_type", v3. Distributions nowadays use udev rules ([1] [2]) to specify if and how to online hotplugged memory. The rules seem to get more complex with many special cases. Due to the various special cases, CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug is handled via udev rules. Every time we hotplug memory, the udev rule will come to the same conclusion. Especially Hyper-V (but also soon virtio-mem) add a lot of memory in separate memory blocks and wait for memory to get onlined by user space before continuing to add more memory blocks (to not add memory faster than it is getting onlined). This of course slows down the whole memory hotplug process. To make the job of distributions easier and to avoid udev rules that get more and more complicated, let's extend the mechanism provided by - /sys/devices/system/memory/auto_online_blocks - "memhp_default_state=" on the kernel cmdline to be able to specify also "online_movable" as well as "online_kernel" === Example /usr/libexec/config-memhotplug === #!/bin/bash VIRT=`systemd-detect-virt --vm` ARCH=`uname -p` sense_virtio_mem() { if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc -l` if [ $DEVICES != "0" ]; then return 0 fi fi return 1 } if [ ! 
-e "/sys/devices/system/memory/auto_online_blocks" ]; then echo "Memory hotplug configuration support missing in the kernel" exit 1 fi if grep "memhp_default_state=" /proc/cmdline > /dev/null; then echo "Memory hotplug configuration overridden in kernel cmdline (memhp_default_state=)" exit 1 fi if [ $VIRT == "microsoft" ]; then echo "Detected Hyper-V on $ARCH" # Hyper-V wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif sense_virtio_mem; then echo "Detected virtio-mem on $ARCH" # virtio-mem wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then echo "Detected $ARCH" # standby memory should not be onlined automatically ONLINE_TYPE="offline" elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then echo "Detected" $ARCH # PPC64 onlines all hotplugged memory right from the kernel ONLINE_TYPE="offline" elif [ $VIRT == "none" ]; then echo "Detected bare-metal on $ARCH" # Bare metal users expect hotplugged memory to be unpluggable. We assume # that ZONE imbalances on such enterpise servers cannot happen and is # properly documented ONLINE_TYPE="online_movable" else # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE # imbalances won't happen echo "Detected $VIRT on $ARCH" # Usually, ballooning is used in virtual environments, so memory should go to # ZONE_NORMAL. However, sometimes "movable_node" is relevant. ONLINE_TYPE="online" fi echo "Selected online_type:" $ONLINE_TYPE # Configure what to do with memory that will be hotplugged in the future echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks if [ $? != "0" ]; then echo "Memory hotplug cannot be configured (e.g., old kernel or missing permissions)" # A backup udev rule should handle old kernels if necessary exit 1 fi # Process all already pluggedd blocks (e.g., DIMMs, but also Hyper-V or virtio-mem) if [ $ONLINE_TYPE != "offline" ]; then for MEMORY in /sys/devices/system/memory/memory*; do STATE=`cat $MEMORY/state` if [ $STATE == "offline" ]; then echo $ONLINE_TYPE > $MEMORY/state fi done fi === Example /usr/lib/systemd/system/config-memhotplug.service === [Unit] Description=Configure memory hotplug behavior DefaultDependencies=no Conflicts=shutdown.target Before=sysinit.target shutdown.target After=systemd-modules-load.service ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks [Service] ExecStart=/usr/libexec/config-memhotplug Type=oneshot TimeoutSec=0 RemainAfterExit=yes [Install] WantedBy=sysinit.target === Example modification to the 40-redhat.rules [2] === : diff --git a/40-redhat.rules b/40-redhat.rules-new : index 2c690e5..168fd03 100644 : --- a/40-redhat.rules : +++ b/40-redhat.rules-new : @@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online} : # Memory hotadd request : SUBSYSTEM!="memory", GOTO="memory_hotplug_end" : ACTION!="add", GOTO="memory_hotplug_end" : +# memory hotplug behavior configured : +PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", GOTO="memory_hotplug_end" : + : PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end" : : ENV{.state}="online" === [1] https://github.com/lnykryn/systemd-rhel/pull/281 [2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules This patch (of 8): The name is misleading and it's not really clear what is "kept". Let's just name it like the online_type name we expose to user space ("online"). Add some documentation to the types. 
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Vitaly Kuznetsov Cc: Yumei Huang Cc: Igor Mammedov Cc: Eduardo Habkost Cc: Benjamin Herrenschmidt Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Michael Ellerman (powerpc) Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Wei Liu Link: http://lkml.kernel.org/r/20200319131221.14044-1-david@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-2-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 9 +++++---- include/linux/memory_hotplug.h | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 96c80dfaac90..156b89b14fcc 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -208,7 +208,7 @@ static int memory_subsys_online(struct device *dev) * attribute and need to set the online_type. */ if (mem->online_type < 0) - mem->online_type = MMOP_ONLINE_KEEP; + mem->online_type = MMOP_ONLINE; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); @@ -243,7 +243,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, else if (sysfs_streq(buf, "online_movable")) online_type = MMOP_ONLINE_MOVABLE; else if (sysfs_streq(buf, "online")) - online_type = MMOP_ONLINE_KEEP; + online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) online_type = MMOP_OFFLINE; else { @@ -254,7 +254,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: - case MMOP_ONLINE_KEEP: + case MMOP_ONLINE: /* mem->online_type is protected by device_hotplug_lock */ mem->online_type = online_type; ret = device_online(&mem->dev); @@ -334,7 +334,8 @@ static ssize_t valid_zones_show(struct device *dev, } nid = mem->nid; - default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); + default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, + nr_pages); strcat(buf, default_zone->name); print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f4d59155f3d4..261dbf010d5d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -47,9 +47,13 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { + /* Offline the memory. */ MMOP_OFFLINE = -1, - MMOP_ONLINE_KEEP, + /* Online the memory. Zone depends, see default_zone_for_pfn(). */ + MMOP_ONLINE, + /* Online the memory to ZONE_NORMAL. */ MMOP_ONLINE_KERNEL, + /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, }; -- cgit v1.3 From efc978ad0e05ed6401c7854811750bf55b67f4b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:20 -0700 Subject: drivers/base/memory: map MMOP_OFFLINE to 0 Historically, we used the value -1. Just treat 0 as the special case now. Clarify a comment (which was wrong, when we come via device_online() the first time, the online_type would have been 0 / MEM_ONLINE). The default is now always MMOP_OFFLINE. This removes the last user of the manual "-1", which didn't use the enum value. This is a preparation to use the online_type as an array index. 
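To make the array-index idea concrete: with MMOP_OFFLINE at 0, the types can directly index a designated-initializer string table, and the reverse lookup becomes a plain loop. This roughly mirrors the online_type_to_str[] table and memhp_online_type_from_str() helper visible in later hunks of this series:

	static const char *const online_type_to_str[] = {
		[MMOP_OFFLINE]        = "offline",
		[MMOP_ONLINE]         = "online",
		[MMOP_ONLINE_KERNEL]  = "online_kernel",
		[MMOP_ONLINE_MOVABLE] = "online_movable",
	};

	/* Reverse lookup over the table; -EINVAL for unknown strings. */
	int memhp_online_type_from_str(const char *str)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
			if (sysfs_streq(str, online_type_to_str[i]))
				return i;
		}
		return -EINVAL;
	}
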
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-3-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 11 ++++------- include/linux/memory_hotplug.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 156b89b14fcc..f65f3d53dc64 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -203,17 +203,14 @@ static int memory_subsys_online(struct device *dev) return 0; /* - * If we are called from state_store(), online_type will be - * set >= 0 Otherwise we were called from the device online - * attribute and need to set the online_type. + * When called via device_online() without configuring the online_type, + * we want to default to MMOP_ONLINE. */ - if (mem->online_type < 0) + if (mem->online_type == MMOP_OFFLINE) mem->online_type = MMOP_ONLINE; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); - - /* clear online_type */ - mem->online_type = -1; + mem->online_type = MMOP_OFFLINE; return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 261dbf010d5d..c2e06ed5e0e9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -48,7 +48,7 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { /* Offline the memory. */ - MMOP_OFFLINE = -1, + MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ MMOP_ONLINE, /* Online the memory to ZONE_NORMAL. */ -- cgit v1.3 From 862919e568356cc36288a11b42cd88ec3a7100e9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:40 -0700 Subject: mm/memory_hotplug: convert memhp_auto_online to store an online_type ... and rename it to memhp_default_online_type. This is a preparation for more detailed default online behavior. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. 
Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-8-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 10 ++++------ include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 11 ++++++----- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1c90bdf60d85..7d2f829d00d7 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -378,10 +378,8 @@ static DEVICE_ATTR_RO(block_size_bytes); static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (memhp_auto_online) - return sprintf(buf, "online\n"); - else - return sprintf(buf, "offline\n"); + return sprintf(buf, "%s\n", + online_type_to_str[memhp_default_online_type]); } static ssize_t auto_online_blocks_store(struct device *dev, @@ -389,9 +387,9 @@ static ssize_t auto_online_blocks_store(struct device *dev, const char *buf, size_t count) { if (sysfs_streq(buf, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; else return -EINVAL; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c2e06ed5e0e9..c6e090b34c4b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; -extern bool memhp_auto_online; +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int memhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9691cbb4383e..9436f7e6257a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,17 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -bool memhp_auto_online; +int memhp_default_online_type = MMOP_OFFLINE; #else -bool memhp_auto_online = true; +int memhp_default_online_type = MMOP_ONLINE; #endif static int __init setup_memhp_default_state(char *str) { if (!strcmp(str, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (!strcmp(str, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; return 1; } @@ -990,6 +990,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { + mem->online_type = memhp_default_online_type; return device_online(&mem->dev); } @@ -1062,7 +1063,7 @@ int __ref add_memory_resource(int nid, struct resource *res) mem_hotplug_done(); /* online pages if requested */ - if (memhp_auto_online) + if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; -- cgit v1.3 From 5f47adf762b78cae97de58d9ff01d2d44db09467 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:44 -0700 Subject: mm/memory_hotplug: allow to specify a default online_type For now, distributions implement advanced udev rules to essentially - Don't online any hotplugged memory (s390x) - Online all memory to ZONE_NORMAL (e.g., most virt environments like 
hyperv) - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken care of (e.g., bare metal, special virt environments) In summary: All memory is usually onlined the same way, however, the kernel always has to ask user space to come up with the same answer. E.g., Hyper-V always waits for a memory block to get onlined before continuing, otherwise it might end up adding memory faster than onlining it, which can result in strange OOM situations. This waiting slows down adding of a bigger amount of memory. Let's allow to specify a default online_type, not just "online" and "offline". This allows distributions to configure the default online_type when booting up and be done with it. We can now specify "offline", "online", "online_movable" and "online_kernel" via - "memhp_default_state=" on the kernel cmdline - /sys/devices/system/memory/auto_online_blocks just like we are able to specify for a single memory block via /sys/devices/system/memory/memoryX/state Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-9-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 11 +++++------ include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7d2f829d00d7..dbec3a05590a 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -34,7 +34,7 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE_MOVABLE] = "online_movable", }; -static int memhp_online_type_from_str(const char *str) +int memhp_online_type_from_str(const char *str) { int i; @@ -386,13 +386,12 @@ static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (sysfs_streq(buf, "offline")) - memhp_default_online_type = MMOP_OFFLINE; - else + const int online_type = memhp_online_type_from_str(buf); + + if (online_type < 0) return -EINVAL; + memhp_default_online_type = online_type; return count; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c6e090b34c4b..ef55115320fb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; +extern int memhp_online_type_from_str(const char *str); + /* Default online_type (MMOP_*) when new memory blocks are added. 
*/ extern int memhp_default_online_type; /* If movable_node boot option specified */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9436f7e6257a..2fb78c5ebaf3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -74,10 +74,10 @@ int memhp_default_online_type = MMOP_ONLINE; static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (!strcmp(str, "offline")) - memhp_default_online_type = MMOP_OFFLINE; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type; return 1; } -- cgit v1.3 From 552657b7b3343851916fde7e4fd6bfb6516d2bcb Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Mon, 6 Apr 2020 20:08:33 -0700 Subject: mm: fix ambiguous comments for better code readability The parameter @pfn of remap_pfn_range() passed from the caller is actually a page-frame number converted from the corresponding physical address of kernel memory; the original comment is ambiguous and may mislead users. Meanwhile, there is a confusing typo "VMM" in the comment of vm_area_struct. So fixing them will make the code more readable. Signed-off-by: chenqiwu Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/1583026921-15279-1-git-send-email-qiwuchen55@gmail.com Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- mm/memory.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dd555e6d23f3..4aba6c0c2ba8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -289,8 +289,8 @@ struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ /* - * This struct defines a memory VMM memory area. There is one of these - * per VM-area/task. A VM area is any part of the process virtual memory + * This struct describes a virtual memory area. There is one of these - * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). */ diff --git a/mm/memory.c b/mm/memory.c index 8ac9af73e9d2..19874d133a66 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1952,7 +1952,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @vma: user vma to map to * @addr: target user address to start at * @pfn: page frame number of kernel physical memory address - * @size: size of map area + * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. -- cgit v1.3 From 3f3673d7d324d872d9d8ddb73b3e5e47fbf12e0d Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 6 Apr 2020 20:08:43 -0700 Subject: include/linux/swapops.h: correct guards for non_swap_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_DEVICE_PRIVATE is defined, but neither CONFIG_MEMORY_FAILURE nor CONFIG_MIGRATION is, then non_swap_entry() will return 0, meaning that the condition (non_swap_entry(entry) && is_device_private_entry(entry)) in zap_pte_range() will never be true even if the entry is a device private one. Equally, any other code depending on non_swap_entry() will not function as expected. I originally spotted this just by looking at the code; I haven't actually observed any problems.
Looking a bit more closely it appears that actually this situation (currently at least) cannot occur: DEVICE_PRIVATE depends on ZONE_DEVICE ZONE_DEVICE depends on MEMORY_HOTREMOVE MEMORY_HOTREMOVE depends on MIGRATION Fixes: 5042db43cc26 ("mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory") Signed-off-by: Steven Price Signed-off-by: Andrew Morton Cc: Jérôme Glisse Cc: Arnd Bergmann Cc: Dan Williams Cc: John Hubbard Link: http://lkml.kernel.org/r/20200305130550.22693-1-steven.price@arm.com Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 9a6f06de183b..d9b7c9132c2f 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -350,7 +350,8 @@ static inline void num_poisoned_pages_inc(void) } #endif -#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) +#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \ + defined(CONFIG_DEVICE_PRIVATE) static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; -- cgit v1.3 From 1d90b6491014ead775146726b81a78ed993c3188 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 6 Apr 2020 20:08:46 -0700 Subject: include/linux/memremap.h: remove stale comments Fixes: 80a72d0af05a ("memremap: remove the data field in struct dev_pagemap") Fixes: fdc029b19dfd ("memremap: remove the dev field in struct dev_pagemap") Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Jason Gunthorpe Cc: Dan Williams Link: http://lkml.kernel.org/r/20200316213205.145333-1-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 60d97e8fd3c0..8b37c4c9222c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -98,8 +98,6 @@ struct dev_pagemap_ops { * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref - * @dev: host device of the mapping for debug - * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table -- cgit v1.3 From 6218d740ac1bc723d57900b865d3e52b83550c2b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 6 Apr 2020 20:08:52 -0700 Subject: mm: remove dummy struct bootmem_data/bootmem_data_t Both bootmem_data and bootmem_data_t structures are no longer defined. Remove the dummy forward declarations. Signed-off-by: Waiman Long Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Acked-by: Mike Rapoport Link: http://lkml.kernel.org/r/20200326022617.26208-1-longman@redhat.com Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/mmzone.h | 2 -- include/linux/mmzone.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h index 7ee144f484f1..9b521c857436 100644 --- a/arch/alpha/include/asm/mmzone.h +++ b/arch/alpha/include/asm/mmzone.h @@ -8,8 +8,6 @@ #include -struct bootmem_data_t; /* stupid forward decl. */ - /* * Following are macros that are specific to this numa platform. 
*/ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f3f264826423..e9892bf9eba9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -664,7 +664,6 @@ struct deferred_split { * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ -struct bootmem_data; typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[MAX_ZONELISTS]; -- cgit v1.3 From d919b33dafb3e222d23671b2bb06d119aede625f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Apr 2020 20:09:01 -0700 Subject: proc: faster open/read/close with "permanent" files Now that "struct proc_ops" exists, we can start putting stuff there which could not fly with the VFS "struct file_operations"... Most of the fs/proc/inode.c file is dedicated to making open/read/.../close reliable in the event of disappearing /proc entries, which usually happens if a module is getting removed. Files like /proc/cpuinfo which never disappear simply do not need such protection. Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such "permanent" files. Enable "permanent" flag for /proc/cpuinfo /proc/kmsg /proc/modules /proc/slabinfo /proc/stat /proc/sysvipc/* /proc/swaps More will come once I figure out a foolproof way to prevent module authors from marking their stuff "permanent" for performance reasons when it is not. This should help with scalability: the benchmark is "read /proc/cpuinfo R times by N threads scattered over the system". N R t, s (before) t, s (after) ----------------------------------------------------- 64 4096 1.582458 1.530502 -3.2% 256 4096 6.371926 6.125168 -3.9% 1024 4096 25.64888 24.47528 -4.6% Benchmark source: #include <chrono> #include <iostream> #include <thread> #include <vector> #include <fcntl.h> #include <sched.h> #include <stdlib.h> #include <unistd.h> const int NR_CPUS = sysconf(_SC_NPROCESSORS_ONLN); int N; const char *filename; int R; int xxx = 0; int glue(int n) { cpu_set_t m; CPU_ZERO(&m); CPU_SET(n, &m); return sched_setaffinity(0, sizeof(cpu_set_t), &m); } void f(int n) { glue(n % NR_CPUS); while (*(volatile int *)&xxx == 0) { } for (int i = 0; i < R; i++) { int fd = open(filename, O_RDONLY); char buf[4096]; ssize_t rv = read(fd, buf, sizeof(buf)); asm volatile ("" :: "g" (rv)); close(fd); } } int main(int argc, char *argv[]) { if (argc < 4) { std::cerr << "usage: " << argv[0] << ' ' << "N /proc/filename R "; return 1; } N = atoi(argv[1]); filename = argv[2]; R = atoi(argv[3]); for (int i = 0; i < NR_CPUS; i++) { if (glue(i) == 0) break; } std::vector<std::thread> T; T.reserve(N); for (int i = 0; i < N; i++) { T.emplace_back(f, i); } auto t0 = std::chrono::system_clock::now(); { *(volatile int *)&xxx = 1; for (auto& t: T) { t.join(); } } auto t1 = std::chrono::system_clock::now(); std::chrono::duration<double> dt = t1 - t0; std::cout << dt.count() << ' '; return 0; } P.S.: Explicit randomization marker is added because adding a non-function pointer will silently disable structure layout randomization.
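As a usage illustration (the example_* names are hypothetical, not from this patch), a never-removed /proc file opts into the fast path exactly like the cpuinfo/stat conversions below:

	/* Hypothetical driver-side example, mirroring the conversions in this patch. */
	static int example_open(struct inode *inode, struct file *file)
	{
		return single_open(file, example_show, NULL); /* example_show: hypothetical */
	}

	static const struct proc_ops example_proc_ops = {
		.proc_flags   = PROC_ENTRY_PERMANENT, /* never removed: skips use_pde/unuse_pde */
		.proc_open    = example_open,
		.proc_read    = seq_read,
		.proc_lseek   = seq_lseek,
		.proc_release = single_release,
	};
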
[akpm@linux-foundation.org: coding style fixes] Reported-by: kbuild test robot Reported-by: Dan Carpenter Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Cc: Al Viro Cc: Joe Perches Link: http://lkml.kernel.org/r/20200222201539.GA22576@avx2 Signed-off-by: Linus Torvalds --- fs/proc/cpuinfo.c | 1 + fs/proc/generic.c | 31 +++++++- fs/proc/inode.c | 187 +++++++++++++++++++++++++++++++++++------------- fs/proc/internal.h | 6 ++ fs/proc/kmsg.c | 1 + fs/proc/stat.c | 1 + include/linux/proc_fs.h | 17 ++++- ipc/util.c | 1 + kernel/module.c | 1 + mm/slab_common.c | 1 + mm/swapfile.c | 1 + 11 files changed, 194 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index c1dea9b8222e..d0989a443c77 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3faed94e4b65..4ed6dabdf6ff 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } +static inline void pde_set_flags(struct proc_dir_entry *pde) +{ + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) } static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file) } static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) { - rb_erase(&de->subdir_node, &parent->subdir); - if (S_ISDIR(de->mode)) { - parent->nlink--; + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) + parent->nlink--; } } write_unlock(&proc_subdir_lock); @@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } rb_erase(&next->subdir_node, 
&de->subdir); de = next; continue; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 05d31c464bee..fb4cace9ea41 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -259,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } +static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) +{ + typeof_member(struct proc_ops, proc_lseek) lseek; + + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + return lseek(file, offset, whence); +} + static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_lseek) lseek; - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - rv = lseek(file, offset, whence); + if (pde_is_permanent(pde)) { + return pde_lseek(pde, file, offset, whence); + } else if (use_pde(pde)) { + rv = pde_lseek(pde, file, offset, whence); unuse_pde(pde); } return rv; } +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_read) read; + + read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_ops->proc_read; - if (read) - rv = read(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_write) write; + + write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_ops->proc_write; - if (write) - rv = write(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + typeof_member(struct proc_ops, proc_poll) poll; + + poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_ops->proc_poll; - if (poll) - rv = poll(file, pts); + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, 
proc_ioctl) ioctl; + + ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_ops->proc_ioctl; - if (ioctl) - rv = ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; + + compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + typeof_member(struct proc_ops, proc_mmap) mmap; + + mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_ops->proc_mmap; - if (mmap) - rv = mmap(file, vma); + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long -proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned long rv = -EIO; - - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - get_area = pde->proc_ops->proc_get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif + if (get_area) + return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; +} + +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; - if (get_area) - rv = get_area(file, orig_addr, len, pgoff, flags); - else - rv = orig_addr; + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, 
len, pgoff, flags); unuse_pde(pde); } return rv; @@ -401,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file) typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + /* * Ensure that * 1) PDE's ->release hook will be called no matter what @@ -450,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + typeof_member(struct proc_ops, proc_release) release; + + release = pde->proc_ops->proc_release; + if (release) { + return release(inode, file); + } + return 0; + } + spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9e294f0290e5..917cc85e3466 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,7 @@ struct proc_dir_entry { struct rb_node subdir_node; char *name; umode_t mode; + u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; @@ -73,6 +74,11 @@ struct proc_dir_entry { 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ec1b7d2fb773..b38ad552887f 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read = kmsg_read, .proc_poll = kmsg_poll, .proc_open = kmsg_open, diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0449edf460f5..46b3293015fe 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 40a7982b7285..45c05fd9c99d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -5,6 +5,7 @@ #ifndef _LINUX_PROC_FS_H #define _LINUX_PROC_FS_H +#include #include #include @@ -12,7 +13,21 @@ struct proc_dir_entry; struct seq_file; struct seq_operations; +enum { + /* + * All /proc entries using this ->proc_ops instance are never removed. + * + * If in doubt, ignore this flag. 
+ */ +#ifdef MODULE + PROC_ENTRY_PERMANENT = 0U, +#else + PROC_ENTRY_PERMANENT = 1U << 0, +#endif +}; + struct proc_ops { + unsigned int proc_flags; int (*proc_open)(struct inode *, struct file *); ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); @@ -25,7 +40,7 @@ struct proc_ops { #endif int (*proc_mmap)(struct file *, struct vm_area_struct *); unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); -}; +} __randomize_layout; #ifdef CONFIG_PROC_FS diff --git a/ipc/util.c b/ipc/util.c index fe61df53775a..97638eb2d7cb 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -885,6 +885,7 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file) } static const struct proc_ops sysvipc_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = sysvipc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/kernel/module.c b/kernel/module.c index 33569a01d6e1..3447f3b74870 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file) } static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = modules_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/mm/slab_common.c b/mm/slab_common.c index 5282f881d2f5..93ec4a574d8d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops slabinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, .proc_write = slabinfo_write, diff --git a/mm/swapfile.c b/mm/swapfile.c index 273a923c275c..5871a2aa86a5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file) } static const struct proc_ops swaps_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, -- cgit v1.3 From b829a0f0f2f2094c1e40637259c44b854e6ebe96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Apr 2020 20:09:17 -0700 Subject: seq_file: remove m->version The process maps file was the only user of version (introduced back in 2005). Now that it uses ppos instead, we can remove it. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200317193201.9924-4-adobriyan@gmail.com Signed-off-by: Linus Torvalds --- fs/seq_file.c | 28 ---------------------------- include/linux/seq_file.h | 1 - 2 files changed, 29 deletions(-) (limited to 'include') diff --git a/fs/seq_file.c b/fs/seq_file.c index 1600034a929b..79781ebd2145 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -67,13 +67,6 @@ int seq_open(struct file *file, const struct seq_operations *op) // to the lifetime of the file. p->file = file; - /* - * Wrappers around seq_open(e.g. swaps_open) need to be - * aware of this. If they set f_version themselves, they - * should call seq_open first and then set f_version. - */ - file->f_version = 0; - /* * seq_files support lseek() and pread(). 
They do not implement * write() at all, but we clear FMODE_PWRITE here for historical @@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset) int error = 0; void *p; - m->version = 0; m->index = 0; m->count = m->from = 0; if (!offset) @@ -160,26 +152,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) mutex_lock(&m->lock); - /* - * seq_file->op->..m_start/m_stop/m_next may do special actions - * or optimisations based on the file->f_version, so we want to - * pass the file->f_version to those methods. - * - * seq_file->version is just copy of f_version, and seq_file - * methods can treat it simply as file version. - * It is copied in first and copied out after all operations. - * It is convenient to have it as part of structure to avoid the - * need of passing another argument to all the seq_file methods. - */ - m->version = file->f_version; - /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (*ppos == 0) { m->index = 0; - m->version = 0; m->count = 0; } @@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (err) { /* With prejudice... */ m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; goto Done; @@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; - m->version = 0; p = m->op->start(m, &m->index); } m->op->stop(m, p); @@ -287,7 +263,6 @@ Done: *ppos += copied; m->read_pos += copied; } - file->f_version = m->version; mutex_unlock(&m->lock); return copied; Enomem: @@ -313,7 +288,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) loff_t retval = -EINVAL; mutex_lock(&m->lock); - m->version = file->f_version; switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -329,7 +303,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; } else { @@ -340,7 +313,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) file->f_pos = offset; } } - file->f_version = m->version; mutex_unlock(&m->lock); return retval; } diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 770c2bf3aa43..1672cf6f7614 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -21,7 +21,6 @@ struct seq_file { size_t pad_until; loff_t index; loff_t read_pos; - u64 version; struct mutex lock; const struct seq_operations *op; int poll_event; -- cgit v1.3 From 889b3c1245de48ed0cacf7aebb25c489d3e4a3e9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 6 Apr 2020 20:09:33 -0700 Subject: compiler: remove CONFIG_OPTIMIZE_INLINING entirely Commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") made this always-on option. We released v5.4 and v5.5 including that commit. Remove the CONFIG option and clean up the code now. Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Reviewed-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Arnd Bergmann Cc: Borislav Petkov Cc: David Miller Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200220110807.32534-2-masahiroy@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - include/linux/compiler_types.h | 11 +---------- kernel/configs/tiny.config | 1 - lib/Kconfig.debug | 12 ------------ 5 files changed, 1 insertion(+), 25 deletions(-) (limited to 'include') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index ab8b30cb978e..550904591e94 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -285,7 +285,6 @@ CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_OPTIMIZE_INLINING=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 2d196cb49084..614961009075 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -282,7 +282,6 @@ CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_OPTIMIZE_INLINING=y CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 72393a8c1a6c..e970f97a7fcb 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -129,22 +129,13 @@ struct ftrace_likely_data { #define __compiler_offsetof(a, b) __builtin_offsetof(a, b) /* - * Force always-inline if the user requests it so via the .config. * Prefer gnu_inline, so that extern inline functions do not emit an * externally visible function. This makes extern inline behave as per gnu89 * semantics rather than c99. This prevents multiple symbol definition errors * of extern inline functions at link time. * A lot of inline functions can cause havoc with function tracing. - * Do not use __always_inline here, since currently it expands to inline again - * (which would break users of __always_inline). */ -#if !defined(CONFIG_OPTIMIZE_INLINING) -#define inline inline __attribute__((__always_inline__)) __gnu_inline \ - __inline_maybe_unused notrace -#else -#define inline inline __gnu_inline \ - __inline_maybe_unused notrace -#endif +#define inline inline __gnu_inline __inline_maybe_unused notrace /* * gcc provides both __inline__ and __inline as alternate spellings of diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394..8a44b93da0f3 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -6,7 +6,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set -CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d1398cef3b18..7f9a89847b65 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -305,18 +305,6 @@ config HEADERS_INSTALL user-space program samples. It is also needed by some features such as uapi header sanity checks. -config OPTIMIZE_INLINING - def_bool y - help - This option determines if the kernel forces gcc to inline the functions - developers have marked 'inline'. Doing so takes away freedom from gcc to - do what it thinks is best, which is desirable for the gcc 3.x series of - compilers. 
The gcc 4.x series have a rewritten inlining algorithm and - enabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc 4.x and above to make the - decision will become the default in the future. Until then this option - is there to test gcc for this. - config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help -- cgit v1.3 From af9c5d2e3b355854ff0e4acfbfbfadcd5198a349 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 6 Apr 2020 20:09:37 -0700 Subject: compiler.h: fix error in BUILD_BUG_ON() reporting compiletime_assert() uses __LINE__ to create a unique function name. This means that if you have more than one BUILD_BUG_ON() in the same source line (which can happen if they appear e.g. in a macro), then the error message from the compiler might output the wrong condition. For this source file: #include <linux/build_bug.h> #define macro() \ BUILD_BUG_ON(1); \ BUILD_BUG_ON(0); void foo() { macro(); } gcc would output: ./include/linux/compiler.h:350:38: error: call to `__compiletime_assert_9' declared with attribute error: BUILD_BUG_ON failed: 0 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) However, it was not the BUILD_BUG_ON(0) that failed, so it should say 1 instead of 0. With this patch, we use __COUNTER__ instead of __LINE__, so each BUILD_BUG_ON() gets a different function name and the correct condition is printed: ./include/linux/compiler.h:350:38: error: call to `__compiletime_assert_0' declared with attribute error: BUILD_BUG_ON failed: 1 _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) Signed-off-by: Vegard Nossum Signed-off-by: Andrew Morton Reviewed-by: Masahiro Yamada Reviewed-by: Daniel Santos Cc: Rasmus Villemoes Cc: Ian Abbott Cc: Joe Perches Link: http://lkml.kernel.org/r/20200331112637.25047-1-vegard.nossum@oracle.com Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5e88e7e33abe..034b0a644efc 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -347,7 +347,7 @@ static inline void *offset_to_ptr(const int *off) * compiler has support to do so. */ #define compiletime_assert(condition, msg) \ - _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) + _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) #define compiletime_assert_atomic_type(t) \ compiletime_assert(__native_word(t), \ -- cgit v1.3 From f80ac98a641a03097cbc9fdfd4b6a41a8dd3b7ae Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 6 Apr 2020 20:09:43 -0700 Subject: bitops: always inline sign extension helpers With CONFIG_CC_OPTIMIZE_FOR_SIZE, objtool reports: drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: i915_gem_execbuffer2_ioctl()+0x5b7: call to gen8_canonical_addr() with UACCESS enabled This means i915_gem_execbuffer2_ioctl() is calling gen8_canonical_addr() from the user_access_begin/end critical region (i.e., with SMAP disabled). While it's probably harmless in this case, in general we like to avoid extra function calls in SMAP-disabled regions because they can open up inadvertent security holes. Fix the warning by changing the sign extension helpers to __always_inline. This convinces GCC to inline gen8_canonical_addr(). The sign extension functions are trivial anyway, so it makes sense to always inline them.
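For reference, a quick illustration of what these trivial helpers compute; sign_extend32(value, index) treats bit 'index' as the sign bit (values chosen for illustration only):

	sign_extend32(0x80, 7);  /* bit 7 set   -> (__s32)0xffffff80 == -128 */
	sign_extend32(0x7f, 7);  /* bit 7 clear -> 0x7f == 127 */
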
With my test optimize-for-size-based config, this actually shrinks the text size of i915_gem_execbuffer.o by 45 bytes -- with no change for vmlinux. Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Andrew Morton Cc: Peter Zijlstra Cc: Al Viro Cc: Chris Wilson Link: http://lkml.kernel.org/r/740179324b2b18b750b16295c48357f00b5fa9ed.1582982020.git.jpoimboe@redhat.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 47f54b459c26..9acf654f0b19 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -162,7 +162,7 @@ static inline __u8 ror8(__u8 word, unsigned int shift) * * This is safe to use for 16- and 8-bit types as well. */ -static inline __s32 sign_extend32(__u32 value, int index) +static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; @@ -173,7 +173,7 @@ static inline __s32 sign_extend32(__u32 value, int index) * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ -static inline __s64 sign_extend64(__u64 value, int index) +static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; -- cgit v1.3 From 505a0ef15f96c6c43ec719c9fc1833d98957bb39 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 6 Apr 2020 20:10:22 -0700 Subject: kasan: stackdepot: move filter_irq_stacks() to stackdepot.c filter_irq_stacks() can be used by other tools (e.g. KMSAN), so it needs to be moved to a common location. lib/stackdepot.c seems like a good place, as filter_irq_stacks() is usually applied to the output of stack_trace_save(). This patch has been previously mailed as part of the KMSAN RFC patch series.
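For context, the typical caller pattern looks roughly like this (modelled on KASAN's save_stack(), visible in the mm/kasan/common.c hunk below; the buffer size and GFP flags are illustrative):

	unsigned long entries[64];
	unsigned int nr_entries;
	depot_stack_handle_t handle;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	/* trim frames below the IRQ entry so unrelated interrupted contexts dedup */
	nr_entries = filter_irq_stacks(entries, nr_entries);
	handle = stack_depot_save(entries, nr_entries, GFP_ATOMIC);
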
[glider@google.com: nds32: linker script: add SOFTIRQENTRY_TEXT] Link: http://lkml.kernel.org/r/20200311121002.241430-1-glider@google.com [glider@google.com: add IRQENTRY_TEXT and SOFTIRQENTRY_TEXT to linker script] Link: http://lkml.kernel.org/r/20200311121124.243352-1-glider@google.com Signed-off-by: Alexander Potapenko Signed-off-by: Andrew Morton Cc: Vegard Nossum Cc: Dmitry Vyukov Cc: Marco Elver Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Arnd Bergmann Cc: Sergey Senozhatsky Link: http://lkml.kernel.org/r/20200220141916.55455-3-glider@google.com Signed-off-by: Linus Torvalds --- arch/ia64/kernel/vmlinux.lds.S | 2 ++ arch/nds32/kernel/vmlinux.lds.S | 1 + include/linux/stackdepot.h | 2 ++ lib/stackdepot.c | 24 ++++++++++++++++++++++++ mm/kasan/common.c | 23 ----------------------- 5 files changed, 29 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 1ec6b703c5b4..6b5652ee76f9 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -54,6 +54,8 @@ SECTIONS { CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.gnu.linkonce.t*) } diff --git a/arch/nds32/kernel/vmlinux.lds.S b/arch/nds32/kernel/vmlinux.lds.S index f679d3397436..7a6c1cefe3fe 100644 --- a/arch/nds32/kernel/vmlinux.lds.S +++ b/arch/nds32/kernel/vmlinux.lds.S @@ -47,6 +47,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) } diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 3efa97d482cb..24d49c732341 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -19,4 +19,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); + #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 1ec36ee344e0..2caffc64e4c8 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -20,6 +20,7 @@ */ #include +#include <linux/interrupt.h> #include #include #include @@ -316,3 +317,26 @@ fast_exit: return retval; } EXPORT_SYMBOL_GPL(stack_depot_save); + +static inline int in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +unsigned int filter_irq_stacks(unsigned long *entries, + unsigned int nr_entries) +{ + unsigned int i; + + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { + /* Include the irqentry function into the stack.
*/ + return i + 1; + } + } + return nr_entries; +} +EXPORT_SYMBOL_GPL(filter_irq_stacks); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index e61b4a492218..2906358e42f0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -42,28 +41,6 @@ #include "kasan.h" #include "../slab.h" -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -static inline unsigned int filter_irq_stacks(unsigned long *entries, - unsigned int nr_entries) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (in_irqentry_text(entries[i])) { - /* Include the irqentry function into the stack. */ - return i + 1; - } - } - return nr_entries; -} - static inline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; -- cgit v1.3 From 7e2345200262e4a6056580f0231cccdaffc825f3 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 6 Apr 2020 20:10:25 -0700 Subject: percpu_counter: fix a data race at vm_committed_as "vm_committed_as.count" could be accessed concurrently as reported by KCSAN, BUG: KCSAN: data-race in __vm_enough_memory / percpu_counter_add_batch write to 0xffffffff9451c538 of 8 bytes by task 65879 on cpu 35: percpu_counter_add_batch+0x83/0xd0 percpu_counter_add_batch at lib/percpu_counter.c:91 __vm_enough_memory+0xb9/0x260 dup_mm+0x3a4/0x8f0 copy_process+0x2458/0x3240 _do_fork+0xaa/0x9f0 __do_sys_clone+0x125/0x160 __x64_sys_clone+0x70/0x90 do_syscall_64+0x91/0xb05 entry_SYSCALL_64_after_hwframe+0x49/0xbe read to 0xffffffff9451c538 of 8 bytes by task 66773 on cpu 19: __vm_enough_memory+0x199/0x260 percpu_counter_read_positive at include/linux/percpu_counter.h:81 (inlined by) __vm_enough_memory at mm/util.c:839 mmap_region+0x1b2/0xa10 do_mmap+0x45c/0x700 vm_mmap_pgoff+0xc0/0x130 ksys_mmap_pgoff+0x6e/0x300 __x64_sys_mmap+0x33/0x40 do_syscall_64+0x91/0xb05 entry_SYSCALL_64_after_hwframe+0x49/0xbe The read is outside percpu_counter::lock critical section which results in a data race. Fix it by adding a READ_ONCE() in percpu_counter_read_positive() which could also service as the existing compiler memory barrier. Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Acked-by: Marco Elver Link: http://lkml.kernel.org/r/1582302724-2804-1-git-send-email-cai@lca.pw Signed-off-by: Linus Torvalds --- include/linux/percpu_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 4f052496cdfd..0a4f54dd4737 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -78,9 +78,9 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc) */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { - s64 ret = fbc->count; + /* Prevent reloads of fbc->count */ + s64 ret = READ_ONCE(fbc->count); - barrier(); /* Prevent reloads of fbc->count */ if (ret >= 0) return ret; return 0; -- cgit v1.3 From 295bcca84916cb5079140a89fccb472bb8d1f6e2 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 6 Apr 2020 20:10:38 -0700 Subject: linux/bits.h: add compile time sanity check of GENMASK inputs GENMASK() and GENMASK_ULL() are supposed to be called with the high bit as the first argument and the low bit as the second argument. 
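[Editor's note: to spell out the hazard -- an illustrative sketch, not kernel code. barrier() already stopped the compiler from reloading fbc->count between the check and the return; READ_ONCE() keeps that guarantee and additionally makes the lockless access a single, non-torn load that KCSAN treats as intentional:]

#include <linux/percpu_counter.h>

/*
 * Broken variant with neither barrier() nor READ_ONCE(), for
 * illustration only:
 *
 *	CPU 0				CPU 1
 *	ret = fbc->count;  (50)
 *	if (ret >= 0)      (true)
 *					fbc->count updated to -10
 *	return fbc->count; (-10!)	<- compiler reloaded after the check
 */
static inline s64 broken_read_positive(struct percpu_counter *fbc)
{
	s64 ret = fbc->count;	/* plain read: may tear or be reloaded */

	if (ret >= 0)
		return ret;
	return 0;
}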
From 295bcca84916cb5079140a89fccb472bb8d1f6e2 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 6 Apr 2020 20:10:38 -0700 Subject: linux/bits.h: add compile time sanity check of GENMASK inputs GENMASK() and GENMASK_ULL() are supposed to be called with the high bit as the first argument and the low bit as the second argument. Mixing them up will return a mask with zero bits set. Recent commits show getting this wrong is not uncommon, see e.g. commit aa4c0c9091b0 ("net: stmmac: Fix misuses of GENMASK macro") and commit 9bdd7bb3a844 ("clocksource/drivers/npcm: Fix misuse of GENMASK macro"). To prevent such mistakes from appearing again, add compile time sanity checking to the arguments of GENMASK() and GENMASK_ULL(). If both arguments are known at compile time and the low bit is higher than the high bit, break the build to flag the mistake immediately. Since GENMASK() is used in declarations, BUILD_BUG_ON_ZERO() must be used instead of BUILD_BUG_ON(). __builtin_constant_p() does not evaluate its argument, it only checks whether it is a compile-time constant, and __builtin_choose_expr() does not evaluate the expression that is not chosen. Therefore, GENMASK(x++, 0) evaluates x++ only once. Commit 95b980d62d52 ("linux/bits.h: make BIT(), GENMASK(), and friends available in assembly") made the macros in linux/bits.h available in assembly. Since BUILD_BUG_ON_ZERO() is not asm compatible, disable the checks if the file is included in an asm file. Due to bugs in GCC versions before 4.9 [0], also disable the check when building with an older GCC. [0]: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=19449 Signed-off-by: Rikard Falkeborn Signed-off-by: Andrew Morton Reviewed-by: Masahiro Yamada Reviewed-by: Kees Cook Cc: Borislav Petkov Cc: Geert Uytterhoeven Cc: Haren Myneni Cc: Joe Perches Cc: Johannes Berg Cc: lkml Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200308193954.2372399-1-rikard.falkeborn@gmail.com Signed-off-by: Linus Torvalds --- include/linux/bits.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bits.h b/include/linux/bits.h index a740bbcf3cd2..4671fbf28842 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -18,12 +18,30 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#define GENMASK(h, l) \ +#if !defined(__ASSEMBLY__) && \ + (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#include <linux/build_bug.h> +#define GENMASK_INPUT_CHECK(h, l) \ + (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ + __builtin_constant_p((l) > (h)), (l) > (h), 0))) +#else +/* + * BUILD_BUG_ON_ZERO is not available in h files included from asm files, + * disable the input check if that is the case. + */ +#define GENMASK_INPUT_CHECK(h, l) 0 +#endif + +#define __GENMASK(h, l) \ (((~UL(0)) - (UL(1) << (l)) + 1) & \ (~UL(0) >> (BITS_PER_LONG - 1 - (h)))) +#define GENMASK(h, l) \ + (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) -#define GENMASK_ULL(h, l) \ +#define __GENMASK_ULL(h, l) \ (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) +#define GENMASK_ULL(h, l) \ + (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) #endif /* __LINUX_BITS_H */ -- cgit v1.3
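[Editor's note: a quick illustration of the argument order the new check enforces; the mask names are hypothetical:]

#include <linux/bits.h>

#define CTRL_MODE_MASK	GENMASK(7, 4)		/* bits 7..4 -> 0x000000f0 */
#define DMA_ADDR_MASK	GENMASK_ULL(39, 21)	/* 0x000000ffffe00000, per the kernel-doc above */

/*
 * Swapped arguments now break the build when both are compile-time
 * constants:
 *
 *	#define BROKEN_MASK	GENMASK(4, 7)	// trips BUILD_BUG_ON_ZERO()
 *
 * With non-constant arguments GENMASK_INPUT_CHECK() evaluates to 0, so
 * the mask is computed exactly as before.
 */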