From fce86ff5802bac3a7b19db171aa1949ef9caac31 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 13 May 2019 17:15:33 -0700 Subject: mm/huge_memory: fix vmf_insert_pfn_{pmd, pud}() crash, handle unaligned addresses Starting with c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()") vmf_insert_pfn_pmd() internally calls pmdp_set_access_flags(). That helper enforces a pmd aligned @address argument via VM_BUG_ON() assertion. Update the implementation to take a 'struct vm_fault' argument directly and apply the address alignment fixup internally to fix crash signatures like: kernel BUG at arch/x86/mm/pgtable.c:515! invalid opcode: 0000 [#1] SMP NOPTI CPU: 51 PID: 43713 Comm: java Tainted: G OE 4.19.35 #1 [..] RIP: 0010:pmdp_set_access_flags+0x48/0x50 [..] Call Trace: vmf_insert_pfn_pmd+0x198/0x350 dax_iomap_fault+0xe82/0x1190 ext4_dax_huge_fault+0x103/0x1f0 ? __switch_to_asm+0x40/0x70 __handle_mm_fault+0x3f6/0x1370 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 handle_mm_fault+0xda/0x200 __do_page_fault+0x249/0x4f0 do_page_fault+0x32/0x110 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 Link: http://lkml.kernel.org/r/155741946350.372037.11148198430068238140.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()") Signed-off-by: Dan Williams Reported-by: Piotr Balcer Tested-by: Yan Ma Tested-by: Pankaj Gupta Reviewed-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Aneesh Kumar K.V Cc: Chandan Rajendra Cc: Souptick Joarder Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 381e872bfde0..7cd5c150c21d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -47,10 +47,8 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); -vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, bool write); -vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write); +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, -- cgit v1.2.3 From a16b53849913e742d086bb2b6f5e069ea2850c56 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 17:15:59 -0700 Subject: list: add function list_rotate_to_front() Patch series "mm: Use slab_list list_head instead of lru", v5. Currently the slab allocators (ab)use the struct page 'lru' list_head. We have a list head for slab allocators to use, 'slab_list'. During v2 it was noted by Christoph that the SLOB allocator was reaching into a list_head, this version adds 2 patches to the front of the set to fix that. Clean up all three allocators by using the 'slab_list' list_head instead of overloading the 'lru' list_head. This patch (of 7): Currently if we wish to rotate a list until a specific item is at the front of the list we can call list_move_tail(head, list). Note that the arguments are the reverse way to the usual use of list_move_tail(list, head). 
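For illustration only (this block is an editorial example, not part of the patch series): a minimal user-space model of the circular list_head scheme, showing why passing the list head as the first argument to list_move_tail() rotates the list so that the chosen entry becomes the new front. All names and values below are invented for the example.

	#include <stddef.h>
	#include <stdio.h>

	/* Minimal stand-in for the kernel's circular, doubly linked list_head. */
	struct list_head { struct list_head *prev, *next; };

	static void list_init(struct list_head *h) { h->prev = h->next = h; }

	static void list_del_entry(struct list_head *e)
	{
		e->prev->next = e->next;
		e->next->prev = e->prev;
	}

	/* Insert @entry just before @head, i.e. at the tail of the list. */
	static void list_add_tail(struct list_head *entry, struct list_head *head)
	{
		entry->prev = head->prev;
		entry->next = head;
		head->prev->next = entry;
		head->prev = entry;
	}

	/* The idiom the changelog describes: move @list (here, the list head
	 * itself) so that it sits just before @head (here, the target entry);
	 * iterating from the head then starts at the target entry. */
	static void list_move_tail(struct list_head *list, struct list_head *head)
	{
		list_del_entry(list);
		list_add_tail(list, head);
	}

	struct item { int val; struct list_head node; };

	int main(void)
	{
		struct list_head head;
		struct list_head *pos;
		struct item items[4];
		int i;

		list_init(&head);
		for (i = 0; i < 4; i++) {
			items[i].val = i;
			list_add_tail(&items[i].node, &head);
		}

		/* Rotate so that items[2] becomes the new front of the list. */
		list_move_tail(&head, &items[2].node);

		for (pos = head.next; pos != &head; pos = pos->next) {
			struct item *it = (struct item *)((char *)pos -
						offsetof(struct item, node));
			printf("%d ", it->val);	/* prints: 2 3 0 1 */
		}
		printf("\n");
		return 0;
	}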
Rotating a list this way is a hack: it depends on the developer knowing how
the list_head operates internally, which violates the layer of abstraction
the list_head offers.  It is also not intuitive, so the next developer to
come along must study list.h to understand what the call means; while that
may be 'good for' the developer, it makes the code harder to read.  If there
are in-tree users, we should have an appropriately named function that does
this.

By grep'ing the tree for list_move_tail() and list_tail() and attempting to
guess the argument order from the names, it seems there is currently only
one place in the tree that does this - the slob allocator.

Add function list_rotate_to_front() to rotate a list until the specified
item is at the front of the list.

Link: http://lkml.kernel.org/r/20190402230545.2929-2-tobin@kernel.org
Signed-off-by: Tobin C. Harding
Reviewed-by: Christoph Lameter
Reviewed-by: Roman Gushchin
Acked-by: Vlastimil Babka
Cc: Pekka Enberg
Cc: David Rientjes
Cc: Joonsoo Kim
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/list.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 58aa3adf94e6..9e9a6403dbe4 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -270,6 +270,24 @@ static inline void list_rotate_left(struct list_head *head)
 	}
 }
 
+/**
+ * list_rotate_to_front() - Rotate list to specific item.
+ * @list: The desired new front of the list.
+ * @head: The head of the list.
+ *
+ * Rotates list so that @list becomes the new front of the list.
+ */
+static inline void list_rotate_to_front(struct list_head *list,
+					struct list_head *head)
+{
+	/*
+	 * Deletes the list head from the list denoted by @head and
+	 * places it as the tail of @list, this effectively rotates the
+	 * list so that @list is at the front.
+	 */
+	list_move_tail(head, list);
+}
+
 /**
  * list_is_singular - tests whether a list has just one entry.
  * @head: the list to test.
--
cgit v1.2.3


From 3e05617ceaa42838084daee209f9c4965bf03379 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding"
Date: Mon, 13 May 2019 17:16:19 -0700
Subject: mm: remove stale comment from page struct

We now use the slab_list list_head instead of the lru list_head.  This
comment has become stale.

Remove stale comment from page struct slab_list list_head.

Link: http://lkml.kernel.org/r/20190402230545.2929-8-tobin@kernel.org
Signed-off-by: Tobin C. Harding
Acked-by: Christoph Lameter
Reviewed-by: Roman Gushchin
Acked-by: Vlastimil Babka
Cc: David Rientjes
Cc: Joonsoo Kim
Cc: Matthew Wilcox
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4ef4bbe78a1d..e1f42a07d8f0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -103,7 +103,7 @@ struct page {
 		};
 		struct {	/* slab, slob and slub */
 			union {
-				struct list_head slab_list;	/* uses lru */
+				struct list_head slab_list;
 				struct {	/* Partial pages */
 					struct page *next;
 #ifdef CONFIG_64BIT
--
cgit v1.2.3


From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Mon, 13 May 2019 17:16:41 -0700
Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd

Userfaultfd can be misused to make it easier to exploit existing
use-after-free (and similar) bugs that might otherwise only make a short
window or race condition available.  By using userfaultfd to stall a kernel
thread, a malicious program can keep some state that it wrote stable for an
extended period, which it can then access using an existing exploit.  While
it doesn't cause the exploit itself, and while it's not the only thing that
can stall a kernel thread when accessing a memory location, it's one of the
few that never needs privilege.

We can add a flag, allowing userfaultfd to be restricted, so that in general
it won't be usable by arbitrary user programs, but in environments that
require userfaultfd it can be turned back on.

Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether
userfaultfd is allowed by unprivileged users.  When this is set to zero,
only privileged users (root user, or users with the CAP_SYS_PTRACE
capability) will be able to use the userfaultfd syscalls.

Andrea said:

: The only difference between the bpf sysctl and the userfaultfd sysctl
: this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability
: requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement,
: because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE
: already if it's doing other kind of tracking on processes runtime, in
: addition of userfaultfd.  In other words both syscalls works only for
: root, when the two sysctl are opt-in set to 1.

[dgilbert@redhat.com: changelog additions]
[akpm@linux-foundation.org: documentation tweak, per Mike]
Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com
Signed-off-by: Peter Xu
Suggested-by: Andrea Arcangeli
Suggested-by: Mike Rapoport
Reviewed-by: Mike Rapoport
Reviewed-by: Andrea Arcangeli
Cc: Paolo Bonzini
Cc: Hugh Dickins
Cc: Luis Chamberlain
Cc: Maxime Coquelin
Cc: Maya Gokhale
Cc: Jerome Glisse
Cc: Pavel Emelyanov
Cc: Johannes Weiner
Cc: Martin Cracauer
Cc: Denis Plotnikov
Cc: Marty McFadden
Cc: Mike Kravetz
Cc: Kees Cook
Cc: Mel Gorman
Cc: "Kirill A. Shutemov"
Cc: "Dr.
David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ fs/userfaultfd.c | 5 +++++ include/linux/userfaultfd_k.h | 2 ++ kernel/sysctl.c | 12 ++++++++++++ 4 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3f13d8599337..749322060f10 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - stat_refresh - numa_stat - swappiness +- unprivileged_userfaultfd - user_reserve_kbytes - vfs_cache_pressure - watermark_boost_factor @@ -818,6 +819,17 @@ The default value is 60. ============================================================== +unprivileged_userfaultfd + +This flag controls whether unprivileged users can use the userfaultfd +system calls. Set this to 1 to allow unprivileged users to use the +userfaultfd system calls, or set this to 0 to restrict userfaultfd to only +privileged users (with SYS_CAP_PTRACE capability). + +The default value is 1. + +============================================================== + - user_reserve_kbytes When overcommit_memory is set to 2, "never overcommit" mode, reserve diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f5de1e726356..3b30301c90ec 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,8 @@ #include #include +int sysctl_unprivileged_userfaultfd __read_mostly = 1; + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; enum userfaultfd_state { @@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) struct userfaultfd_ctx *ctx; int fd; + if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) + return -EPERM; + BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 37c9eba75c98..ac9d71e24b81 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -28,6 +28,8 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +extern int sysctl_unprivileged_userfaultfd; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From 5fd4ca2d84b249f0858ce28cf637cf25b61a398f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 13 May 2019 17:16:44 -0700 Subject: mm: page cache: store only head pages in i_pages Transparent Huge Pages are currently stored in i_pages as pointers to consecutive subpages. This patch changes that to storing consecutive pointers to the head page in preparation for storing huge pages more efficiently in i_pages. 
Large parts of this are "inspired" by Kirill's patch https://lore.kernel.org/lkml/20170126115819.58875-2-kirill.shutemov@linux.intel.com/ [willy@infradead.org: fix swapcache pages] Link: http://lkml.kernel.org/r/20190324155441.GF10344@bombadil.infradead.org [kirill@shutemov.name: hugetlb stores pages in page cache differently] Link: http://lkml.kernel.org/r/20190404134553.vuvhgmghlkiw2hgl@kshutemo-mobl1 Link: http://lkml.kernel.org/r/20190307153051.18815-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jan Kara Reviewed-by: Kirill Shutemov Reviewed-and-tested-by: Song Liu Tested-by: William Kucharski Reviewed-by: William Kucharski Tested-by: Qian Cai Cc: Hugh Dickins Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 13 ++++ mm/filemap.c | 159 +++++++++++++++++++----------------------------- mm/huge_memory.c | 3 + mm/khugepaged.c | 4 +- mm/memfd.c | 2 + mm/migrate.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 +- 8 files changed, 86 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf909d0de5f..2e8438a1216a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,19 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + unsigned long mask; + + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + mask = (1UL << compound_order(page)) - 1; + return page + (offset & mask); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/mm/filemap.c b/mm/filemap.c index d78f577baef2..4157f858a9c6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -279,11 +279,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -292,40 +292,44 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. 
- */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; + } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. + */ + if (page->index + (1UL << compound_order(page)) - 1 == + xas.xa_index) i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; - } xas_store(&xas, NULL); total_pages++; } @@ -1491,7 +1495,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1506,25 +1510,19 @@ repeat: if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1706,7 +1704,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1717,17 +1714,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1736,7 +1729,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1778,33 +1771,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? 
*/ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1849,7 +1836,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1859,24 +1845,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1912,7 +1893,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1923,26 +1903,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1991,7 +1966,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, rcu_read_lock(); xas_for_each_marked(&xas, page, ULONG_MAX, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -2002,17 +1976,13 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -2021,7 +1991,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -2691,7 +2661,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2700,24 +2670,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. 
*/ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c314a362c167..50c665b12cf1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2496,6 +2496,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); } } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 449044378782..7ba7a1e4fa79 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1374,7 +1374,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); nr_none++; continue; } @@ -1450,7 +1450,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/migrate.c b/mm/migrate.c index 663a5449367a..a1770403ff7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } diff --git a/mm/shmem.c b/mm/shmem.c index f4dce9c8670d..1bb3b8dc8bb2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; diff --git a/mm/swap_state.c b/mm/swap_state.c index 85245fdec8d9..eb714165afd2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } -- cgit 
v1.2.3 From 886cf1901db962cee5f8b82b9b260079a5e8a4eb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:16:51 -0700 Subject: mm: move recent_rotated pages calculation to shrink_inactive_list() Patch series "mm: Generalize putback functions"] putback_inactive_pages() and move_active_pages_to_lru() are almost similar, so this patchset merges them ina single function. This patch (of 4): The patch moves the calculation from putback_inactive_pages() to shrink_inactive_list(). This makes putback_inactive_pages() looking more similar to move_active_pages_to_lru(). To do that, we account activated pages in reclaim_stat::nr_activate. Since a page may change its LRU type from anon to file cache inside shrink_page_list() (see ClearPageSwapBacked()), we have to account pages for the both types. So, nr_activate becomes an array. Previously we used nr_activate to account PGACTIVATE events, but now we account them into pgactivate variable (since they are about number of pages in general, not about sum of hpage_nr_pages). Link: http://lkml.kernel.org/r/155290127956.31489.3393586616054413298.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Daniel Jordan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 7 ++++--- include/linux/vmstat.h | 2 +- include/trace/events/vmscan.h | 13 ++++++++----- mm/vmscan.c | 15 +++++++-------- 4 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 66bfd8396877..995da15b16ca 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", - "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_congested", "nr_immediate", "nr_activate_anon", + "nr_activate_file", "nr_ref_keep", "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", @@ 
-407,7 +408,7 @@ EVENT_PROCESS: } my $nr_reclaimed = $3; - my $flags = $12; + my $flags = $13; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 2db8d60981fe..bdeda4b079fe 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,7 +26,7 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; - unsigned nr_activate; + unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; }; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index e8709ab22d68..cb2add69301a 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -359,7 +359,8 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) - __field(unsigned long, nr_activate) + __field(unsigned int, nr_activate0) + __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) __field(int, priority) @@ -374,20 +375,22 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; - __entry->nr_activate = stat->nr_activate; + __entry->nr_activate0 = stat->nr_activate[0]; + __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, - __entry->nr_activate, __entry->nr_ref_keep, - __entry->nr_unmap_fail, __entry->priority, + __entry->nr_activate0, __entry->nr_activate1, + __entry->nr_ref_keep, __entry->nr_unmap_fail, + __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index fd9de504e516..e6913e68db2e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, LIST_HEAD(ret_pages); LIST_HEAD(free_pages); unsigned nr_reclaimed = 0; + unsigned pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1466,8 +1467,10 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { + int type = page_is_file_cache(page); SetPageActive(page); - stat->nr_activate++; + pgactivate++; + stat->nr_activate[type] += hpage_nr_pages(page); count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1482,7 +1485,7 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, stat->nr_activate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -1807,7 +1810,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, static noinline_for_stack void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { - struct zone_reclaim_stat *reclaim_stat = 
&lruvec->reclaim_stat; struct pglist_data *pgdat = lruvec_pgdat(lruvec); LIST_HEAD(pages_to_free); @@ -1833,11 +1835,6 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) lru = page_lru(page); add_page_to_lru_list(page, lruvec, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - int numpages = hpage_nr_pages(page); - reclaim_stat->recent_rotated[file] += numpages; - } if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); @@ -1945,6 +1942,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, nr_reclaimed); } + reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; + reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; putback_inactive_pages(lruvec, &page_list); -- cgit v1.2.3 From 9851ac13592df77958ae7bac6ba39e71420c38ec Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:16:54 -0700 Subject: mm: move nr_deactivate accounting to shrink_active_list() We know which LRU is not active. [chris@chrisdown.name: fix build on !CONFIG_MEMCG] Link: http://lkml.kernel.org/r/20190322150513.GA22021@chrisdown.name Link: http://lkml.kernel.org/r/155290128498.31489.18250485448913338607.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Signed-off-by: Chris Down Reviewed-by: Daniel Jordan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/vmscan.c | 10 ++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dbb6118370c1..b238403f95b2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1117,6 +1117,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, { } +static inline void __count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + static inline void count_memcg_page_event(struct page *page, int idx) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e6913e68db2e..5002cc43e32f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2040,12 +2040,6 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) { - __count_vm_events(PGDEACTIVATE, nr_moved); - count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, - nr_moved); - } - return nr_moved; } @@ -2137,6 +2131,10 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + + __count_vm_events(PGDEACTIVATE, nr_deactivate); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From 932f4a630a695212bdc7379b05f9bd0dafc5d968 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:03 -0700 Subject: mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM Pach series "Add FOLL_LONGTERM to GUP fast and use it". HFI1, qib, and mthca, use get_user_pages_fast() due to its performance advantages. These pages can be held for a significant time. But get_user_pages_fast() does not protect against mapping FS DAX pages. Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which retains the performance while also adding the FS DAX checks. 
XDP has also shown interest in using this functionality.[1]

In addition we change get_user_pages() to use the new FOLL_LONGTERM flag and
remove the specialized get_user_pages_longterm call.

[1] https://lkml.org/lkml/2019/3/19/939

"longterm" is a relative thing and at this point is probably a misnomer.
This is really flagging a pin which is going to be given to hardware and
can't move.  I've thought of a couple of alternative names but I think we
have to settle on if we are going to use FL_LAYOUT or something else to
solve the "longterm" problem.  Then I think we can change the flag to a
better name.

Secondly, it depends on how often you are registering memory.  I have spoken
with some RDMA users who consider MR in the performance path...  For the
overall application performance.  I don't have the numbers as the tests for
HFI1 were done a long time ago.  But there was a significant advantage.
Some of which is probably due to the fact that you don't have to hold
mmap_sem.

Finally, architecturally I think it would be good for everyone to use
*_fast.  There are patches submitted to the RDMA list which would allow the
use of *_fast (they rework the use of mmap_sem) and as soon as they are
accepted I'll submit a patch to convert the RDMA core as well.  Also to this
point others are looking to use *_fast.

As an aside, Jason pointed out in my previous submission that *_fast and
*_unlocked look very much the same.  I agree and I think further cleanup
will be coming.  But I'm focused on getting the final solution for DAX at
the moment.

This patch (of 7):

This patch starts a series which aims to support FOLL_LONGTERM in
get_user_pages_fast().  Some callers would like to do a longterm (user
controlled pin) of pages with the fast variant of GUP for performance
purposes.

Rather than have a separate get_user_pages_longterm() call, introduce
FOLL_LONGTERM and change the longterm callers to use it.

This patch does not change any functionality.  In the short term, "longterm"
or user controlled pins are unsafe for filesystems, and FS DAX in particular
has been blocked.  However, callers of get_user_pages_fast() were not
"protected".

FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it
requires vmas to determine if DAX is in use.

NOTE: In merging with the CMA changes we opt to change the get_user_pages()
call in check_and_migrate_cma_pages() to a call of __get_user_pages_locked()
on the newly migrated pages.  This makes the code read better in that we are
calling __get_user_pages_locked() on the pages before and after a potential
migration.

As a side effect some of the interfaces are cleaned up but this is not the
primary purpose of the series.

In review[1] it was asked:

> This I don't get - if you do lock down long term mappings performance
> of the actual get_user_pages call shouldn't matter to start with.
>
> What do I miss?

A couple of points.  First, "longterm" is a relative thing and at this point
is probably a misnomer.  This is really flagging a pin which is going to be
given to hardware and can't move.  I've thought of a couple of alternative
names but I think we have to settle on if we are going to use FL_LAYOUT or
something else to solve the "longterm" problem.  Then I think we can change
the flag to a better name.

Second, it depends on how often you are registering memory.  I have spoken
with some RDMA users who consider MR in the performance path...  For the
overall application performance.  I don't have the numbers as the tests for
HFI1 were done a long time ago.
But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem. Finally, architecturally I think it would be good for everyone to use *_fast. There are patches submitted to the RDMA list which would allow the use of *_fast (they reworking the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast. As an asside, Jasons pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment. [1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965 [ira.weiny@intel.com: v3] Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Michal Hocko Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Jason Gunthorpe Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rich Felker Cc: Yoshinori Sato Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Ralf Baechle Cc: James Hogan Cc: Dan Williams Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/iommu_api.c | 5 +- drivers/infiniband/core/umem.c | 5 +- drivers/infiniband/hw/qib/qib_user_pages.c | 8 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 9 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 6 +- drivers/vfio/vfio_iommu_type1.c | 3 +- fs/io_uring.c | 5 +- include/linux/mm.h | 41 +++++-- mm/gup.c | 190 ++++++++++++++++++----------- mm/gup_benchmark.c | 5 +- net/xdp/xdp_umem.c | 4 +- 11 files changed, 173 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 8330f135294f..5c521f3924a5 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -141,8 +141,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, for (entry = 0; entry < entries; entry += chunk) { unsigned long n = min(entries - entry, chunk); - ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, - FOLL_WRITE, mem->hpages + entry, NULL); + ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE | FOLL_LONGTERM, + mem->hpages + entry, NULL); if (ret == n) { pinned += n; continue; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 0a23048db523..e7ea819fcb11 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, while (npages) { down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, NULL); + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 123ca8f64f75..f712fb7fa82f 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ 
b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_read(¤t->mm->mmap_sem); for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, - num_pages - got, - FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + p + got, NULL); if (ret < 0) { up_read(¤t->mm->mmap_sem); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index da35d6fdfc5e..e312f522a66d 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 08929c087e27..870a2a526e0b 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + flags | FOLL_LONGTERM, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); return err < 0 ? 
err : -EINVAL; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3be1db3501cc..3ddc375e7063 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -358,7 +358,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); if (mm == current->mm) { - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, + vmas); } else { ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, vmas, NULL); diff --git a/fs/io_uring.c b/fs/io_uring.c index 48ea3977012a..fdc18321d70c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2697,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 083d7b4863ed..8bc677ce8f01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,19 +1505,6 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ - int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); @@ -2583,6 +2570,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ + +/* + * NOTE on FOLL_LONGTERM: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is contrasted with + * iov_iter_get_pages() where usages which are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. 
This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY + * + * In the CMA case: longterm pins in a CMA region would unnecessarily fragment + * that region. And so CMA attempts to migrate the page before pinning when + * FOLL_LONGTERM is specified. + */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { diff --git a/mm/gup.c b/mm/gup.c index 91819b8ad9cc..25381102e21e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1018,6 +1018,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); @@ -1046,6 +1055,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); @@ -1116,32 +1134,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. 
- */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1160,12 +1168,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1219,10 +1221,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1278,10 +1283,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1291,66 +1300,101 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. - * - * "longterm" == userspace controlled elevated page count lifetime. - * Contrast this to iov_iter_get_pages() usages which are transient. + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. 
*/ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. 
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6c0279e70cc4..7dd602d7f8db 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 989e52386c35..2b18223e7eb8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) { -- cgit v1.2.3 From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 11 ++++++----- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/e500_mmu.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/sh/mm/gup.c | 11 ++++++----- arch/sparc/mm/gup.c | 9 +++++---- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 2 +- drivers/fpga/dfl-afu-dma-region.c | 2 +- drivers/gpu/drm/via/via_dmablit.c | 3 ++- drivers/infiniband/hw/hfi1/user_pages.c | 3 ++- drivers/misc/genwqe/card_utils.c | 2 +- drivers/misc/vmw_vmci/vmci_host.c | 2 +- drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 ++++-- drivers/platform/goldfish/goldfish_pipe.c | 3 ++- drivers/rapidio/devices/rio_mport_cdev.c | 4 +++- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/st.c | 3 ++- drivers/staging/gasket/gasket_page_table.c | 4 ++-- drivers/tee/tee_shm.c | 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++- drivers/vhost/vhost.c | 2 +- drivers/video/fbdev/pvr2fb.c | 2 +- drivers/virt/fsl_hypervisor.c | 2 +- drivers/xen/gntdev.c | 2 +- fs/orangefs/orangefs-bufmap.c | 2 +- include/linux/mm.h | 4 ++-- kernel/futex.c | 2 +- lib/iov_iter.c | 7 +++++-- mm/gup.c | 10 +++++----- mm/util.c | 8 ++++---- net/ceph/pagevec.c | 2 +- net/rds/info.c | 2 +- net/rds/rdma.c | 3 ++- 34 files changed, 73 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 0d14e0d8eacf..4c2b4483683c 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -289,7 +290,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, write ? 
FOLL_WRITE : 0); + pages, gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index be7bc070eae5..ab3d484c5e2e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* If writing != 0, then the HPTE must allow writing, if we get here */ write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, writing, pages); + npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto err; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); if (npages < 1) goto err; page = pages[0]; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 24296f4cadc6..e0af53fd78c5 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, if (!pages) return -ENOMEM; - ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages); if (ret < 0) goto free_pages; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 37503ae62486..1fd706f6206c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2376,7 +2376,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) ret = -EFAULT; goto out; } - ret = get_user_pages_fast(map->addr, 1, 1, &map->page); + ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); if (ret < 0) goto out; BUG_ON(ret != 1); diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 3e27f6d1f1ec..277c882f7489 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -261,7 +262,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? 
FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index aee6dba83d0e..1e770a517d4a 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); @@ -324,7 +325,7 @@ slow: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bdca39829bc..08715034e315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t *table; struct page *page; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 406b558abfef..6b92eaf4a3b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index e18a786fc943..c438722bf4e1 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, goto unlock_vm; } - pinned = get_user_pages_fast(region->user_addr, npages, 1, + pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE, region->pages); if (pinned < 0) { ret = pinned; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 8bf3a7c23ed3..062067438f1d 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) if (NULL == vsg->pages) return -ENOMEM; ret = get_user_pages_fast((unsigned long)xfer->mem_addr, - vsg->num_pages, vsg->direction == DMA_FROM_DEVICE, + vsg->num_pages, + vsg->direction == DMA_FROM_DEVICE ? 
FOLL_WRITE : 0, vsg->pages); if (ret != vsg->num_pages) { if (ret < 0) diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 24b592c6522e..78ccacaf97d0 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -105,7 +105,8 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np { int ret; - ret = get_user_pages_fast(vaddr, npages, writable, pages); + ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0, + pages); if (ret < 0) return ret; diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 25265fd0fd6e..89cff9d1012b 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr, /* pin user pages in memory */ rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */ m->nr_pages, - m->write, /* readable/writable */ + m->write ? FOLL_WRITE : 0, /* readable/writable */ m->page_list); /* ptrs to pages */ if (rc < 0) goto fail_get_user_pages; diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index 997f92543dd4..422d08da3244 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, /* * Lock physical page backing a given user VA. */ - retval = get_user_pages_fast(uva, 1, 1, &context->notify_page); + retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index f5f1aac9d163..1174735f003d 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva, int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, - produce_q->kernel_if->num_pages, 1, + produce_q->kernel_if->num_pages, + FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", @@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva, } retval = get_user_pages_fast((uintptr_t) consume_uva, - consume_q->kernel_if->num_pages, 1, + consume_q->kernel_if->num_pages, + FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 321bc673c417..cef0133aa47a 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page, *iter_last_page_size = last_page_size; } - ret = get_user_pages_fast(first_page, requested_pages, !is_write, + ret = get_user_pages_fast(first_page, requested_pages, + !is_write ? 
FOLL_WRITE : 0, pages); if (ret <= 0) return -EFAULT; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 1e1f42e210a0..4a4a75fa26d5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, pinned = get_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, page_list); + nr_pages, + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, + page_list); if (pinned != nr_pages) { if (pinned < 0) { diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index acd9ba40eabe..8090dc9a1514 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p) dax_dbg("uva %p", va); - ret = get_user_pages_fast((unsigned long)va, 1, 1, p); + ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); if (ret == 1) { dax_dbg("locked page %p, for VA %p", *p, va); return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 19c022e66d63..3c6a18ad9a87 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, /* Try to fault in all of the necessary pages */ /* rw==READ means read from drive, write into memory area */ - res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages); + res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0, + pages); /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c index 600928f63577..d35c4fb19e28 100644 --- a/drivers/staging/gasket/gasket_page_table.c +++ b/drivers/staging/gasket/gasket_page_table.c @@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl, ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr + off + i * PAGE_SIZE; } else { - ret = get_user_pages_fast(page_addr - offset, 1, 1, - &page); + ret = get_user_pages_fast(page_addr - offset, 1, + FOLL_WRITE, &page); if (ret <= 0) { dev_err(pg_tbl->device, diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 0b9ab1d0dd45..49fd7312e2aa 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - rc = get_user_pages_fast(start, num_pages, 1, shm->pages); + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); if (rc > 0) shm->num_pages = rc; if (rc != num_pages) { diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6b64e45a5269..40ddc0c5f677 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) enum dma_data_direction direction = iommu_tce_direction(tce); if (get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page) != 1) + direction != DMA_TO_DEVICE ? 
FOLL_WRITE : 0, + &page) != 1) return -EFAULT; *hpa = __pa((unsigned long) page_address(page)); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 351af88231ad..1e3ed41ae1f3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, 1, &page); + r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index dfed532ed606..4e4d6a0df978 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages); + ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages); if (ret < nr_pages) { nr_pages = ret; ret = -EINVAL; diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 8ba726e600e9..6446bcab4185 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, - num_pages, param.source != -1, pages); + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7cf9c51318aa..02bc815982d4 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable, &page); + ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); if (ret < 0) return ret; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index d4811f981608..2bb916d68576 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, /* map the pages */ ret = get_user_pages_fast((unsigned long)user_desc->ptr, - bufmap->page_count, 1, bufmap->page_array); + bufmap->page_count, FOLL_WRITE, bufmap->page_array); if (ret < 0) return ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bc677ce8f01..c3c73b3c9adc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,8 +1505,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* Container for pinned pfns / pages */ struct frame_vector { diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. 
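The surrounding hunks are mechanical conversions; as a condensed sketch of what a typical call site looks like before and after this change (the helper below is hypothetical, not from this patch):

        /* Hypothetical call site illustrating the write-bool to gup_flags change. */
        static int example_pin_for_write(unsigned long uaddr, int nr_pages,
                                         struct page **pages)
        {
                /* Before: the third argument was a write boolean. */
                /* return get_user_pages_fast(uaddr, nr_pages, 1, pages); */

                /* After: the third argument is a gup_flags bitmask. */
                return get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
        }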
diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b396d328a764..f74fa832f3aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, + pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); if (unlikely(res < 0)) { kvfree(p); return res; diff --git a/mm/gup.c b/mm/gup.c index 113c18a98cf5..3dde6a8da670 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2062,7 +2062,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2074,8 +2074,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2093,7 +2093,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2104,7 +2104,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pages += nr; ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/util.c b/mm/util.c index 43a2984bccaa..05a464929b3e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + int nr_pages, unsigned int gup_flags, + struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? 
FOLL_WRITE : 0); + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d3736f5bffec..74cafc0142ea 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, pages + got); + num_pages - got, write_page ? FOLL_WRITE : 0, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/rds/info.c b/net/rds/info.c index e367a97a18c8..03f6fd56d237 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, 1, pages); + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 182ab8430594..b340ed4fc43a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, + pages); if (ret >= 0 && ret < nr_pages) { while (ret--) -- cgit v1.2.3 From e0ee0e71078abbcadd4cbc38fb8570551fccc103 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:17:57 -0700 Subject: mm: memcontrol: track LRU counts in the vmstats array Patch series "mm: memcontrol: clean up the LRU counts tracking". The memcg LRU stats usage is currently a bit messy. Memcg has private per-zone counters because reclaim needs zone granularity sometimes, but we also have plenty of users that need to awkwardly sum them up to node or memcg granularity. Meanwhile the canonical per-memcg vmstats do not track the LRU counts (NR_INACTIVE_ANON etc.) as you'd expect. This series enables LRU count tracking in the per-memcg vmstats array such that lruvec_page_state() and memcg_page_state() work on the enum node_stat_item items for the LRU counters. Then it converts all the callers that don't specifically need per-zone numbers over to that. This patch (of 6): The memcg code currently maintains private per-zone breakdowns of the LRU counters. This is necessary for reclaim decisions which are still zone-based, but there are a variety of users of these counters that only want the aggregate per-lruvec or per-memcg LRU counts, and they need to painfully sum up the zone counters on each request for that. These would be better served using the memcg vmstats arrays, which track VM statistics at the desired scope already. They just don't have the LRU counts right now. So to kick off the conversion, begin tracking LRU counts in those. 
Link: http://lkml.kernel.org/r/20190228163020.24100-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Tejun Heo Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 04ec454d44ce..6f2fef7b0784 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); - __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } -- cgit v1.2.3 From 1a61ab8038e724a6d8aa59e7d4931a119483294d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:00 -0700 Subject: mm: memcontrol: replace zone summing with lruvec_page_state() Instead of adding up the zone counters, use lruvec_page_state() to get the node state directly. This is a bit cheaper and more stream-lined. Link: http://lkml.kernel.org/r/20190228163020.24100-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 ------------------ mm/memcontrol.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 2 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b238403f95b2..65f381b27a2d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -504,19 +504,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); -static inline -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - struct mem_cgroup_per_node *mz; - unsigned long nr_pages = 0; - int zid; - - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for (zid = 0; zid < MAX_NR_ZONES; zid++) - nr_pages += mz->lru_zone_size[zid][lru]; - return nr_pages; -} - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -960,11 +947,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - return 0; -} static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81a0d3914ec9..f30381481c45 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -737,7 +737,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += mem_cgroup_get_lru_size(lruvec, lru); + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); } return nr; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 39912c6b7181..e869f9e25a3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid; if (!mem_cgroup_disabled()) - lru_size = mem_cgroup_get_lru_size(lruvec, lru); + lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru); else lru_size = 
node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); -- cgit v1.2.3 From 2b487e59f00aaa885ebf9c47d44d09f3ef4df80e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:05 -0700 Subject: mm: memcontrol: push down mem_cgroup_node_nr_lru_pages() mem_cgroup_node_nr_lru_pages() is just a convenience wrapper around lruvec_page_state() that takes bitmasks of lru indexes and aggregates the counts for those. Replace callsites where the bitmask is simple enough with direct lruvec_page_state() calls. This removes the last extern user of mem_cgroup_node_nr_lru_pages(), so make that function private again, too. Link: http://lkml.kernel.org/r/20190228163020.24100-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ---------- mm/memcontrol.c | 10 +++++++--- mm/workingset.c | 5 +++-- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 65f381b27a2d..30561a954ee0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -501,9 +501,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask); - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -954,13 +951,6 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return 0; } -static inline unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - return 0; -} - static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4000ea4ea0ec..268a0bd83773 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,7 +725,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); @@ -1425,11 +1425,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + + if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || + lruvec_page_state(lruvec, NR_ACTIVE_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) + if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || + lruvec_page_state(lruvec, NR_ACTIVE_ANON)) return true; return false; diff --git a/mm/workingset.c b/mm/workingset.c index 0bedf67502d5..6419baebd306 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -426,10 +426,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; + int i; - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL); lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), 
sc->memcg); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); } else -- cgit v1.2.3 From 113b7dfd827175977ea71cc4a29c1ac24acb9fce Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:11 -0700 Subject: mm: memcontrol: quarantine the mem_cgroup_[node_]nr_lru_pages() API Only memcg_numa_stat_show() uses those wrappers and the lru bitmasks, group them together. Link: http://lkml.kernel.org/r/20190228163020.24100-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 ---- mm/memcontrol.c | 67 +++++++++++++++++++++++++++----------------------- 2 files changed, 36 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fba7741533be..5a4aedc160bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -247,11 +247,6 @@ struct lruvec { #endif }; -/* Mask used at gathering information at once (see memcontrol.c) */ -#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) -#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) -#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) - /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d29417b93a8b..287933005e11 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,37 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - unsigned long nr = 0; - enum lru_list lru; - - VM_BUG_ON((unsigned)nid >= nr_node_ids); - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); - } - return nr; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) -{ - unsigned long nr = 0; - enum lru_list lru; - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += memcg_page_state(memcg, NR_LRU_BASE + lru); - } - return nr; -} - static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, enum mem_cgroup_events_target target) { @@ -3338,6 +3307,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA + +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + } + return nr; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, + unsigned int lru_mask) +{ + unsigned long nr = 0; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + 
continue; + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + } + return nr; +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { -- cgit v1.2.3 From 8df995f6bde01de96ce93373785f41c3bd13ad1c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:19:00 -0700 Subject: mm: simplify MEMORY_ISOLATION && COMPACTION || CMA into CONTIG_ALLOC This condition allows to define alloc_contig_range, so simplify it into a more accurate naming. Link: http://lkml.kernel.org/r/20190327063626.18421-4-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Mike Kravetz Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/mm/hugetlbpage.c | 2 +- include/linux/gfp.h | 2 +- mm/Kconfig | 3 +++ mm/page_alloc.c | 3 +-- 10 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e24dc16453aa..7f7fbd8bd9d5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,7 +19,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d0e172d47574..3a31d4289ea4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 07485582d027..724dbc6b7d33 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6349396317a9..2a5ec643fec0 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,7 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b848c8ddd92e..566de738e487 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,7 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA 
select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e7212731cffb..526d95abfe5e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 92e4c4b85bba..fab095362c50 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fdab7de7490d..e77ab30e9328 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -585,7 +585,7 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..137eadc18732 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -258,6 +258,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config CONTIG_ALLOC + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + config PHYS_ADDR_T_64BIT def_bool 64BIT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a0d722d481..2efb6525d932 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8137,8 +8137,7 @@ unmovable: return true; } -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) - +#ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, -- cgit v1.2.3 From 4eb0716e868eed963967adb0b1b11d9bd8ca1d01 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:19:04 -0700 Subject: hugetlb: allow to free gigantic pages regardless of the configuration On systems without CONTIG_ALLOC activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. This patch simply enables the possibility to hand back those pages to memory allocator. Link: http://lkml.kernel.org/r/20190327063626.18421-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: David S. Miller [sparc] Reviewed-by: Mike Kravetz Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Heiko Carstens Cc: "H . 
Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 5 ++- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 8 +++-- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 --- include/asm-generic/hugetlb.h | 7 ++++ include/linux/gfp.h | 2 +- mm/hugetlb.c | 54 +++++++++++++++++++--------- mm/page_alloc.c | 4 +-- 14 files changed, 61 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7f7fbd8bd9d5..7a1aa53d188d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,7 +19,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6a07a3b433e..4aad6382f631 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, #include -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 56140d19c85f..12e150e615b7 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) +#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline bool gigantic_page_runtime_supported(void) { /* * We used gigantic page reservation with hypervisor assist in some case. 
@@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void) return true; } -#endif /* hugepd entry valid bit */ #define HUGEPD_VAL_BITS (0x8000000000000000UL) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3a31d4289ea4..2794235e9d3e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 724dbc6b7d33..d0c046af65fa 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d1afa58a4b6..bb59dd964590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif +static inline bool gigantic_page_runtime_supported(void) +{ + return true; +} + #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a5ec643fec0..2a77033e1e7c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,7 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 566de738e487..7c93f3121ee6 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,7 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 526d95abfe5e..f21bc56e5d7b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 7469d321f072..f65cfb48cfdd 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 71d7b77eea50..822f433ac95c 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline 
bool gigantic_page_runtime_supported(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); +} +#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ + #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e77ab30e9328..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -589,8 +589,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif +void free_contig_range(unsigned long pfn, unsigned int nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dffe5d9d03ae..2f901a6e13d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1059,6 +1059,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1143,11 +1144,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) { return NULL; } + int nid, nodemask_t *nodemask) +{ + return NULL; +} static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } @@ -1157,7 +1167,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; @@ -2278,13 +2288,27 @@ found: } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, - nodemask_t *nodes_allowed) +static int set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return h->max_huge_pages; + spin_lock(&hugetlb_lock); + + /* + * Gigantic pages runtime allocation depend on the capability for large + * page range allocation. + * If the system does not provide this feature, return an error when + * the user tries to allocate gigantic pages but let the user free the + * boottime allocated gigantic pages. + */ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + if (count > persistent_huge_pages(h)) { + spin_unlock(&hugetlb_lock); + return -EINVAL; + } + /* Fall through to decrease pool */ + } /* * Increase the pool size @@ -2297,7 +2321,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. 
*/ - spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; @@ -2352,9 +2375,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, break; } out: - ret = persistent_huge_pages(h); + h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); - return ret; + + return 0; } #define HSTATE_ATTR_RO(_name) \ @@ -2406,7 +2430,7 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, int err; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) { err = -EINVAL; goto out; } @@ -2430,15 +2454,13 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, } else nodes_allowed = &node_states[N_MEMORY]; - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); + err = set_max_huge_pages(h, count, nodes_allowed); +out: if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); - return len; -out: - NODEMASK_FREE(nodes_allowed); - return err; + return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2efb6525d932..4ea71bc70413 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8346,8 +8346,9 @@ done: pfn_max_align_up(end), migratetype); return ret; } +#endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned nr_pages) +void free_contig_range(unsigned long pfn, unsigned int nr_pages) { unsigned int count = 0; @@ -8359,7 +8360,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } WARN(count != 0, "%d pages are still in use!\n", count); } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* -- cgit v1.2.3 From fc1d8e7cca2daa18d2fe56b94874848adf89d7f5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 13 May 2019 17:19:08 -0700 Subject: mm: introduce put_user_page*(), placeholder versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A discussion of the overall problem is below. As mentioned in patch 0001, the steps to fix the problem are: 1) Provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. Overview ======== Some kernel components (file systems, device drivers) need to access memory that is specified via process virtual address. For a long time, the API to achieve that was get_user_pages ("GUP") and its variations. However, GUP has critical limitations that have been overlooked; in particular, GUP does not interact correctly with filesystems in all situations. That means that file-backed memory + GUP is a recipe for potential problems, some of which have already occurred in the field.
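A minimal sketch of the step-1 helper named above; its behaviour (a plain wrapper around put_page()) is taken from the patch description further down, while the exact declaration and header placement shown here are illustrative:

        /* Placeholder: release a page that was pinned via get_user_pages*(). */
        static inline void put_user_page(struct page *page)
        {
                put_page(page);
        }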
GUP was first introduced for Direct IO (O_DIRECT), allowing filesystem code to get the struct page behind a virtual address and to let storage hardware perform a direct copy to or from that page. This is a short-lived access pattern, and as such, the window for a concurrent writeback of GUP'd page was small enough that there were not (we think) any reported problems. Also, userspace was expected to understand and accept that Direct IO was not synchronized with memory-mapped access to that data, nor with any process address space changes such as munmap(), mremap(), etc. Over the years, more GUP uses have appeared (virtualization, device drivers, RDMA) that can keep the pages they get via GUP for a long period of time (seconds, minutes, hours, days, ...). This long-term pinning makes an underlying design problem more obvious. In fact, there are a number of key problems inherent to GUP: Interactions with file systems ============================== File systems expect to be able to write back data, both to reclaim pages, and for data integrity. Allowing other hardware (NICs, GPUs, etc) to gain write access to the file memory pages means that such hardware can dirty the pages, without the filesystem being aware. This can, in some cases (depending on filesystem, filesystem options, block device, block device options, and other variables), lead to data corruption, and also to kernel bugs of the form: kernel BUG at /build/linux-fQ94TU/linux-4.4.0/fs/ext4/inode.c:1899! backtrace: ext4_writepage __writepage write_cache_pages ext4_writepages do_writepages __writeback_single_inode writeback_sb_inodes __writeback_inodes_wb wb_writeback wb_workfn process_one_work worker_thread kthread ret_from_fork ...which is due to the file system asserting that there are still buffer heads attached: ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) Dave Chinner's description of this is very clear: "The fundamental issue is that ->page_mkwrite must be called on every write access to a clean file backed page, not just the first one. How long the GUP reference lasts is irrelevant, if the page is clean and you need to dirty it, you must call ->page_mkwrite before it is marked writeable and dirtied. Every. Time." This is just one symptom of the larger design problem: real filesystems that actually write to a backing device, do not actually support get_user_pages() being called on their pages, and letting hardware write directly to those pages--even though that pattern has been going on since about 2005 or so. Long term GUP ============= Long term GUP is an issue when FOLL_WRITE is specified to GUP (so, a writeable mapping is created), and the pages are file-backed. That can lead to filesystem corruption. What happens is that when a file-backed page is being written back, it is first mapped read-only in all of the CPU page tables; the file system then assumes that nobody can write to the page, and that the page content is therefore stable. Unfortunately, the GUP callers generally do not monitor changes to the CPU pages tables; they instead assume that the following pattern is safe (it's not): get_user_pages() Hardware can keep a reference to those pages for a very long time, and write to it at any time. Because "hardware" here means "devices that are not a CPU", this activity occurs without any interaction with the kernel's file system code. for each page set_page_dirty put_page() In fact, the GUP documentation even recommends that pattern. 
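Spelled out as code, that pattern, and the conversion this patch enables, look roughly
like this. The wrapper names are made up for illustration; set_page_dirty(), put_page()
and the new put_user_pages_dirty() are the interfaces being discussed:

    #include <linux/mm.h>

    /* Historical pattern: dirty and release pages obtained via GUP. */
    static void release_gup_pages_legacy(struct page **pages,
                                         unsigned long npages)
    {
            unsigned long i;

            for (i = 0; i < npages; i++) {
                    set_page_dirty(pages[i]);  /* filesystem is not aware */
                    put_page(pages[i]);
            }
    }

    /* After this patch, such call sites can be converted to: */
    static void release_gup_pages_converted(struct page **pages,
                                            unsigned long npages)
    {
            put_user_pages_dirty(pages, npages);
    }

Nothing changes functionally yet, but once all call sites use the put_user_page*()
family, gup-pinned pages can be identified and given the special writeback handling
described in the steps above.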
Anyway, the file system assumes that the page is stable (nothing is writing to the page), and that is a problem: stable page content is necessary for many filesystem actions during writeback, such as checksum, encryption, RAID striping, etc. Furthermore, filesystem features like COW (copy on write) or snapshot also rely on being able to use a new page for as memory for that memory range inside the file. Corruption during write back is clearly possible here. To solve that, one idea is to identify pages that have active GUP, so that we can use a bounce page to write stable data to the filesystem. The filesystem would work on the bounce page, while any of the active GUP might write to the original page. This would avoid the stable page violation problem, but note that it is only part of the overall solution, because other problems remain. Other filesystem features that need to replace the page with a new one can be inhibited for pages that are GUP-pinned. This will, however, alter and limit some of those filesystem features. The only fix for that would be to require GUP users to monitor and respond to CPU page table updates. Subsystems such as ODP and HMM do this, for example. This aspect of the problem is still under discussion. Direct IO ========= Direct IO can cause corruption, if userspace does Direct-IO that writes to a range of virtual addresses that are mmap'd to a file. The pages written to are file-backed pages that can be under write back, while the Direct IO is taking place. Here, Direct IO races with a write back: it calls GUP before page_mkclean() has replaced the CPU pte with a read-only entry. The race window is pretty small, which is probably why years have gone by before we noticed this problem: Direct IO is generally very quick, and tends to finish up before the filesystem gets around to do anything with the page contents. However, it's still a real problem. The solution is to never let GUP return pages that are under write back, but instead, force GUP to take a write fault on those pages. That way, GUP will properly synchronize with the active write back. This does not change the required GUP behavior, it just avoids that race. Details ======= Introduces put_user_page(), which simply calls put_page(). This provides a way to update all get_user_pages*() callers, so that they call put_user_page(), instead of put_page(). Also introduces put_user_pages(), and a few dirty/locked variations, as a replacement for release_pages(), and also as a replacement for open-coded loops that release multiple pages. These may be used for subsequent performance improvements, via batching of pages to be released. This is the first step of fixing a problem (also described in [1] and [2]) with interactions between get_user_pages ("gup") and filesystems. Problem description: let's start with a bug report. Below, is what happens sometimes, under memory pressure, when a driver pins some pages via gup, and then marks those pages dirty, and releases them. Note that the gup documentation actually recommends that pattern. The problem is that the filesystem may do a writeback while the pages were gup-pinned, and then the filesystem believes that the pages are clean. So, when the driver later marks the pages as dirty, that conflicts with the filesystem's page tracking and results in a BUG(), like this one that I experienced: kernel BUG at /build/linux-fQ94TU/linux-4.4.0/fs/ext4/inode.c:1899! 
backtrace: ext4_writepage __writepage write_cache_pages ext4_writepages do_writepages __writeback_single_inode writeback_sb_inodes __writeback_inodes_wb wb_writeback wb_workfn process_one_work worker_thread kthread ret_from_fork ...which is due to the file system asserting that there are still buffer heads attached: ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) Dave Chinner's description of this is very clear: "The fundamental issue is that ->page_mkwrite must be called on every write access to a clean file backed page, not just the first one. How long the GUP reference lasts is irrelevant, if the page is clean and you need to dirty it, you must call ->page_mkwrite before it is marked writeable and dirtied. Every. Time." This is just one symptom of the larger design problem: real filesystems that actually write to a backing device, do not actually support get_user_pages() being called on their pages, and letting hardware write directly to those pages--even though that pattern has been going on since about 2005 or so. The steps are to fix it are: 1) (This patch): provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. [1] https://lwn.net/Articles/774411/ : "DMA and get_user_pages()" [2] https://lwn.net/Articles/753027/ : "The Trouble with get_user_pages()" Link: http://lkml.kernel.org/r/20190327023632.13307-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Mike Rapoport [docs] Reviewed-by: Ira Weiny Reviewed-by: Jérôme Glisse Reviewed-by: Christoph Lameter Tested-by: Ira Weiny Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Michal Hocko Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 24 ++++++++++++ mm/gup.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c3c73b3c9adc..e6b6be15609e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1007,6 +1007,30 @@ static inline void put_page(struct page *page) __put_page(page); } +/** + * put_user_page() - release a gup-pinned page + * @page: pointer to page to be released + * + * Pages that were pinned via get_user_pages*() must be released via + * either put_user_page(), or one of the put_user_pages*() routines + * below. This is so that eventually, pages that are pinned via + * get_user_pages*() can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special + * handling. + * + * put_user_page() and put_page() are not interchangeable, despite this early + * implementation that makes them look the same. put_user_page() calls must + * be perfectly matched up with get_user_page() calls. 
+ */ +static inline void put_user_page(struct page *page) +{ + put_page(page); +} + +void put_user_pages_dirty(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages(struct page **pages, unsigned long npages); + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/mm/gup.c b/mm/gup.c index 8e0a0a3a2b2d..2c08248d4fa2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -28,6 +28,111 @@ struct follow_page_context { unsigned int page_mask; }; +typedef int (*set_dirty_func_t)(struct page *page); + +static void __put_user_pages_dirty(struct page **pages, + unsigned long npages, + set_dirty_func_t sdf) +{ + unsigned long index; + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key cases: + * + * 1) This code sees the page as already dirty, so it skips + * the call to sdf(). That could happen because + * clear_page_dirty_for_io() called page_mkclean(), + * followed by set_page_dirty(). However, now the page is + * going to get written back, which meets the original + * intention of setting it dirty, so all is well: + * clear_page_dirty_for_io() goes on to call + * TestClearPageDirty(), and write the page back. + * + * 2) This code sees the page as clean, so it calls sdf(). + * The page stays dirty, despite being written back, so it + * gets written back again in the next writeback cycle. + * This is harmless. + */ + if (!PageDirty(page)) + sdf(page); + + put_user_page(page); + } +} + +/** + * put_user_pages_dirty() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * "gup-pinned page" refers to a page that has had one of the get_user_pages() + * variants called on that page. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * set_page_dirty(), which does not lock the page, is used here. + * Therefore, it is the caller's responsibility to ensure that this is + * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * + */ +void put_user_pages_dirty(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty); +} +EXPORT_SYMBOL(put_user_pages_dirty); + +/** + * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * This is just like put_user_pages_dirty(), except that it invokes + * set_page_dirty_lock(), instead of set_page_dirty(). + * + */ +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty_lock); +} +EXPORT_SYMBOL(put_user_pages_dirty_lock); + +/** + * put_user_pages() - release an array of gup-pinned pages. 
+ * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, release the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + */ +void put_user_pages(struct page **pages, unsigned long npages) +{ + unsigned long index; + + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + for (index = 0; index < npages; index++) + put_user_page(pages[index]); +} +EXPORT_SYMBOL(put_user_pages); + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { -- cgit v1.2.3 From 926e5d1cb525ec4faa66ddb24ac3b61c0102cb5c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:19:29 -0700 Subject: include/linux/balloon_compaction.h: drop unused function stubs These are leftovers from the pre-"general non-lru movable page" era. Link: http://lkml.kernel.org/r/20190329122649.28404-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mukesh Ojha Acked-by: Michael S. Tsirkin Acked-by: Pankaj Gupta Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f111c780ef1d..f31521dcb09a 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -151,21 +151,6 @@ static inline void balloon_page_delete(struct page *page) list_del(&page->lru); } -static inline bool __is_movable_balloon_page(struct page *page) -{ - return false; -} - -static inline bool balloon_page_movable(struct page *page) -{ - return false; -} - -static inline bool isolated_balloon_page(struct page *page) -{ - return false; -} - static inline bool balloon_page_isolate(struct page *page) { return false; -- cgit v1.2.3 From 1b426bac66e6cc83c9f2d92b96e4e72acf43419a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 May 2019 17:19:41 -0700 Subject: hugetlb: use same fault hash key for shared and private mappings hugetlb uses a fault mutex hash table to prevent page faults of the same pages concurrently. The key for shared and private mappings is different. Shared keys off address_space and file index. Private keys off mm and virtual address. Consider a private mappings of a populated hugetlbfs file. A fault will map the page from the file and if needed do a COW to map a writable page. Hugetlbfs hole punch uses the fault mutex to prevent mappings of file pages. It uses the address_space file index key. However, private mappings will use a different key and could race with this code to map the file page. This causes problems (BUG) for the page cache remove code as it expects the page to be unmapped. A sample stack is: page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) kernel BUG at mm/filemap.c:169! ... RIP: 0010:unaccount_page_cache_page+0x1b8/0x200 ... Call Trace: __delete_from_page_cache+0x39/0x220 delete_from_page_cache+0x45/0x70 remove_inode_hugepages+0x13c/0x380 ? __add_to_page_cache_locked+0x162/0x380 hugetlbfs_fallocate+0x403/0x540 ? _cond_resched+0x15/0x30 ? __inode_security_revalidate+0x5d/0x70 ? 
selinux_file_permission+0x100/0x130 vfs_fallocate+0x13f/0x270 ksys_fallocate+0x3c/0x80 __x64_sys_fallocate+0x1a/0x20 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 There seems to be another potential COW issue/race with this approach of different private and shared keys as noted in commit 8382d914ebf7 ("mm, hugetlb: improve page-fault scalability"). Since every hugetlb mapping (even anon and private) is actually a file mapping, just use the address_space index key for all mappings. This results in potentially more hash collisions. However, this should not be the common case. Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5 Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: Joonsoo Kim Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 7 ++----- include/linux/hugetlb.h | 4 +--- mm/hugetlb.c | 22 ++++++---------------- mm/userfaultfd.c | 3 +-- 4 files changed, 10 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c74ef4426282..f23237135163 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -639,8 +637,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, - index, addr); + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 11943b60f208..edf476c8cfb9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c33c5cbb67ff..98a3c7c224cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3824,8 +3824,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. 
*/ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3933,21 +3932,14 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { unsigned long key[2]; u32 hash; - if (vma->vm_flags & VM_SHARED) { - key[0] = (unsigned long) mapping; - key[1] = idx; - } else { - key[0] = (unsigned long) mm; - key[1] = address >> huge_page_shift(h); - } + key[0] = (unsigned long) mapping; + key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); @@ -3958,9 +3950,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { return 0; @@ -4005,7 +3995,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index d59b5a73dfb3..9932d5755e4c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -271,8 +271,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, - idx, dst_addr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; -- cgit v1.2.3 From 704f3f2cf63cdb76925ac2ff432182c73574b20b Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:48 -0700 Subject: mm/hmm: use reference counting for HMM struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every time I read the code to check that the HMM structure does not vanish before it should thanks to the many lock protecting its removal i get a headache. Switch to reference counting instead it is much easier to follow and harder to break. This also remove some code that is no longer needed with refcounting. 
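As a rough sketch of the discipline this switch introduces (the function below is made
up for illustration; mm_get_hmm() and hmm_put() are the helpers added by this patch),
every lookup of the per-mm HMM object now takes a reference, and every path that
obtained one drops it when it is done:

    static void some_hmm_internal_path(struct mm_struct *mm)
    {
            struct hmm *hmm = mm_get_hmm(mm);  /* kref_get_unless_zero() */

            if (!hmm)
                    return;  /* no HMM for this mm, or it is being freed */

            /* ... hmm cannot vanish here, whatever locks are held ... */

            hmm_put(hmm);  /* kref_put(&hmm->kref, hmm_free) */
    }

The notifier and mirror paths converted below follow exactly this get/put pairing.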
Link: http://lkml.kernel.org/r/20190403193318.16478-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 2 + mm/hmm.c | 190 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 124 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ad50b7b4f141..716fc61fa6d4 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -131,6 +131,7 @@ enum hmm_pfn_value_e { /* * struct hmm_range - track invalidation lock on virtual address range * + * @hmm: the core HMM structure this range is active against * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) @@ -142,6 +143,7 @@ enum hmm_pfn_value_e { * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct hmm *hmm; struct vm_area_struct *vma; struct list_head list; unsigned long start; diff --git a/mm/hmm.c b/mm/hmm.c index fe1cd87e49ac..919d78fd21c5 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; */ struct hmm { struct mm_struct *mm; + struct kref kref; spinlock_t lock; struct list_head ranges; struct list_head mirrors; @@ -57,24 +58,33 @@ struct hmm { struct rw_semaphore mirrors_sem; }; -/* - * hmm_register - register HMM against an mm (HMM internal) +static inline struct hmm *mm_get_hmm(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); + + if (hmm && kref_get_unless_zero(&hmm->kref)) + return hmm; + + return NULL; +} + +/** + * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to + * Returns: returns an HMM object, either by referencing the existing + * (per-process) object, or by creating a new one. * - * This is not intended to be used directly by device drivers. It allocates an - * HMM struct if mm does not have one, and initializes it. + * This is not intended to be used directly by device drivers. If mm already + * has an HMM struct then it get a reference on it and returns it. Otherwise + * it allocates an HMM struct, initializes it, associate it with the mm and + * returns it. */ -static struct hmm *hmm_register(struct mm_struct *mm) +static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = mm_get_hmm(mm); bool cleanup = false; - /* - * The hmm struct can only be freed once the mm_struct goes away, - * hence we should always have pre-allocated an new hmm struct - * above. 
- */ if (hmm) return hmm; @@ -86,6 +96,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->lock); + kref_init(&hmm->kref); hmm->mm = mm; spin_lock(&mm->page_table_lock); @@ -106,7 +117,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(&mm->page_table_lock); @@ -118,9 +129,41 @@ error: return NULL; } +static void hmm_free(struct kref *kref) +{ + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + kfree(hmm); +} + +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(&hmm->kref, hmm_free); +} + void hmm_mm_destroy(struct mm_struct *mm) { - kfree(mm->hmm); + struct hmm *hmm; + + spin_lock(&mm->page_table_lock); + hmm = mm_get_hmm(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + spin_unlock(&mm->page_table_lock); + hmm_put(hmm); + return; + } + + spin_unlock(&mm->page_table_lock); } static int hmm_invalidate_range(struct hmm *hmm, bool device, @@ -165,7 +208,7 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm_mirror *mirror; - struct hmm *hmm = mm->hmm; + struct hmm *hmm = mm_get_hmm(mm); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -186,13 +229,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct hmm_mirror, list); } up_write(&hmm->mirrors_sem); + + hmm_put(hmm); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { + struct hmm *hmm = mm_get_hmm(range->mm); struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + int ret; VM_BUG_ON(!hmm); @@ -200,14 +246,16 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, update.end = range->end; update.event = HMM_UPDATE_INVALIDATE; update.blockable = range->blockable; - return hmm_invalidate_range(hmm, true, &update); + ret = hmm_invalidate_range(hmm, true, &update); + hmm_put(hmm); + return ret; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { + struct hmm *hmm = mm_get_hmm(range->mm); struct hmm_update update; - struct hmm *hmm = range->mm->hmm; VM_BUG_ON(!hmm); @@ -216,6 +264,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, update.event = HMM_UPDATE_INVALIDATE; update.blockable = true; hmm_invalidate_range(hmm, false, &update); + hmm_put(hmm); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { @@ -241,24 +290,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; -again: - mirror->hmm = hmm_register(mm); + mirror->hmm = hmm_get_or_create(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - if (mirror->hmm->mm == NULL) { - /* - * A racing hmm_mirror_unregister() is about to destroy the hmm - * struct. Try again to allocate a new one. 
- */ - up_write(&mirror->hmm->mirrors_sem); - mirror->hmm = NULL; - goto again; - } else { - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - } + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); return 0; } @@ -273,33 +311,18 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - bool should_unregister = false; - struct mm_struct *mm; - struct hmm *hmm; + struct hmm *hmm = READ_ONCE(mirror->hmm); - if (mirror->hmm == NULL) + if (hmm == NULL) return; - hmm = mirror->hmm; down_write(&hmm->mirrors_sem); list_del_init(&mirror->list); - should_unregister = list_empty(&hmm->mirrors); + /* To protect us against double unregister ... */ mirror->hmm = NULL; - mm = hmm->mm; - hmm->mm = NULL; up_write(&hmm->mirrors_sem); - if (!should_unregister || mm == NULL) - return; - - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); - - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -708,23 +731,29 @@ int hmm_vma_get_pfns(struct hmm_range *range) struct mm_walk mm_walk; struct hmm *hmm; + range->hmm = NULL; + /* Sanity check, this really should not happen ! */ if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; - hmm = hmm_register(vma->vm_mm); + hmm = hmm_get_or_create(vma->vm_mm); if (!hmm) return -ENOMEM; - /* Caller must have registered a mirror, via hmm_mirror_register() ! */ - if (!hmm->mmu_notifier.ops) + + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL) { + hmm_put(hmm); return -EINVAL; + } /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || vma_is_dax(vma)) { hmm_pfns_special(range); + hmm_put(hmm); return -EINVAL; } @@ -736,6 +765,7 @@ int hmm_vma_get_pfns(struct hmm_range *range) * operations such has atomic access would not work. */ hmm_pfns_clear(range, range->pfns, range->start, range->end); + hmm_put(hmm); return -EPERM; } @@ -758,6 +788,12 @@ int hmm_vma_get_pfns(struct hmm_range *range) mm_walk.pte_hole = hmm_vma_walk_hole; walk_page_range(range->start, range->end, &mm_walk); + /* + * Transfer hmm reference to the range struct it will be drop inside + * the hmm_vma_range_done() function (which _must_ be call if this + * function return 0). + */ + range->hmm = hmm; return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); @@ -802,25 +838,27 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); */ bool hmm_vma_range_done(struct hmm_range *range) { - unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; - struct hmm *hmm; + bool ret = false; - if (range->end <= range->start) { + /* Sanity check this really should not happen. */ + if (range->hmm == NULL || range->end <= range->start) { BUG(); return false; } - hmm = hmm_register(range->vma->vm_mm); - if (!hmm) { - memset(range->pfns, 0, sizeof(*range->pfns) * npages); - return false; - } - - spin_lock(&hmm->lock); + spin_lock(&range->hmm->lock); list_del_rcu(&range->list); - spin_unlock(&hmm->lock); + ret = range->valid; + spin_unlock(&range->hmm->lock); - return range->valid; + /* Is the mm still alive ? 
*/ + if (range->hmm->mm == NULL) + ret = false; + + /* Drop reference taken by hmm_vma_fault() or hmm_vma_get_pfns() */ + hmm_put(range->hmm); + range->hmm = NULL; + return ret; } EXPORT_SYMBOL(hmm_vma_range_done); @@ -880,25 +918,31 @@ int hmm_vma_fault(struct hmm_range *range, bool block) struct hmm *hmm; int ret; + range->hmm = NULL; + /* Sanity check, this really should not happen ! */ if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; - hmm = hmm_register(vma->vm_mm); + hmm = hmm_get_or_create(vma->vm_mm); if (!hmm) { hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } - /* Caller must have registered a mirror using hmm_mirror_register() */ - if (!hmm->mmu_notifier.ops) + + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL) { + hmm_put(hmm); return -EINVAL; + } /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || vma_is_dax(vma)) { hmm_pfns_special(range); + hmm_put(hmm); return -EINVAL; } @@ -910,6 +954,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block) * operations such has atomic access would not work. */ hmm_pfns_clear(range, range->pfns, range->start, range->end); + hmm_put(hmm); return -EPERM; } @@ -945,7 +990,16 @@ int hmm_vma_fault(struct hmm_range *range, bool block) hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, range->end); hmm_vma_range_done(range); + hmm_put(hmm); + } else { + /* + * Transfer hmm reference to the range struct it will be drop + * inside the hmm_vma_range_done() function (which _must_ be + * call if this function return 0). + */ + range->hmm = hmm; } + return ret; } EXPORT_SYMBOL(hmm_vma_fault); -- cgit v1.2.3 From 25f23a0c7127b65c4d8200ccda8a352ad5ce1e1d Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:55 -0700 Subject: mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Link: http://lkml.kernel.org/r/20190403193318.16478-5-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Reviewed-by: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 26 ++++++++++++++++++-------- include/linux/hmm.h | 4 ++-- mm/hmm.c | 31 +++++++++++++++++-------------- 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 44205f0b671f..d9b27bdadd1b 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -189,11 +189,7 @@ the driver callback returns. 
When the device driver wants to populate a range of virtual addresses, it can use either:: - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); + long hmm_range_snapshot(struct hmm_range *range); int hmm_vma_fault(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -202,7 +198,7 @@ use either:: bool write, bool block); -The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path @@ -220,19 +216,33 @@ respect in order to keep things properly synchronized. The usage pattern is:: { struct hmm_range range; ... + + range.start = ...; + range.end = ...; + range.pfns = ...; + range.flags = ...; + range.values = ...; + range.pfn_shift = ...; + again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) + down_read(&mm->mmap_sem); + range.vma = ...; + ret = hmm_range_snapshot(&range); + if (ret) { + up_read(&mm->mmap_sem); return ret; + } take_lock(driver->update); if (!hmm_vma_range_done(vma, &range)) { release_lock(driver->update); + up_read(&mm->mmap_sem); goto again; } // Use pfns array content to update device page table release_lock(driver->update); + up_read(&mm->mmap_sem); return 0; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 716fc61fa6d4..32206b0b1bfd 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * table invalidation serializes on it. * * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_vma_get_pfns() WITHOUT ERROR ! + * hmm_range_snapshot() WITHOUT ERROR ! * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct hmm_range *range); +long hmm_range_snapshot(struct hmm_range *range); bool hmm_vma_range_done(struct hmm_range *range); diff --git a/mm/hmm.c b/mm/hmm.c index 84e0577a912a..bd957a9f10d1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -702,23 +702,25 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + * invalid vma (ie either hugetlbfs or device file vma). + * -EPERM For example, asking for write, when the range is + * read-only + * -EAGAIN Caller needs to retry + * -EFAULT Either no valid vma exists for this range, or it is + * illegal to access the range * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further * information. - * - * The range struct is initialized here. 
It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must then call - * hmm_vma_range_done() to stop CPU page table update tracking on this range. - * - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct hmm_range *range) +long hmm_range_snapshot(struct hmm_range *range) { struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; @@ -772,6 +774,7 @@ int hmm_vma_get_pfns(struct hmm_range *range) hmm_vma_walk.fault = false; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; + hmm_vma_walk.last = range->start; mm_walk.vma = vma; mm_walk.mm = vma->vm_mm; @@ -788,9 +791,9 @@ int hmm_vma_get_pfns(struct hmm_range *range) * function return 0). */ range->hmm = hmm; - return 0; + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_get_pfns); +EXPORT_SYMBOL(hmm_range_snapshot); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range -- cgit v1.2.3 From 73231612dc7c907bd96880a4086ee55eef6b6888 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:58 -0700 Subject: mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor optimization around hmm_pte_need_fault(). Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Link: http://lkml.kernel.org/r/20190403193318.16478-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 8 +---- include/linux/hmm.h | 13 ++++++- mm/hmm.c | 91 +++++++++++++++++++++--------------------------- 3 files changed, 52 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index d9b27bdadd1b..61f073215a8d 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -190,13 +190,7 @@ When the device driver wants to populate a range of virtual addresses, it can use either:: long hmm_range_snapshot(struct hmm_range *range); - int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); + long hmm_range_fault(struct hmm_range *range, bool block); The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 32206b0b1bfd..e9afd23c2eac 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, bool block); + +/* This is a temporary helper to avoid merge conflict between trees. 
*/ +static inline int hmm_vma_fault(struct hmm_range *range, bool block) +{ + long ret = hmm_range_fault(range, block); + if (ret == -EBUSY) + ret = -EAGAIN; + else if (ret == -EAGAIN) + ret = -EBUSY; + return ret < 0 ? ret : 0; +} /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index bd957a9f10d1..b7e4034d96e1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -340,13 +340,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -372,7 +372,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -395,12 +395,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, ret = hmm_vma_do_fault(walk, addr, write_fault, &pfns[i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? -EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -531,11 +531,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + &fault, &write_fault); if (fault || write_fault) goto fault; return 0; @@ -574,7 +574,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, pmdp, addr); - return -EAGAIN; + return -EBUSY; } return 0; } @@ -582,6 +582,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, /* Report error for everything else */ *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; + } else { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); } if (fault || write_fault) @@ -632,7 +636,7 @@ again: if (fault || write_fault) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(vma->vm_mm, pmdp); - return -EAGAIN; + return -EBUSY; } return 0; } else if (!pmd_present(pmd)) @@ -860,53 +864,34 @@ bool hmm_vma_range_done(struct hmm_range *range) EXPORT_SYMBOL(hmm_vma_range_done); /* - * hmm_vma_fault() - try to fault some address in a virtual address range + * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. 
If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + * invalid vma (ie either hugetlbfs or device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (for instance asking for write and + * range is read only). + * -EAGAIN: If you need to retry and mmap_sem was drop. This can only + * happens if block argument is false. + * -EBUSY: If the the range is being invalidated and you should wait + * for invalidation to finish. + * -EFAULT: Invalid (ie either no valid vma or it is illegal to access + * that range), number of valid pages in range->pfns[] (from + * range start address). * * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs. + * any memory migration if the memory being faulted is not accessible by CPUs + * and caller does not ask for migration. * * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. - * - * Expected use pattern: - * retry: - * down_read(&mm->mmap_sem); - * // Find vma and address device wants to fault, initialize hmm_pfn_t - * // array accordingly - * ret = hmm_vma_fault(range, write, block); - * switch (ret) { - * case -EAGAIN: - * hmm_vma_range_done(range); - * // You might want to rate limit or yield to play nicely, you may - * // also commit any valid pfn in the array assuming that you are - * // getting true from hmm_vma_range_monitor_end() - * goto retry; - * case 0: - * break; - * case -ENOMEM: - * case -EINVAL: - * case -EPERM: - * default: - * // Handle error ! - * up_read(&mm->mmap_sem) - * return; - * } - * // Take device driver lock that serialize device page table update - * driver_lock_device_page_table_update(); - * hmm_vma_range_done(range); - * // Commit pfns we got from hmm_vma_fault() - * driver_unlock_device_page_table_update(); - * up_read(&mm->mmap_sem) - * - * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) - * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! - * - * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, bool block) { struct vm_area_struct *vma = range->vma; unsigned long start = range->start; @@ -978,7 +963,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block) do { ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; - } while (ret == -EAGAIN); + /* Keep trying while the range is valid. 
*/ + } while (ret == -EBUSY && range->valid); if (ret) { unsigned long i; @@ -988,6 +974,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block) range->end); hmm_vma_range_done(range); hmm_put(hmm); + return ret; } else { /* * Transfer hmm reference to the range struct it will be drop @@ -997,9 +984,9 @@ int hmm_vma_fault(struct hmm_range *range, bool block) range->hmm = hmm; } - return ret; + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_fault); +EXPORT_SYMBOL(hmm_range_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3 From a3e0d41c2b1f86b483b202d642140d8b86d677ca Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:01 -0700 Subject: mm/hmm: improve driver API to work and wait over a range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common use case for HMM mirror is user trying to mirror a range and before they could program the hardware it get invalidated by some core mm event. Instead of having user re-try right away to mirror the range provide a completion mechanism for them to wait for any active invalidation affecting the range. This also changes how hmm_range_snapshot() and hmm_range_fault() works by not relying on vma so that we can drop the mmap_sem when waiting and lookup the vma again on retry. Link: http://lkml.kernel.org/r/20190403193318.16478-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Balbir Singh Cc: Ira Weiny Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 25 ++- include/linux/hmm.h | 145 +++++++++---- mm/hmm.c | 531 +++++++++++++++++++++++------------------------ 3 files changed, 387 insertions(+), 314 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 61f073215a8d..945d5fb6d14a 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -217,17 +217,33 @@ respect in order to keep things properly synchronized. The usage pattern is:: range.flags = ...; range.values = ...; range.pfn_shift = ...; + hmm_range_register(&range); + + /* + * Just wait for range to be valid, safe to ignore return value as we + * will use the return value of hmm_range_snapshot() below under the + * mmap_sem to ascertain the validity of the range. + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); again: down_read(&mm->mmap_sem); - range.vma = ...; ret = hmm_range_snapshot(&range); if (ret) { up_read(&mm->mmap_sem); + if (ret == -EAGAIN) { + /* + * No need to check hmm_range_wait_until_valid() return value + * on retry we will get proper error with hmm_range_snapshot() + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); + goto again; + } + hmm_mirror_unregister(&range); return ret; } take_lock(driver->update); - if (!hmm_vma_range_done(vma, &range)) { + if (!range.valid) { release_lock(driver->update); up_read(&mm->mmap_sem); goto again; @@ -235,14 +251,15 @@ respect in order to keep things properly synchronized. The usage pattern is:: // Use pfns array content to update device page table + hmm_mirror_unregister(&range); release_lock(driver->update); up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before hmm_vma_range_done() to avoid -any race with a concurrent CPU page table update. 
+update() callback. That lock must be held before checking the range.valid +field to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e9afd23c2eac..ec4bfa91648f 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -77,8 +77,34 @@ #include #include #include +#include -struct hmm; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + * @wq: wait queue for user waiting on a range invalidation + * @notifiers: count of active mmu notifiers + * @dead: is the mm dead ? + */ +struct hmm { + struct mm_struct *mm; + struct kref kref; + struct mutex lock; + struct list_head ranges; + struct list_head mirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; + wait_queue_head_t wq; + long notifiers; + bool dead; +}; /* * hmm_pfn_flag_e - HMM flag enums @@ -155,6 +181,38 @@ struct hmm_range { bool valid; }; +/* + * hmm_range_wait_until_valid() - wait for range to be valid + * @range: range affected by invalidation to wait on + * @timeout: time out for wait in ms (ie abort wait after that period of time) + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_wait_until_valid(struct hmm_range *range, + unsigned long timeout) +{ + /* Check if mm is dead ? */ + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { + range->valid = false; + return false; + } + if (range->valid) + return true; + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + msecs_to_jiffies(timeout)); + /* Return current valid status just in case we get lucky */ + return range->valid; +} + +/* + * hmm_range_valid() - test if a range is valid or not + * @range: range + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_valid(struct hmm_range *range) +{ + return range->valid; +} + /* * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn * @range: range use to decode HMM pfn value @@ -357,51 +415,66 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device - * driver lock that serializes device page table updates, then call - * hmm_vma_range_done(), to check if the snapshot is still valid. The same - * device driver page table update lock must also be used in the - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page - * table invalidation serializes on it. - * - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_range_snapshot() WITHOUT ERROR ! - * - * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! + * Please see Documentation/vm/hmm.rst for how to use the range API. 
*/ +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end); +void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); -bool hmm_vma_range_done(struct hmm_range *range); - +long hmm_range_fault(struct hmm_range *range, bool block); /* - * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will - * not migrate any device memory back to system memory. The HMM pfn array will - * be updated with the fault result and current snapshot of the CPU page table - * for the range. - * - * The mmap_sem must be taken in read mode before entering and it might be - * dropped by the function if the block argument is false. In that case, the - * function returns -EAGAIN. - * - * Return value does not reflect if the fault was successful for every single - * address or not. Therefore, the caller must to inspect the HMM pfn array to - * determine fault status for each address. - * - * Trying to fault inside an invalid vma will result in -EINVAL. + * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range * - * See the function description in mm/hmm.c for further documentation. + * When waiting for mmu notifiers we need some kind of time out otherwise we + * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to + * wait already. */ -long hmm_range_fault(struct hmm_range *range, bool block); +#define HMM_RANGE_DEFAULT_TIMEOUT 1000 + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline bool hmm_vma_range_done(struct hmm_range *range) +{ + bool ret = hmm_range_valid(range); + + hmm_range_unregister(range); + return ret; +} /* This is a temporary helper to avoid merge conflict between trees. */ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { - long ret = hmm_range_fault(range, block); - if (ret == -EBUSY) - ret = -EAGAIN; - else if (ret == -EAGAIN) - ret = -EBUSY; - return ret < 0 ? ret : 0; + long ret; + + ret = hmm_range_register(range, range->vma->vm_mm, + range->start, range->end); + if (ret) + return (int)ret; + + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { + /* + * The mmap_sem was taken by driver we release it here and + * returns -EAGAIN which correspond to mmap_sem have been + * drop in the old API. + */ + up_read(&range->vma->vm_mm->mmap_sem); + return -EAGAIN; + } + + ret = hmm_range_fault(range, block); + if (ret <= 0) { + if (ret == -EBUSY || !ret) { + /* Same as above drop mmap_sem to match old API. */ + up_read(&range->vma->vm_mm->mmap_sem); + ret = -EBUSY; + } else if (ret == -EAGAIN) + ret = -EBUSY; + hmm_range_unregister(range); + return ret; + } + return 0; } /* Below are for HMM internal use only! Not to be used by device driver! 
*/ diff --git a/mm/hmm.c b/mm/hmm.c index b7e4034d96e1..3e07f32b94f8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -38,26 +38,6 @@ #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -/* - * struct hmm - HMM per mm struct - * - * @mm: mm struct this HMM struct is bound to - * @lock: lock protecting ranges list - * @ranges: list of range being snapshotted - * @mirrors: list of mirrors for this mm - * @mmu_notifier: mmu notifier to track updates to CPU page table - * @mirrors_sem: read/write semaphore protecting the mirrors list - */ -struct hmm { - struct mm_struct *mm; - struct kref kref; - spinlock_t lock; - struct list_head ranges; - struct list_head mirrors; - struct mmu_notifier mmu_notifier; - struct rw_semaphore mirrors_sem; -}; - static inline struct hmm *mm_get_hmm(struct mm_struct *mm) { struct hmm *hmm = READ_ONCE(mm->hmm); @@ -91,12 +71,15 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) return NULL; + init_waitqueue_head(&hmm->wq); INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->lock); + mutex_init(&hmm->lock); kref_init(&hmm->kref); + hmm->notifiers = 0; + hmm->dead = false; hmm->mm = mm; spin_lock(&mm->page_table_lock); @@ -158,6 +141,7 @@ void hmm_mm_destroy(struct mm_struct *mm) mm->hmm = NULL; if (hmm) { hmm->mm = NULL; + hmm->dead = true; spin_unlock(&mm->page_table_lock); hmm_put(hmm); return; @@ -166,43 +150,22 @@ void hmm_mm_destroy(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static int hmm_invalidate_range(struct hmm *hmm, bool device, - const struct hmm_update *update) +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { + struct hmm *hmm = mm_get_hmm(mm); struct hmm_mirror *mirror; struct hmm_range *range; - spin_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { - if (update->end < range->start || update->start >= range->end) - continue; + /* Report this HMM as dying. */ + hmm->dead = true; + /* Wake-up everyone waiting on any range. 
*/ + mutex_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { range->valid = false; } - spin_unlock(&hmm->lock); - - if (!device) - return 0; - - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; - - ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); - if (!update->blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); - return -EAGAIN; - } - } - up_read(&hmm->mirrors_sem); - - return 0; -} - -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct hmm_mirror *mirror; - struct hmm *hmm = mm_get_hmm(mm); + wake_up_all(&hmm->wq); + mutex_unlock(&hmm->lock); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -228,36 +191,80 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(range->mm); + struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm_mirror *mirror; struct hmm_update update; - int ret; + struct hmm_range *range; + int ret = 0; VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; + update.start = nrange->start; + update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; - update.blockable = range->blockable; - ret = hmm_invalidate_range(hmm, true, &update); + update.blockable = nrange->blockable; + + if (nrange->blockable) + mutex_lock(&hmm->lock); + else if (!mutex_trylock(&hmm->lock)) { + ret = -EAGAIN; + goto out; + } + hmm->notifiers++; + list_for_each_entry(range, &hmm->ranges, list) { + if (update.end < range->start || update.start >= range->end) + continue; + + range->valid = false; + } + mutex_unlock(&hmm->lock); + + if (nrange->blockable) + down_read(&hmm->mirrors_sem); + else if (!down_read_trylock(&hmm->mirrors_sem)) { + ret = -EAGAIN; + goto out; + } + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (!update.blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + ret = -EAGAIN; + goto out; + } + } + up_read(&hmm->mirrors_sem); + +out: hmm_put(hmm); return ret; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(range->mm); - struct hmm_update update; + struct hmm *hmm = mm_get_hmm(nrange->mm); VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = true; - hmm_invalidate_range(hmm, false, &update); + mutex_lock(&hmm->lock); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; + + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; + } + wake_up_all(&hmm->wq); + } + mutex_unlock(&hmm->lock); + hmm_put(hmm); } @@ -409,7 +416,6 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - *fault = *write_fault = false; if (!hmm_vma_walk->fault) return; @@ -448,10 +454,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, return; } + *fault = *write_fault = false; for (i = 0; i < npages; ++i) { hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, fault, write_fault); - if ((*fault) || (*write_fault)) + if ((*write_fault)) return; 
} } @@ -706,162 +713,155 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_range_snapshot() - snapshot CPU page table for a range + * hmm_range_register() - start tracking change to CPU page table over a range * @range: range - * Returns: number of valid pages in range->pfns[] (from range start - * address). This may be zero. If the return value is negative, - * then one of the following values may be returned: + * @mm: the mm struct for the range of virtual address + * @start: start virtual address (inclusive) + * @end: end virtual address (exclusive) + * Returns 0 on success, -EFAULT if the address space is no longer valid * - * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (ie either hugetlbfs or device file vma). - * -EPERM For example, asking for write, when the range is - * read-only - * -EAGAIN Caller needs to retry - * -EFAULT Either no valid vma exists for this range, or it is - * illegal to access the range - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See hmm_vma_range_done() for further - * information. + * Track updates to the CPU page table see include/linux/hmm.h */ -long hmm_range_snapshot(struct hmm_range *range) +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end) { - struct vm_area_struct *vma = range->vma; - struct hmm_vma_walk hmm_vma_walk; - struct mm_walk mm_walk; - struct hmm *hmm; - + range->start = start & PAGE_MASK; + range->end = end & PAGE_MASK; + range->valid = false; range->hmm = NULL; - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) - return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) + if (range->start >= range->end) return -EINVAL; - hmm = hmm_get_or_create(vma->vm_mm); - if (!hmm) - return -ENOMEM; + range->start = start; + range->end = end; + + range->hmm = hmm_get_or_create(mm); + if (!range->hmm) + return -EFAULT; /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL) { - hmm_put(hmm); - return -EINVAL; + if (range->hmm->mm == NULL || range->hmm->dead) { + hmm_put(range->hmm); + return -EFAULT; } - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - hmm_put(hmm); - return -EINVAL; - } + /* Initialize range to track CPU page table update */ + mutex_lock(&range->hmm->lock); - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. 
- */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - hmm_put(hmm); - return -EPERM; - } + list_add_rcu(&range->list, &range->hmm->ranges); - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - hmm_vma_walk.last = range->start; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - - walk_page_range(range->start, range->end, &mm_walk); /* - * Transfer hmm reference to the range struct it will be drop inside - * the hmm_vma_range_done() function (which _must_ be call if this - * function return 0). + * If there are any concurrent notifiers we have to wait for them for + * the range to be valid (see hmm_range_wait_until_valid()). */ - range->hmm = hmm; - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + if (!range->hmm->notifiers) + range->valid = true; + mutex_unlock(&range->hmm->lock); + + return 0; } -EXPORT_SYMBOL(hmm_range_snapshot); +EXPORT_SYMBOL(hmm_range_register); /* - * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @range: range being tracked - * Returns: false if range data has been invalidated, true otherwise + * hmm_range_unregister() - stop tracking change to CPU page table over a range + * @range: range * * Range struct is used to track updates to the CPU page table after a call to - * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done - * using the data, or wants to lock updates to the data it got from those - * functions, it must call the hmm_vma_range_done() function, which will then - * stop tracking CPU page table updates. - * - * Note that device driver must still implement general CPU page table update - * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using - * the mmu_notifier API directly. - * - * CPU page table update tracking done through hmm_range is only temporary and - * to be used while trying to duplicate CPU page table contents for a range of - * virtual addresses. - * - * There are two ways to use this : - * again: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * trans = device_build_page_table_update_transaction(pfns); - * device_page_table_lock(); - * if (!hmm_vma_range_done(range)) { - * device_page_table_unlock(); - * goto again; - * } - * device_commit_transaction(trans); - * device_page_table_unlock(); - * - * Or: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * device_page_table_lock(); - * hmm_vma_range_done(range); - * device_update_page_table(range->pfns); - * device_page_table_unlock(); + * hmm_range_register(). See include/linux/hmm.h for how to use it. */ -bool hmm_vma_range_done(struct hmm_range *range) +void hmm_range_unregister(struct hmm_range *range) { - bool ret = false; - /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) { - BUG(); - return false; - } + if (range->hmm == NULL || range->end <= range->start) + return; - spin_lock(&range->hmm->lock); + mutex_lock(&range->hmm->lock); list_del_rcu(&range->list); - ret = range->valid; - spin_unlock(&range->hmm->lock); + mutex_unlock(&range->hmm->lock); - /* Is the mm still alive ? 
*/ - if (range->hmm->mm == NULL) - ret = false; - - /* Drop reference taken by hmm_vma_fault() or hmm_vma_get_pfns() */ + /* Drop reference taken by hmm_range_register() */ + range->valid = false; hmm_put(range->hmm); range->hmm = NULL; - return ret; } -EXPORT_SYMBOL(hmm_vma_range_done); +EXPORT_SYMBOL(hmm_range_unregister); + +/* + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * permission (for instance asking for write and range is read only), + * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid + * vma or it is illegal to access that range), number of valid pages + * in range->pfns[] (from range start address). + * + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See in include/linux/hmm.h for example + * on how to use. + */ +long hmm_range_snapshot(struct hmm_range *range) +{ + unsigned long start = range->start, end; + struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; + struct mm_walk mm_walk; + + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; + + do { + /* If range is no longer valid force retry. */ + if (!range->valid) + return -EAGAIN; + + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + return -EFAULT; + + /* FIXME support hugetlb fs/dax */ + if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } + + range->vma = vma; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + walk_page_range(start, end, &mm_walk); + start = end; + } while (start < range->end); + + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; +} +EXPORT_SYMBOL(hmm_range_snapshot); /* * hmm_range_fault() - try to fault some address in a virtual address range @@ -893,96 +893,79 @@ EXPORT_SYMBOL(hmm_vma_range_done); */ long hmm_range_fault(struct hmm_range *range, bool block) { - struct vm_area_struct *vma = range->vma; - unsigned long start = range->start; + unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; struct mm_walk mm_walk; - struct hmm *hmm; int ret; - range->hmm = NULL; - - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) - return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) - return -EINVAL; + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; - hmm = hmm_get_or_create(vma->vm_mm); - if (!hmm) { - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -ENOMEM; - } + do { + /* If range is no longer valid force retry. 
*/ + if (!range->valid) { + up_read(&hmm->mm->mmap_sem); + return -EAGAIN; + } - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL) { - hmm_put(hmm); - return -EINVAL; - } + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + return -EFAULT; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - hmm_put(hmm); - return -EINVAL; - } + /* FIXME support hugetlb fs/dax */ + if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + hmm_pfns_special(range); + return -EINVAL; + } - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. - */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - hmm_put(hmm); - return -EPERM; - } + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - hmm_vma_walk.last = range->start; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; + range->vma = vma; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = true; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + + /* Keep trying while the range is valid. */ + } while (ret == -EBUSY && range->valid); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(range, &range->pfns[i], + hmm_vma_walk.last, range->end); + return ret; + } + start = end; - do { - ret = walk_page_range(start, range->end, &mm_walk); - start = hmm_vma_walk.last; - /* Keep trying while the range is valid. */ - } while (ret == -EBUSY && range->valid); - - if (ret) { - unsigned long i; - - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, - range->end); - hmm_vma_range_done(range); - hmm_put(hmm); - return ret; - } else { - /* - * Transfer hmm reference to the range struct it will be drop - * inside the hmm_vma_range_done() function (which _must_ be - * call if this function return 0). 
- */ - range->hmm = hmm; - } + } while (start < range->end); return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -- cgit v1.2.3 From 023a019a9b4e90b9df8ed5be591787b5c914d74f Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:05 -0700 Subject: mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HMM mirror API can be use in two fashions. The first one where the HMM user coalesce multiple page faults into one request and set flags per pfns for of those faults. The second one where the HMM user want to pre-fault a range with specific flags. For the latter one it is a waste to have the user pre-fill the pfn arrays with a default flags value. This patch adds a default flags value allowing user to set them for a range without having to pre-fill the pfn array. Link: http://lkml.kernel.org/r/20190403193318.16478-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 35 +++++++++++++++++++++++++++++++++++ include/linux/hmm.h | 13 +++++++++++++ mm/hmm.c | 12 ++++++++++++ 3 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 945d5fb6d14a..ec1efa32af3c 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -276,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this concurrently). +Leverage default_flags and pfn_flags_mask +========================================= + +The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows +to set fault or snapshot policy for a whole range instead of having to set them +for each entries in the range. + +For instance if the device flags for device entries are: + VALID (1 << 63) + WRITE (1 << 62) + +Now let say that device driver wants to fault with at least read a range then +it does set: + range->default_flags = (1 << 63) + range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fill fault all page +in the range with at least read permission. + +Now let say driver wants to do the same except for one page in the range for +which its want to have write. Now driver set: + range->default_flags = (1 << 63); + range->pfn_flags_mask = (1 << 62); + range->pfns[index_of_write] = (1 << 62); + +With this HMM will fault in all page with at least read (ie valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission ie if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +Note that HMM will populate the pfns array with write permission for any entry +that have write permission within the CPU pte no matter what are the values set +in default_flags or pfn_flags_mask. 
+ + Represent and manage device memory from core kernel point of view ================================================================= diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ec4bfa91648f..dee2f8953b2e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -165,6 +165,8 @@ enum hmm_pfn_value_e { * @pfns: array of pfns (big enough for the range) * @flags: pfn flags to match device driver page table * @values: pfn value for some special case (none, special, error, ...) + * @default_flags: default flags for the range (write, read, ... see hmm doc) + * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -177,6 +179,8 @@ struct hmm_range { uint64_t *pfns; const uint64_t *flags; const uint64_t *values; + uint64_t default_flags; + uint64_t pfn_flags_mask; uint8_t pfn_shift; bool valid; }; @@ -448,6 +452,15 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { long ret; + /* + * With the old API the driver must set each individual entries with + * the requested flags (valid, write, ...). So here we set the mask to + * keep intact the entries provided by the driver and zero out the + * default_flags. + */ + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + ret = hmm_range_register(range, range->vma->vm_mm, range->start, range->end); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 3e07f32b94f8..0e21d3594ab6 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -419,6 +419,18 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, if (!hmm_vma_walk->fault) return; + /* + * So we not only consider the individual per page request we also + * consider the default flags requested for the range. The API can + * be use in 2 fashions. The first one where the HMM user coalesce + * multiple page fault into one request and set flags per pfns for + * of those faults. The second one where the HMM user want to pre- + * fault a range with specific flags. For the latter one it is a + * waste to have the user pre-fill the pfn arrays with a default + * flags value. + */ + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; -- cgit v1.2.3 From 63d5066f6e5a1713d0247ef38f0add545408896b Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:18 -0700 Subject: mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM mirror is a device driver helpers to mirror range of virtual address. It means that the process jobs running on the device can access the same virtual address as the CPU threads of that process. This patch adds support for hugetlbfs mapping (ie range of virtual address that are mmap of a hugetlbfs). 
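As a minimal sketch of the new interface (not part of this patch; demo_register_range() and its caller are hypothetical), a driver mirroring a hugetlbfs VMA could register the range with the hstate's page shift so that each pfns[] entry describes one huge page, while plain mappings keep using PAGE_SHIFT:

  #include <linux/hmm.h>
  #include <linux/hugetlb.h>
  #include <linux/mm.h>

  /*
   * Pick the page shift to mirror @vma with: the huge page shift for a
   * hugetlbfs mapping, PAGE_SHIFT otherwise, so one pfns[] slot covers
   * exactly one page of the mapping.
   */
  static int demo_register_range(struct hmm_range *range,
                                 struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end)
  {
          unsigned page_shift = PAGE_SHIFT;

          if (is_vm_hugetlb_page(vma))
                  page_shift = huge_page_shift(hstate_vma(vma));

          return hmm_range_register(range, vma->vm_mm, start, end,
                                    page_shift);
  }
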
[rcampbell@nvidia.com: fix initial PFN for hugetlbfs pages] Link: http://lkml.kernel.org/r/20190419233536.8080-1-rcampbell@nvidia.com Link: http://lkml.kernel.org/r/20190403193318.16478-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Ralph Campbell Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 27 +++++++++++- mm/hmm.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 134 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index dee2f8953b2e..e5834082de60 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -181,10 +181,31 @@ struct hmm_range { const uint64_t *values; uint64_t default_flags; uint64_t pfn_flags_mask; + uint8_t page_shift; uint8_t pfn_shift; bool valid; }; +/* + * hmm_range_page_shift() - return the page shift for the range + * @range: range being queried + * Returns: page shift (page size = 1 << page shift) for the range + */ +static inline unsigned hmm_range_page_shift(const struct hmm_range *range) +{ + return range->page_shift; +} + +/* + * hmm_range_page_size() - return the page size for the range + * @range: range being queried + * Returns: page size for the range in bytes + */ +static inline unsigned long hmm_range_page_size(const struct hmm_range *range) +{ + return 1UL << hmm_range_page_shift(range); +} + /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on @@ -424,7 +445,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end); + unsigned long end, + unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); @@ -462,7 +484,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->pfn_flags_mask = -1UL; ret = hmm_range_register(range, range->vma->vm_mm, - range->start, range->end); + range->start, range->end, + PAGE_SHIFT); if (ret) return (int)ret; diff --git a/mm/hmm.c b/mm/hmm.c index 0e21d3594ab6..52e40be56dc7 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -391,11 +391,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = hmm_range_page_size(range); + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -707,6 +709,69 @@ again: return 0; } +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t orig_pfn, cpu_flags; + bool 
fault, write_fault; + spinlock_t *ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. */ + if (start & mask) + return -EINVAL; + if (end < (start + size)) + return -EINVAL; + pfn_inc = size >> PAGE_SHIFT; + } else { + pfn_inc = 1; + size = PAGE_SIZE; + } + + + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); + + i = (start - range->start) >> range->page_shift; + orig_pfn = range->pfns[i]; + range->pfns[i] = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, entry); + fault = write_fault = false; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) { + ret = -ENOENT; + goto unlock; + } + + pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); + for (; addr < end; addr += size, i++, pfn += pfn_inc) + range->pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + hmm_vma_walk->last = end; + +unlock: + spin_unlock(ptl); + + if (ret == -ENOENT) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + return ret; +#else /* CONFIG_HUGETLB_PAGE */ + return -EINVAL; +#endif +} + static void hmm_pfns_clear(struct hmm_range *range, uint64_t *pfns, unsigned long addr, @@ -730,6 +795,7 @@ static void hmm_pfns_special(struct hmm_range *range) * @mm: the mm struct for the range of virtual address * @start: start virtual address (inclusive) * @end: end virtual address (exclusive) + * @page_shift: expect page shift for the range * Returns 0 on success, -EFAULT if the address space is no longer valid * * Track updates to the CPU page table see include/linux/hmm.h @@ -737,16 +803,20 @@ static void hmm_pfns_special(struct hmm_range *range) int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + unsigned page_shift) { - range->start = start & PAGE_MASK; - range->end = end & PAGE_MASK; + unsigned long mask = ((1UL << page_shift) - 1UL); + range->valid = false; range->hmm = NULL; - if (range->start >= range->end) + if ((start & mask) || (end & mask)) + return -EINVAL; + if (start >= end) return -EINVAL; + range->page_shift = page_shift; range->start = start; range->end = end; @@ -816,6 +886,7 @@ EXPORT_SYMBOL(hmm_range_unregister); */ long hmm_range_snapshot(struct hmm_range *range) { + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; struct hmm *hmm = range->hmm; @@ -832,15 +903,26 @@ long hmm_range_snapshot(struct hmm_range *range) return -EAGAIN; vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - /* FIXME support hugetlb fs/dax */ - if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + /* FIXME support dax */ + if (vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } + if (is_vm_hugetlb_page(vma)) { + struct hstate *h = hstate_vma(vma); + + if (huge_page_shift(h) != range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it @@ -866,6 +948,7 @@ long hmm_range_snapshot(struct hmm_range *range) mm_walk.hugetlb_entry = NULL; mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; + 
mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; walk_page_range(start, end, &mm_walk); start = end; @@ -884,7 +967,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * then one of the following values may be returned: * * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (ie either hugetlbfs or device file vma). + * invalid vma (for instance device file vma). * -ENOMEM: Out of memory. * -EPERM: Invalid permission (for instance asking for write and * range is read only). @@ -905,6 +988,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); */ long hmm_range_fault(struct hmm_range *range, bool block) { + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; struct hmm *hmm = range->hmm; @@ -924,15 +1008,25 @@ long hmm_range_fault(struct hmm_range *range, bool block) } vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - /* FIXME support hugetlb fs/dax */ - if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + /* FIXME support dax */ + if (vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } + if (is_vm_hugetlb_page(vma)) { + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it @@ -959,6 +1053,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) mm_walk.hugetlb_entry = NULL; mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; do { ret = walk_page_range(start, end, &mm_walk); -- cgit v1.2.3 From 202394178d027f8a1530df65d4a25229138fab62 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:24 -0700 Subject: mm/hmm: add helpers to test if mm is still alive or not MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The device driver can have kernel thread or worker doing work against a process mm and it is useful for those to test wether the mm is dead or alive to avoid doing useless work. Add an helper to test that so that driver can bail out early if a process is dying. Note that the helper does not perform any lock synchronization and thus is just a hint ie a process might be dying but the helper might still return the process as alive. All HMM functions are safe to use in that case as HMM internal properly protect itself with lock. If driver use this helper with non HMM functions it should ascertain that it is safe to do so. 
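A minimal sketch of the intended use, with a hypothetical demo_mirror_worker() standing in for a driver's kernel thread or workqueue item:

  #include <linux/hmm.h>

  /*
   * Hypothetical device worker: skip expensive mirroring work if the
   * process is already exiting. This is only an optimization hint; any
   * HMM call made afterwards still copes with a concurrent exit.
   */
  static void demo_mirror_worker(struct hmm_mirror *mirror)
  {
          if (!hmm_mirror_mm_is_alive(mirror))
                  return;

          /* ... rebuild device page tables, prefetch, etc. ... */
  }
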
Link: http://lkml.kernel.org/r/20190403193318.16478-11-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e5834082de60..a79fcc6681f5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -438,6 +438,30 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); +/* + * hmm_mirror_mm_is_alive() - test if mm is still alive + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + * Returns: false if the mm is dead, true otherwise + * + * This is an optimization it will not accurately always return -EINVAL if the + * mm is dead ie there can be false negative (process is being kill but HMM is + * not yet inform of that). It is only intented to be use to optimize out case + * where driver is about to do something time consuming and it would be better + * to skip it if the mm is dead. + */ +static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) +{ + struct mm_struct *mm; + + if (!mirror || !mirror->hmm) + return false; + mm = READ_ONCE(mirror->hmm->mm); + if (mirror->hmm->dead || !mm) + return false; + + return true; +} + /* * Please see Documentation/vm/hmm.rst for how to use the range API. -- cgit v1.2.3 From 55c0ece82ac6ad018a71465d332847dce023eeb3 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:28 -0700 Subject: mm/hmm: add a helper function that fault pages and map them to a device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a all in one helper that fault pages in a range and map them to a device so that every single device driver do not have to re-implement this common pattern. This is taken from ODP RDMA in preparation of ODP RDMA convertion. It will be use by nouveau and other drivers. 
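A rough sketch of how a driver might call the new helper; demo_fault_and_map() is hypothetical, @range is assumed to be already populated and registered, @daddrs has one slot per page, and the locking follows the helper's own comment (mmap_sem held in read mode, already dropped by HMM when -EAGAIN is returned):

  #include <linux/hmm.h>
  #include <linux/mm.h>
  #include <linux/device.h>
  #include <linux/dma-mapping.h>

  static long demo_fault_and_map(struct hmm_range *range,
                                 struct mm_struct *mm,
                                 struct device *dev, dma_addr_t *daddrs)
  {
          long ret;

          /* Wait for any concurrent invalidation to finish first. */
          if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT))
                  return -EBUSY;

          down_read(&mm->mmap_sem);
          ret = hmm_range_dma_map(range, dev, daddrs, true);
          /* On -EAGAIN the helper has already dropped mmap_sem for us. */
          if (ret != -EAGAIN)
                  up_read(&mm->mmap_sem);

          return ret;     /* pages mapped on success, negative errno otherwise */
  }
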
[jglisse@redhat.com: Was using wrong field and wrong enum] Link: http://lkml.kernel.org/r/20190409175340.26614-1-jglisse@redhat.com Link: http://lkml.kernel.org/r/20190403193318.16478-12-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Souptick Joarder Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 9 ++++ mm/hmm.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index a79fcc6681f5..f81fe2c0f343 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -474,6 +474,15 @@ int hmm_range_register(struct hmm_range *range, void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block); +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/mm/hmm.c b/mm/hmm.c index b1c9b05bf26f..95fa7abb9d67 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1182,6 +1183,157 @@ long hmm_range_fault(struct hmm_range *range, bool block) return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). + */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + /* + * FIXME need to update DMA API to provide invalid DMA address + * value instead of a function to test dma address value. This + * would remove lot of dumb code duplicated accross many arch. + * + * For now setting it to 0 here is good enough as the pfns[] + * value is what is use to check what is valid and what isn't. + */ + daddrs[i] = 0; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. 
*/ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; + } + + return mapped; + +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; + } + + return ret; +} +EXPORT_SYMBOL(hmm_range_dma_map); + +/** + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() + * @range: range being unmapped + * @vma: the vma against which the range (optional) + * @device: device against which dma map was done + * @daddrs: dma address of mapped pages + * @dirty: dirty page if it had the write flag set + * Returns: number of page unmapped on success, -EINVAL otherwise + * + * Note that caller MUST abide by mmu notifier or use HMM mirror and abide + * to the sync_cpu_device_pagetables() callback so that it is safe here to + * call set_page_dirty(). Caller must also take appropriate locks to avoid + * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. + */ +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty) +{ + unsigned long i, npages; + long cpages = 0; + + /* Sanity check. */ + if (range->end <= range->start) + return -EINVAL; + if (!daddrs) + return -EINVAL; + if (!range->pfns) + return -EINVAL; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { + dir = DMA_BIDIRECTIONAL; + + /* + * See comments in function description on why it is + * safe here to call set_page_dirty() + */ + if (dirty) + set_page_dirty(page); + } + + /* Unmap and clear pfns/dma address */ + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + range->pfns[i] = range->values[HMM_PFN_NONE]; + /* FIXME see comments in hmm_vma_dma_map() */ + daddrs[i] = 0; + cpages++; + } + + return cpages; +} +EXPORT_SYMBOL(hmm_range_dma_unmap); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3 From 391aab11e93f36c421abeab62526954d08ac3eed Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:31 -0700 Subject: mm/hmm: convert various hmm_pfn_* to device_entry which is a better name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert hmm_pfn_* to device_entry_* as here we are dealing with device driver specific entry format and hmm provide helpers to allow differents components (including HMM) to create/parse device entry. We keep wrapper with the old name so that we can convert driver to use the new API in stages in each device driver tree. This will get remove once all driver are converted. 
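A small sketch with the new names, using a hypothetical demo_walk_entries() and assuming the range was registered with PAGE_SHIFT (the old hmm_pfn_to_page() wrapper keeps working during the transition):

  #include <linux/hmm.h>
  #include <linux/mm.h>

  static void demo_walk_entries(struct hmm_range *range)
  {
          unsigned long i, npages = (range->end - range->start) >> PAGE_SHIFT;

          for (i = 0; i < npages; i++) {
                  struct page *page;

                  /* Was hmm_pfn_to_page() before this rename. */
                  page = hmm_device_entry_to_page(range, range->pfns[i]);
                  if (!page)
                          continue;

                  /* ... encode pfns[i] into the device page table ... */
          }
  }
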
Link: http://lkml.kernel.org/r/20190403193318.16478-13-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 93 ++++++++++++++++++++++++++++++++++++----------------- mm/hmm.c | 19 ++++++----- 2 files changed, 75 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index f81fe2c0f343..51ec27a84668 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -239,36 +239,36 @@ static inline bool hmm_range_valid(struct hmm_range *range) } /* - * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to get corresponding struct page from - * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise + * hmm_device_entry_to_page() - return struct page pointed to by a device entry + * @range: range use to decode device entry value + * @entry: device entry value to get corresponding struct page from + * Returns: struct page pointer if entry is a valid, NULL otherwise * - * If the HMM pfn is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the HMM pfn. Otherwise return NULL. + * If the device entry is valid (ie valid flag set) then return the struct page + * matching the entry value. Otherwise return NULL. */ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) +static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range, + uint64_t entry) { - if (pfn == range->values[HMM_PFN_NONE]) + if (entry == range->values[HMM_PFN_NONE]) return NULL; - if (pfn == range->values[HMM_PFN_ERROR]) + if (entry == range->values[HMM_PFN_ERROR]) return NULL; - if (pfn == range->values[HMM_PFN_SPECIAL]) + if (entry == range->values[HMM_PFN_SPECIAL]) return NULL; - if (!(pfn & range->flags[HMM_PFN_VALID])) + if (!(entry & range->flags[HMM_PFN_VALID])) return NULL; - return pfn_to_page(pfn >> range->pfn_shift); + return pfn_to_page(entry >> range->pfn_shift); } /* - * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to extract pfn from - * Returns: pfn value if HMM pfn is valid, -1UL otherwise + * hmm_device_entry_to_pfn() - return pfn value store in a device entry + * @range: range use to decode device entry value + * @entry: device entry to extract pfn from + * Returns: pfn value if device entry is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) +static inline unsigned long +hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) { if (pfn == range->values[HMM_PFN_NONE]) return -1UL; @@ -282,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, } /* - * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value - * @page: struct page pointer for which to create the HMM pfn - * Returns: valid HMM pfn for the page + * @page: page for which to create the device entry + * Returns: valid device entry for the page */ -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, - struct page *page) +static 
inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, + struct page *page) { return (page_to_pfn(page) << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } /* - * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the HMM pfn - * Returns: valid HMM pfn for the pfn + * @pfn: pfn value for which to create the device entry + * Returns: valid device entry for the pfn */ -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) +static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, + unsigned long pfn) { return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } +/* + * Old API: + * hmm_pfn_to_page() + * hmm_pfn_to_pfn() + * hmm_pfn_from_page() + * hmm_pfn_from_pfn() + * + * This are the OLD API please use new API, it is here to avoid cross-tree + * merge painfullness ie we convert things to new API in stages. + */ +static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, + uint64_t pfn) +{ + return hmm_device_entry_to_page(range, pfn); +} + +static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, + uint64_t pfn) +{ + return hmm_device_entry_to_pfn(range, pfn); +} + +static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, + struct page *page) +{ + return hmm_device_entry_from_page(range, page); +} + +static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, + unsigned long pfn) +{ + return hmm_device_entry_from_pfn(range, pfn); +} + + #if IS_ENABLED(CONFIG_HMM_MIRROR) /* diff --git a/mm/hmm.c b/mm/hmm.c index 95fa7abb9d67..44a238642b1d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -543,7 +543,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; } - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; } if (hmm_vma_walk->pgmap) { put_dev_pagemap(hmm_vma_walk->pgmap); @@ -611,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, &fault, &write_fault); if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn = hmm_device_entry_from_pfn(range, + swp_offset(entry)); *pfn |= cpu_flags; return 0; } @@ -649,7 +650,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, return -EFAULT; } - *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: @@ -803,7 +804,8 @@ again: hmm_vma_walk->pgmap); if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; } if (hmm_vma_walk->pgmap) { put_dev_pagemap(hmm_vma_walk->pgmap); @@ -879,7 +881,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); for (; addr < end; addr += size, i++, pfn += pfn_inc) - range->pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; hmm_vma_walk->last = end; unlock: @@ -1222,7 +1225,7 @@ long hmm_range_dma_map(struct hmm_range *range, */ daddrs[i] = 0; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) 
continue; @@ -1252,7 +1255,7 @@ unmap: enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; @@ -1307,7 +1310,7 @@ long hmm_range_dma_unmap(struct hmm_range *range, enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; -- cgit v1.2.3 From 4a83bfe916f3d2100df5bc8389bd182a537ced3e Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:34 -0700 Subject: mm/mmu_notifier: helper to test if a range invalidation is blockable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mmu notifier provide context informations", v6. Here I am not posting users of this, they already have been posted to appropriate mailing list [6] and will be merge through the appropriate tree once this patchset is upstream. Note that this serie does not change any behavior for any existing code. It just pass down more information to mmu notifier listener. The rationale for this patchset: CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patchset introduce a set of enums that can be associated with each of the events triggering a mmu notifier: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Without this serie, driver are force to assume that every notification is an munmap which triggers useless trashing within drivers that associate structure with range of virtual address. Each driver is force to free up its tracking structure and then restore it on next device page fault. With this series we can also optimize device page table update. Patches to use this are at https://lkml.org/lkml/2019/1/23/833 https://lkml.org/lkml/2019/1/23/834 https://lkml.org/lkml/2019/1/23/832 https://lkml.org/lkml/2019/1/23/831 Moreover this can also be used to optimize out some page table updates such as for KVM where we can update the secondary MMU directly from the callback instead of clearing it. ACKS AMD/RADEON https://lkml.org/lkml/2019/2/1/395 ACKS RDMA https://lkml.org/lkml/2018/12/6/1473 This patch (of 8): Simple helpers to test if range invalidation is blockable. Latter patches use cocinnelle to convert all direct dereference of range-> blockable to use this function instead so that we can convert the blockable field to an unsigned for more flags. 
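A minimal sketch of a listener using the helper instead of dereferencing range->blockable directly; demo_invalidate_range_start() and demo_lock are hypothetical driver-side names:

  #include <linux/mmu_notifier.h>
  #include <linux/mutex.h>

  static DEFINE_MUTEX(demo_lock);         /* hypothetical device mirror lock */

  static int demo_invalidate_range_start(struct mmu_notifier *mn,
                                  const struct mmu_notifier_range *range)
  {
          if (mmu_notifier_range_blockable(range))
                  mutex_lock(&demo_lock);
          else if (!mutex_trylock(&demo_lock))
                  return -EAGAIN;

          /* ... tear down device mappings for [range->start, range->end) ... */

          mutex_unlock(&demo_lock);
          return 0;
  }
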
Link: http://lkml.kernel.org/r/20190326164747.24405-2-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..e630def131ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return range->blockable; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, #define mmu_notifier_range_init(range, mm, start, end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { -- cgit v1.2.3 From 27560ee96f40017075bcb975b85f85dae3622f01 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:42 -0700 Subject: mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use an unsigned field for flags other than blockable and convert the blockable field to be one of those flags. 
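Purely illustrative sketch of the point of the conversion: the unsigned word leaves room for more bits, so a future flag could be defined and tested without touching existing listeners. DEMO_NOTIFIER_RANGE_FUTURE is not a real kernel flag:

  #include <linux/mmu_notifier.h>

  #define DEMO_NOTIFIER_RANGE_FUTURE      (1 << 1)    /* hypothetical */

  static inline bool
  demo_range_has_future_flag(const struct mmu_notifier_range *range)
  {
          return range->flags & DEMO_NOTIFIER_RANGE_FUTURE;
  }
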
Link: http://lkml.kernel.org/r/20190326164747.24405-4-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e630def131ce..c8672c366f67 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,11 +25,13 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; }; struct mmu_notifier_ops { @@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { - return range->blockable; + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) @@ -275,7 +277,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -284,7 +286,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; return __mmu_notifier_invalidate_range_start(range); } return 0; @@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, range->mm = mm; range->start = start; range->end = end; + range->flags = 0; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- cgit v1.2.3 From d87f055b94ea9270c491b5e650dd776ecc30d7c9 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:45 -0700 Subject: mm/mmu_notifier: contextual information for event enums MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). 
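A hypothetical policy helper a listener could build once the event value reaches the callback (as later patches in this series arrange); only UNMAP means the tracked virtual address range ceases to exist:

  #include <linux/mmu_notifier.h>

  static bool demo_should_drop_tracking(enum mmu_notifier_event event)
  {
          switch (event) {
          case MMU_NOTIFY_UNMAP:
                  return true;    /* munmap()/mremap(): range is gone */
          case MMU_NOTIFY_CLEAR:
          case MMU_NOTIFY_PROTECTION_VMA:
          case MMU_NOTIFY_PROTECTION_PAGE:
          case MMU_NOTIFY_SOFT_DIRTY:
          default:
                  return false;   /* mapping persists, just refresh it */
          }
  }
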
Link: http://lkml.kernel.org/r/20190326164747.24405-5-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c8672c366f67..2386e71ac1b8 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- cgit v1.2.3 From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table updates can happen for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of the mmu notifier API track changes to the CPU page table and take specific actions for them. The current API, however, only provides the range of virtual addresses affected by the change, not why the change is happening. This patchset does the initial mechanical conversion of all the places that call mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event, as well as the vma if it is known (most invalidations happen against a given vma). Passing down the vma allows users of the mmu notifier to inspect the new vma page protection. MMU_NOTIFY_UNMAP is always the safe default, as users of the mmu notifier should assume that everything in the range is going away when that event happens. A later patch converts the mm call paths to use a more appropriate event for each call.
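In condensed form, the mechanical conversion turns a generic call site from

	mmu_notifier_range_init(&range, mm, start, end);

into

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, end);

with NULL passed in place of the vma where no vm_area_struct is at hand (this is an editorial summary of the coccinelle rule quoted next, not an extra call site).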
This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<---------------------------------------------------------------------- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } ---------------------------------------------------------------------->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 5 ++++- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 ++++++++---- mm/hugetlb.c | 12 ++++++++---- mm/khugepaged.c | 3 ++- mm/ksm.c | 6 ++++-- mm/madvise.c | 3 ++- mm/memory.c | 25 ++++++++++++++++--------- mm/migrate.c | 5 ++++- mm/mprotect.c | 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c | 3 ++- mm/rmap.c | 6 ++++-- 14 files changed, 62 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 95ca1fe7283c..ea464f2b9867 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(&range, mm, 0, -1UL); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2386e71ac1b8..62f94cd85455 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -491,7 +494,7 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define 
mmu_notifier_range_init(range, mm, start, end) \ +#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ _mmu_notifier_range_init(range, start, end) static inline bool diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 50c665b12cf1..428b5794f4b8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1224,7 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, vma->vm_mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1388,7 +1389,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, vma->vm_mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2064,7 +2066,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); @@ -2282,7 +2285,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98a3c7c224cb..89d206d6ecf3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3294,7 +3294,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, src, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, src, + vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); } @@ -3406,7 +3407,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If sharing possible, alert mmu notifiers of worst case. 
*/ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); address = start; @@ -3673,7 +3675,8 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, haddr, + haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* @@ -4408,7 +4411,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7ba7a1e4fa79..14581dbf62a5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* diff --git a/mm/ksm.c b/mm/ksm.c index fc64874dc6f4..01f5fe2c90cf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, mm, pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); diff --git a/mm/madvise.c b/mm/madvise.c index bb3a4554d5d5..1c52bdf1b696 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, mm, range.start, range.end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm, range.start, range.end); diff --git a/mm/memory.c b/mm/memory.c index f7d962d7de19..90672674c582 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, + src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb, { struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + 
start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); @@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -2279,7 +2283,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -4104,8 +4109,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PMD_MASK, - (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, + NULL, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); @@ -4122,8 +4128,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PAGE_MASK, - (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/migrate.c b/mm/migrate.c index a1770403ff7f..855bdb3b3333 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm_walk.mm, + migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); @@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, + MMU_NOTIFY_UNMAP, 0, + NULL, migrate->vma->vm_mm, addr, migrate->end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mprotect.c b/mm/mprotect.c index 028c724dcb1a..b10984052ae9 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,7 +185,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index 
e3edef6b7a12..fc241d23cd97 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + old_addr, old_end); mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a2484884cfd..539c91d0b26a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, mm, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm, range.start, range.end); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/rmap.c b/mm/rmap.c index 76c8dfd3ae1c..288e636b7813 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { -- cgit v1.2.3 From bf198b2b34bfd4bc9bd6abb33bf650b74329a2ac Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:57 -0700 Subject: mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening This patch is just passing down the new informations by adding it to the mmu_notifier_range structure. 
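For reference, pieced together from the hunks below, the range structure after this patch carries:

struct mmu_notifier_range {
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
	unsigned flags;
	enum mmu_notifier_event event;
};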
Link: http://lkml.kernel.org/r/20190326164747.24405-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 62f94cd85455..0379956fff23 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -58,10 +58,12 @@ struct mmu_notifier_mm { #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; - range->flags = 0; + range->flags = flags; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- cgit v1.2.3 From c6d23413f81bd69935afedaf1da9d55b03febf58 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:21:00 -0700 Subject: mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device driver or anyone who wish to optimize out update when they know that they already have the range map read only. 
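As a usage sketch (struct example_mirror and its field are hypothetical, invented for illustration; only mmu_notifier_range_update_to_read_only() comes from the patch below), a driver that already maps the range read-only on the device side could skip its update work:

static bool example_can_skip_update(struct example_mirror *mirror,
			const struct mmu_notifier_range *range)
{
	/* Device copy is already read-only; a CPU write-protect changes nothing for it. */
	return mirror->range_is_device_read_only &&
	       mmu_notifier_range_update_to_read_only(range);
}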
Link: http://lkml.kernel.org/r/20190326164747.24405-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 4 ++++ mm/mmu_notifier.c | 10 ++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 0379956fff23..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) @@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index abd88c466eb2..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- cgit v1.2.3 From 5470dea49f5382257c242ac617d908267727f1a8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:10 -0700 Subject: mm: use mm_zero_struct_page from SPARC on all 64b architectures Patch series "Deferred page init improvements", v7. This patchset is essentially a refactor of the page initialization logic that is meant to provide for better code reuse while providing a significant improvement in deferred page initialization performance. In my testing on an x86_64 system with 384GB of RAM I have seen the following. In the case of regular memory initialization the deferred init time was decreased from 3.75s to 1.38s on average. This amounts to a 172% improvement for the deferred memory initialization performance. I have called out the improvement observed with each patch. This patch (of 4): Use the same approach that was already in use on Sparc on all the architectures that support a 64b long. This is mostly motivated by the fact that 7 to 10 store/move instructions are likely always going to be faster than having to call into a function that is not specialized for handling page init. An added advantage to doing it this way is that the compiler can get away with combining writes in the __init_single_page call. 
As a result the memset call will be reduced to only about 4 write operations, or at least that is what I am seeing with GCC 6.2 as the flags, LRU pointers, and count/mapcount seem to be cancelling out at least 4 of the 8 assignments on my system. One change I had to make to the function was to reduce the minimum page size to 56 to support some powerpc64 configurations. This change should introduce no change on SPARC since it already had this code. In the case of x86_64 I saw a reduction from 3.75s to 2.80s when initializing 384GB of RAM per node. Pavel Tatashin tested on a system with Broadcom's Stingray CPU and 48GB of RAM and found that __init_single_page() takes 19.30ns / 64-byte struct page before this patch and with this patch it takes 17.33ns / 64-byte struct page. Mike Rapoport ran a similar test on a OpenPower (S812LC 8348-21C) with Power8 processor and 128GB or RAM. His results per 64-byte struct page were 4.68ns before, and 4.59ns after this patch. Link: http://lkml.kernel.org/r/20190405221213.12227.9392.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mike Rapoport Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 30 --------------------------- include/linux/mm.h | 41 ++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1393a8ac596b..22500c3be7a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) -/* This macro must be updated when the size of struct page grows above 80 - * or reduces below 64. - * The idea that compiler optimizes out switch() statement, and only - * leaves clrx instructions - */ -#define mm_zero_struct_page(pp) do { \ - unsigned long *_pp = (void *)(pp); \ - \ - /* Check that struct page is either 64, 72, or 80 bytes */ \ - BUILD_BUG_ON(sizeof(struct page) & 7); \ - BUILD_BUG_ON(sizeof(struct page) < 64); \ - BUILD_BUG_ON(sizeof(struct page) > 80); \ - \ - switch (sizeof(struct page)) { \ - case 80: \ - _pp[9] = 0; /* fallthrough */ \ - case 72: \ - _pp[8] = 0; /* fallthrough */ \ - default: \ - _pp[7] = 0; \ - _pp[6] = 0; \ - _pp[5] = 0; \ - _pp[4] = 0; \ - _pp[3] = 0; \ - _pp[2] = 0; \ - _pp[1] = 0; \ - _pp[0] = 0; \ - } \ -} while (0) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b6be15609e..abb7eb7ef0f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly; /* * On some architectures it is expensive to call memset() for small sizes. - * Those architectures should provide their own implementation of "struct page" - * zeroing by defining this macro in . 
+ * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in */ -#ifndef mm_zero_struct_page +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 80 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. Also the compiler can + * combine write statments if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. + */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, or 80 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 80); + + switch (sizeof(struct page)) { + case 80: + _pp[9] = 0; /* fallthrough */ + case 72: + _pp[8] = 0; /* fallthrough */ + case 64: + _pp[7] = 0; /* fallthrough */ + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif -- cgit v1.2.3 From 837566e7e08e3f89444166444836a8a49b9f9322 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:17 -0700 Subject: mm: implement new zone specific memblock iterator Introduce a new iterator for_each_free_mem_pfn_range_in_zone. This iterator will take care of making sure a given memory range provided is in fact contained within a zone. It takes are of all the bounds checking we were doing in deferred_grow_zone, and deferred_init_memmap. In addition it should help to speed up the search a bit by iterating until the end of a range is greater than the start of the zone pfn range, and will exit completely if the start is beyond the end of the zone. Link: http://lkml.kernel.org/r/20190405221225.12227.22573.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Reviewed-by: Mike Rapoport Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 25 +++++++++++++++++++ mm/memblock.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 31 +++++++++-------------- 3 files changed, 101 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 294d5d80e150..f8b78892b977 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -240,6 +240,31 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, + unsigned long *out_epfn); +/** + * for_each_free_mem_range_in_zone - iterate through zone specific free + * memblock areas + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone. Available once memblock and an empty zone is initialized. The main + * assumption is that the zone start, end, and pgdat have been associated. + * This way we can use the zone to determine NUMA node, and if a given part + * of the memblock is valid for the zone. + */ +#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ + for (i = 0, \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ + i != U64_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable diff --git a/mm/memblock.c b/mm/memblock.c index a48f520c2d01..f315eca9f4a1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/** + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() + * + * @idx: pointer to u64 loop variable + * @zone: zone in which all of the memory blocks reside + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL + * + * This function is meant to be a zone/pfn specific wrapper for the + * for_each_mem_range type iterators. Specifically they are used in the + * deferred memory init routines and as such we were duplicating much of + * this logic throughout the code. So instead of having it in multiple + * locations it seemed like it would make more sense to centralize this to + * one new iterator that does everything they need. 
+ */ +void __init_memblock +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, unsigned long *out_epfn) +{ + int zone_nid = zone_to_nid(zone); + phys_addr_t spa, epa; + int nid; + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + + while (*idx != U64_MAX) { + unsigned long epfn = PFN_DOWN(epa); + unsigned long spfn = PFN_UP(spa); + + /* + * Verify the end is at least past the start of the zone and + * that we have at least one PFN to initialize. + */ + if (zone->zone_start_pfn < epfn && spfn < epfn) { + /* if we went too far just stop searching */ + if (zone_end_pfn(zone) <= spfn) { + *idx = U64_MAX; + break; + } + + if (out_spfn) + *out_spfn = max(zone->zone_start_pfn, spfn); + if (out_epfn) + *out_epfn = min(zone_end_pfn(zone), epfn); + + return; + } + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + } + + /* signal end of iteration */ + if (out_spfn) + *out_spfn = ULONG_MAX; + if (out_epfn) + *out_epfn = 0; +} + +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * memblock_alloc_range_nid - allocate boot memory block diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25b82be438d7..fd42321c02f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1631,11 +1631,9 @@ static unsigned long __init deferred_init_pages(struct zone *zone, static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; - int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; unsigned long spfn, epfn, first_init_pfn, flags; - phys_addr_t spa, epa; int zid; struct zone *zone; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -1672,14 +1670,12 @@ static int __init deferred_init_memmap(void *data) * freeing pages we can access pages that are ahead (computing buddy * page in __free_one_page()). 
*/ - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); nr_pages += deferred_init_pages(zone, spfn, epfn); } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); deferred_free_pages(spfn, epfn); } pgdat_resize_unlock(pgdat, &flags); @@ -1687,8 +1683,8 @@ static int __init deferred_init_memmap(void *data) /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, - jiffies_to_msecs(jiffies - start)); + pr_info("node %d initialised, %lu pages in %ums\n", + pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1712,13 +1708,11 @@ static int __init deferred_init_memmap(void *data) static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int nid = zone_to_nid(zone); - pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + pg_data_t *pgdat = zone->zone_pgdat; unsigned long nr_pages = 0; unsigned long first_init_pfn, spfn, epfn, t, flags; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; - phys_addr_t spa, epa; u64 i; /* Only the last zone may have deferred pages */ @@ -1754,9 +1748,8 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return false; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); while (spfn < epfn && nr_pages < nr_pages_needed) { t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); @@ -1770,9 +1763,9 @@ deferred_grow_zone(struct zone *zone, unsigned int order) break; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); + epfn = min_t(unsigned long, first_deferred_pfn, epfn); deferred_free_pages(spfn, epfn); if (first_deferred_pfn == epfn) -- cgit v1.2.3 From 0e56acae4b4dd4a9fbe897854ab83a109e2a9e11 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:20 -0700 Subject: mm: initialize MAX_ORDER_NR_PAGES at a time instead of doing larger sections Add yet another iterator, for_each_free_mem_range_in_zone_from, and then use it to support initializing and freeing pages in groups no larger than MAX_ORDER_NR_PAGES. By doing this we can greatly improve the cache locality of the pages while we do several loops over them in the init and freeing process. 
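In condensed form (lifted from the page_alloc.c hunks further down), the per-zone walk becomes

	while (spfn < epfn)
		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);

where each deferred_init_maxorder() call initializes and then frees at most one MAX_ORDER aligned block before moving on.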
We are able to tighten the loops further as a result of the "from" iterator as we can perform the initial checks for first_init_pfn in our first call to the iterator, and continue without the need for those checks via the "from" iterator. I have added this functionality in the function called deferred_init_mem_pfn_range_in_zone that primes the iterator and causes us to exit if we encounter any failure. On my x86_64 test system with 384GB of memory per node I saw a reduction in initialization time from 1.85s to 1.38s as a result of this patch. Link: http://lkml.kernel.org/r/20190405221231.12227.85836.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Cc: Mike Rapoport Cc: Michal Hocko Cc: Dave Jiang Cc: Matthew Wilcox Cc: Ingo Molnar Cc: Cc: Khalid Aziz Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Dan Williams Cc: Laurent Dufour Cc: Mel Gorman Cc: David S. Miller Cc: "Kirill A. Shutemov" Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 +++++ mm/page_alloc.c | 162 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 137 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f8b78892b977..47e3c0612592 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -263,6 +263,22 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +/** + * for_each_free_mem_range_in_zone_from - iterate through zone specific + * free memblock areas from a given point + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone, continuing from current position. Available as soon as memblock is + * initialized. + */ +#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ + for (; i != U64_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd42321c02f0..96ca65636e40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1627,16 +1627,100 @@ static unsigned long __init deferred_init_pages(struct zone *zone, return (nr_pages); } +/* + * This function is meant to pre-load the iterator for the zone init. + * Specifically it walks through the ranges until we are caught up to the + * first_init_pfn value and exits there. If we never encounter the value we + * return false indicating there are no valid ranges left. + */ +static bool __init +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, + unsigned long *spfn, unsigned long *epfn, + unsigned long first_init_pfn) +{ + u64 j; + + /* + * Start out by walking through the ranges in this zone that have + * already been initialized. We don't need to do anything with them + * so we just need to flush them out of the system. + */ + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + if (*epfn <= first_init_pfn) + continue; + if (*spfn < first_init_pfn) + *spfn = first_init_pfn; + *i = j; + return true; + } + + return false; +} + +/* + * Initialize and free pages. 
We do it in two loops: first we initialize + * struct page, then free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + * + * In order to try and keep some memory in the cache we have the loop + * broken along max page order boundaries. This way we will not cause + * any issues with the buddy page computation. + */ +static unsigned long __init +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); + unsigned long spfn = *start_pfn, epfn = *end_pfn; + unsigned long nr_pages = 0; + u64 j = *i; + + /* First we loop through and initialize the page values */ + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { + unsigned long t; + + if (mo_pfn <= *start_pfn) + break; + + t = min(mo_pfn, *end_pfn); + nr_pages += deferred_init_pages(zone, *start_pfn, t); + + if (mo_pfn < *end_pfn) { + *start_pfn = mo_pfn; + break; + } + } + + /* Reset values and now loop through freeing pages as needed */ + swap(j, *i); + + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { + unsigned long t; + + if (mo_pfn <= spfn) + break; + + t = min(mo_pfn, epfn); + deferred_free_pages(spfn, t); + + if (mo_pfn <= epfn) + break; + } + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long first_init_pfn, flags; unsigned long start = jiffies; - unsigned long nr_pages = 0; - unsigned long spfn, epfn, first_init_pfn, flags; - int zid; struct zone *zone; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + int zid; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1662,22 +1746,20 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_init_pfn)) + goto zone_empty; /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, than free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. 
*/ - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - nr_pages += deferred_init_pages(zone, spfn, epfn); - } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - deferred_free_pages(spfn, epfn); - } + while (spfn < epfn) + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); +zone_empty: pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ @@ -1710,9 +1792,9 @@ deferred_grow_zone(struct zone *zone, unsigned int order) { unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); pg_data_t *pgdat = zone->zone_pgdat; - unsigned long nr_pages = 0; - unsigned long first_init_pfn, spfn, epfn, t, flags; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + unsigned long spfn, epfn, flags; + unsigned long nr_pages = 0; u64 i; /* Only the last zone may have deferred pages */ @@ -1741,37 +1823,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return true; } - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); - - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_deferred_pfn)) { + pgdat->first_deferred_pfn = ULONG_MAX; pgdat_resize_unlock(pgdat, &flags); - return false; + return true; } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); + /* + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. + */ + while (spfn < epfn) { + /* update our first deferred PFN for this section */ + first_deferred_pfn = spfn; + + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); - while (spfn < epfn && nr_pages < nr_pages_needed) { - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); - first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(zone, spfn, - first_deferred_pfn); - spfn = first_deferred_pfn; - } + /* We should only stop along section boundaries */ + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) + continue; + /* If our quota has been met we can stop here */ if (nr_pages >= nr_pages_needed) break; } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - epfn = min_t(unsigned long, first_deferred_pfn, epfn); - deferred_free_pages(spfn, epfn); - - if (first_deferred_pfn == epfn) - break; - } - pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat->first_deferred_pfn = spfn; pgdat_resize_unlock(pgdat, &flags); return nr_pages > 0; -- cgit v1.2.3 From 5557c766abad25acc8091ccb9641b96e3b3da06f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:24 -0700 Subject: mm, memory_hotplug: cleanup memory offline path check_pages_isolated_cb currently accounts the whole pfn range as being offlined if test_pages_isolated suceeds on the range. This is based on the assumption that all pages in the range are freed which is currently the case in most cases but it won't be with later changes, as pages marked as vmemmap won't be isolated. Move the offlined pages counting to offline_isolated_pages_cb and rely on __offline_isolated_pages to return the correct value. check_pages_isolated_cb will still do it's primary job and check the pfn range. 
While we are at it remove check_pages_isolated and offline_isolated_pages and use directly walk_system_ram_range as do in online_pages. Link: http://lkml.kernel.org/r/20190408082633.2864-2-osalvador@suse.de Reviewed-by: David Hildenbrand Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 45 +++++++++++------------------------------- mm/page_alloc.c | 11 +++++++++-- 3 files changed, 22 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8ade08c50d26..3c8cf347804c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -87,7 +87,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); extern int online_pages(unsigned long, unsigned long, int); extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); -extern void __offline_isolated_pages(unsigned long, unsigned long); +extern unsigned long __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a279671b9968..75f9f6590677 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1449,15 +1449,10 @@ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) { - __offline_isolated_pages(start, start + nr_pages); - return 0; -} + unsigned long *offlined_pages = (unsigned long *)data; -static void -offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) -{ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, - offline_isolated_pages_cb); + *offlined_pages += __offline_isolated_pages(start, start + nr_pages); + return 0; } /* @@ -1467,26 +1462,7 @@ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { - int ret; - long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); - offlined = nr_pages; - if (!ret) - *(long *)data += offlined; - return ret; -} - -static long -check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) -{ - long offlined = 0; - int ret; - - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, - check_pages_isolated_cb); - if (ret < 0) - offlined = (long)ret; - return offlined; + return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); } static int __init cmdline_parse_movable_node(char *p) @@ -1571,7 +1547,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, nr_pages; - long offlined_pages; + unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; unsigned long valid_start, valid_end; @@ -1647,14 +1623,15 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal_isolated; } /* check again */ - offlined_pages = check_pages_isolated(start_pfn, end_pfn); - } while (offlined_pages < 0); + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + NULL, check_pages_isolated_cb); + } while (ret); - pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. 
*/ - offline_isolated_pages(start_pfn, end_pfn); - + walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &offlined_pages, offline_isolated_pages_cb); + pr_info("Offlined Pages %ld\n", offlined_pages); /* * Onlining will reset pagetype flags and makes migrate type * MOVABLE, so just need to decrease the number of isolated diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96ca65636e40..c45da9fe3ce1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8453,7 +8453,7 @@ void zone_pcp_reset(struct zone *zone) * All pages in the range must be in a single zone and isolated * before calling this. */ -void +unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; @@ -8461,12 +8461,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned int order, i; unsigned long pfn; unsigned long flags; + unsigned long offlined_pages = 0; + /* find the first valid pfn */ for (pfn = start_pfn; pfn < end_pfn; pfn++) if (pfn_valid(pfn)) break; if (pfn == end_pfn) - return; + return offlined_pages; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); @@ -8484,12 +8487,14 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; SetPageReserved(page); + offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); + offlined_pages += 1 << order; #ifdef CONFIG_DEBUG_VM pr_info("remove from free list %lx %d %lx\n", pfn, 1 << order, end_pfn); @@ -8502,6 +8507,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); + + return offlined_pages; } #endif -- cgit v1.2.3 From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where do we allocate the memmap from by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which contains additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and altmap for alternative memmap allocator. This patch shouldn't introduce any functional change. 
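For illustration (a hypothetical caller, not part of the patch; nid, start and size stand in for the usual hotplug parameters), a hotplug path that wants the sysfs memblock files and no special memmap allocator would now pass:

static int example_hot_add(int nid, u64 start, u64 size)
{
	struct mhp_restrictions restrictions = {
		.flags	= MHP_MEMBLOCK_API,	/* create the sysfs memblock devices */
		/* .altmap left NULL: memmap comes from the regular allocator */
	};

	return arch_add_memory(nid, start, size, &restrictions);
}

A ZONE_DEVICE path instead fills in .altmap and leaves .flags at zero, as the kernel/memremap.c hunk further down does.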
[akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 6 +++--- arch/ia64/mm/init.c | 6 +++--- arch/powerpc/mm/mem.c | 6 +++--- arch/s390/mm/init.c | 6 +++--- arch/sh/mm/init.c | 6 +++--- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/init_64.c | 10 +++++----- include/linux/memory_hotplug.h | 31 ++++++++++++++++++++++++------- kernel/memremap.c | 12 +++++++++--- mm/memory_hotplug.c | 11 +++++++---- 10 files changed, 63 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef82312860ac..ef32d4839c3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { int flags = 0; @@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, - altmap, want_memblock); + restrictions); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index e49200e31750..379eb1f9adc9 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -666,14 +666,14 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 20266898f3a8..de5c591a550d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -127,7 +127,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm } flush_inval_dcache_range(start, start + size); - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5f48fc7e61d5..06bd05137a00 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -219,8 +219,8 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = 
PFN_DOWN(size); @@ -230,7 +230,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); + rc = __add_pages(nid, start_pfn, size_pages, restrictions); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index aeb9f45c7a39..d3cd07bd2dc1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -404,15 +404,15 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85c94f9a87f8..755dbed85531 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -850,13 +850,13 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..db42c11b48fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock) + struct mhp_restrictions *restrictions) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return add_pages(nid, start_pfn, nr_pages, restrictions); } #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf347804c..b24aca54353e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -53,6 +53,16 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* + * Restrictions for the memory hotplug: + * flags: MHP_ flags + * altmap: alternative allocator for memmap array + */ +struct mhp_restrictions { + unsigned long flags; + struct vmem_altmap *altmap; +}; + /* * Zone resizing functions * @@ -101,6 +111,8 @@ extern void __online_page_free(struct page *page); extern int 
try_online_node(int nid); +extern int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions); extern u64 max_mem_size; extern bool memhp_auto_online; @@ -118,20 +130,27 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ +/* + * Do we want sysfs memblock files created. This will allow userspace to online + * and offline memory explicitly. Lack of this bit means that the caller has to + * call move_pfn_range_to_zone to finish the initialization. + */ + +#define MHP_MEMBLOCK_API (1<<0) + /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA @@ -332,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); -extern int arch_add_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 75f9f6590677..339d5a62d5d5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -273,12 +273,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. 
*/ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap = restrictions->altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); @@ -299,7 +299,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + restrictions->flags & MHP_MEMBLOCK_API); /* * EEXIST is finally dealt with by ioresource collision @@ -1097,6 +1097,9 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { + struct mhp_restrictions restrictions = { + .flags = MHP_MEMBLOCK_API, + }; u64 start, size; bool new_node = false; int ret; @@ -1124,7 +1127,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, &restrictions); if (ret < 0) goto error; -- cgit v1.2.3 From cb7b3a3685b20d3b5900ff24b2cb96d002960189 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:37 -0700 Subject: mm/memory_hotplug: make unregister_memory_section() never fail Failing while removing memory is mostly ignored and cannot really be handled. Let's treat errors in unregister_memory_section() in a nice way, warning, but continuing. Link: http://lkml.kernel.org/r/20190409100148.24703-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Andrew Banman Cc: Mike Travis Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Qian Cai Cc: Wei Yang Cc: Arun KS Cc: Mathieu Malaterre Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Rob Herring Cc: Stefan Agner Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 16 +++++----------- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 4 +--- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 0c9e22ffa47a..f180427e48f4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -734,15 +734,18 @@ unregister_memory(struct memory_block *memory) { BUG_ON(memory->dev.bus != &memory_subsys); - /* drop the ref. we got in remove_memory_section() */ + /* drop the ref. 
we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); } -static int remove_memory_section(struct mem_section *section) +void unregister_memory_section(struct mem_section *section) { struct memory_block *mem; + if (WARN_ON_ONCE(!present_section(section))) + return; + mutex_lock(&mem_sysfs_mutex); /* @@ -763,15 +766,6 @@ static int remove_memory_section(struct mem_section *section) out_unlock: mutex_unlock(&mem_sysfs_mutex); - return 0; -} - -int unregister_memory_section(struct mem_section *section) -{ - if (!present_section(section)) - return -EINVAL; - - return remove_memory_section(section); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/include/linux/memory.h b/include/linux/memory.h index a6ddefc60517..e1dc1bb2b787 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int hotplug_memory_register(int nid, struct mem_section *section); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int unregister_memory_section(struct mem_section *); +extern void unregister_memory_section(struct mem_section *); #endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 65f166ec2e4c..1f3707ab7a63 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -526,9 +526,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, if (!valid_section(ms)) return ret; - ret = unregister_memory_section(ms); - if (ret) - return ret; + unregister_memory_section(ms); scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); -- cgit v1.2.3 From ac5c94264580f498e484c854031d0226b3c1038f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:46 -0700 Subject: mm/memory_hotplug: make __remove_pages() and arch_remove_memory() never fail All callers of arch_remove_memory() ignore errors. And we should really try to remove any errors from the memory removal path. No more errors are reported from __remove_pages(). BUG() in s390x code in case arch_remove_memory() is triggered. We may implement that properly later. WARN in case powerpc code failed to remove the section mapping, which is better than ignoring the error completely right now. Link: http://lkml.kernel.org/r/20190409100148.24703-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Christophe Leroy Cc: Stefan Agner Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Vasily Gorbik Cc: Arun KS Cc: Geert Uytterhoeven Cc: Masahiro Yamada Cc: Rob Herring Cc: Joonsoo Kim Cc: Wei Yang Cc: Qian Cai Cc: Mathieu Malaterre Cc: Andrew Banman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Mike Travis Cc: Oscar Salvador Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 11 +++-------- arch/powerpc/mm/mem.c | 9 +++------ arch/s390/mm/init.c | 5 +++-- arch/sh/mm/init.c | 11 +++-------- arch/x86/mm/init_32.c | 5 +++-- arch/x86/mm/init_64.c | 10 +++------- include/linux/memory_hotplug.h | 8 ++++---- mm/memory_hotplug.c | 5 ++--- 8 files changed, 24 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 379eb1f9adc9..d28e29103bdb 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (ret) - pr_warn("%s: Problem encountered in __remove_pages() as" - " ret=%d\n", __func__, ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index de5c591a550d..e885fe2aafcc 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -131,7 +131,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int __ref arch_remove_memory(int nid, u64 start, u64 size, +void __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; @@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (altmap) page += vmem_altmap_offset(altmap); - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); - if (ret) - return ret; + __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); flush_inval_dcache_range(start, start + size); ret = remove_section_mapping(start, start + size); + WARN_ON_ONCE(ret); /* Ensure all vmalloc mappings are flushed in case they also * hit that section of memory @@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) pr_warn("Hash collision while resizing HPT\n"); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 06bd05137a00..14d1eae9fe43 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -237,14 +237,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { /* * There is no hardware or firmware interface which could trigger a * hot memory remove on s390. So there is nothing that needs to be * implemented. 
*/ - return -EBUSY; + BUG(); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d3cd07bd2dc1..b95e343e3c9d 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -429,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (unlikely(ret)) - pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, - ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 755dbed85531..075e568098f2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,14 +860,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index db42c11b48fb..20d14254b686 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(start_pfn); struct zone *zone; - int ret; /* With altmap the first mapped page is offset from @start */ if (altmap) page += vmem_altmap_offset(altmap); zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - WARN_ON_ONCE(ret); + __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b24aca54353e..ae892eef8b82 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -124,10 +124,10 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern int arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); -extern int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap); +extern void __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3512bba20e2b..6c0c4f48638e 100644 --- a/mm/memory_hotplug.c +++ 
b/mm/memory_hotplug.c @@ -547,8 +547,8 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; @@ -579,7 +579,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } set_zone_contiguous(zone); - return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From a667d7456f189e3422725dddcd067537feac49c0 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:21:56 -0700 Subject: mm: introduce new vm_map_pages() and vm_map_pages_zero() API Patch series "mm: Use vm_map_pages() and vm_map_pages_zero() API", v5. This patch (of 5): Previouly drivers have their own way of mapping range of kernel pages/memory into user vma and this was done by invoking vm_insert_page() within a loop. As this pattern is common across different drivers, it can be generalized by creating new functions and using them across the drivers. vm_map_pages() is the API which can be used to map kernel memory/pages in drivers which have considered vm_pgoff vm_map_pages_zero() is the API which can be used to map a range of kernel memory/pages in drivers which have not considered vm_pgoff. vm_pgoff is passed as default 0 for those drivers. We _could_ then at a later "fix" these drivers which are using vm_map_pages_zero() to behave according to the normal vm_pgoff offsetting simply by removing the _zero suffix on the function name and if that causes regressions, it gives us an easy way to revert. Tested on Rockchip hardware and display is working, including talking to Lima via prime. Link: http://lkml.kernel.org/r/751cb8a0f4c3e67e95c58a3b072937617f338eea.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Suggested-by: Russell King Suggested-by: Matthew Wilcox Reviewed-by: Mike Rapoport Tested-by: Heiko Stuebner Cc: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Vlastimil Babka Cc: Rik van Riel Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: Robin Murphy Cc: Joonsoo Kim Cc: Thierry Reding Cc: Kees Cook Cc: Marek Szyprowski Cc: Stefan Richter Cc: Sandy Huang Cc: David Airlie Cc: Oleksandr Andrushchenko Cc: Joerg Roedel Cc: Pawel Osciak Cc: Kyungmin Park Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +++ mm/memory.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/nommu.c | 14 ++++++++++ 3 files changed, 99 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index abb7eb7ef0f2..912614fbbef3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2579,6 +2579,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 9b68a72f8c17..96f1d473c89a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1527,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +/* + * __vm_map_pages - maps range of kernel pages into user vma + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * @offset: user's requested vm_pgoff + * + * This allows drivers to map range of kernel pages into a user vma. + * + * Return: 0 on success and error code otherwise. + */ +static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num, unsigned long offset) +{ + unsigned long count = vma_pages(vma); + unsigned long uaddr = vma->vm_start; + int ret, i; + + /* Fail if the user requested offset is beyond the end of the object */ + if (offset > num) + return -ENXIO; + + /* Fail if the user requested size exceeds available object size */ + if (count > num - offset) + return -ENXIO; + + for (i = 0; i < count; i++) { + ret = vm_insert_page(vma, uaddr, pages[offset + i]); + if (ret < 0) + return ret; + uaddr += PAGE_SIZE; + } + + return 0; +} + +/** + * vm_map_pages - maps range of kernel pages starts with non zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Maps an object consisting of @num pages, catering for the user's + * requested vm_pgoff + * + * If we fail to insert any page into the vma, the function will return + * immediately leaving any previously inserted pages present. Callers + * from the mmap handler may immediately return the error as their caller + * will destroy the vma, removing any successfully inserted pages. Other + * callers should make their own arrangements for calling unmap_region(). + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. 
+ */ +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, vma->vm_pgoff); +} +EXPORT_SYMBOL(vm_map_pages); + +/** + * vm_map_pages_zero - map range of kernel pages starts with zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Similar to vm_map_pages(), except that it explicitly sets the offset + * to 0. This function is intended for the drivers that did not consider + * vm_pgoff. + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, 0); +} +EXPORT_SYMBOL(vm_map_pages_zero); + static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { diff --git a/mm/nommu.c b/mm/nommu.c index 749276beb109..b492fd1fcf9f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages); + +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages_zero); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty -- cgit v1.2.3 From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization and skips the necessity to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 1 + arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 - arch/x86/Kconfig | 1 - include/linux/memblock.h | 3 ++- kernel/kexec_file.c | 16 ++++++++-------- mm/Kconfig | 2 +- mm/memblock.c | 6 +++--- mm/page_alloc.c | 3 +-- 16 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a11dfcc2a130..5fd344bd06b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,7 +4,6 @@ config ARM default y select ARCH_32BIT_OFF_T select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -22,6 +21,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7a1aa53d188d..69a59a5d1143 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -60,6 +60,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 3e54a53208d5..b7d404bbaa0f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -22,7 +22,6 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select ARCH_DISCARD_MEMBLOCK select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 73a26f04644e..7468d8e50467 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,6 @@ config IA64 select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select VIRT_TO_BUS - select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index fe5cc2da6d10..218e037ef901 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -26,7 +26,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ff8cff9fcf54..677e5bfeff47 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index ea37394ff3ea..26a9c760a98b 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -23,7 +23,6 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS 
- select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7996cfaceca..8c1c636308c8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d0c046af65fa..109243fdb6ec 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -100,6 +100,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_KEEP_MEMBLOCK select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a77033e1e7c..b77f512bb176 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -10,7 +10,6 @@ config SUPERH select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f21bc56e5d7b..818b361094ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,7 +47,6 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 47e3c0612592..676d3900e1bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -96,13 +96,14 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock +static inline void memblock_discard(void) {} #endif #define memblock_dbg(fmt, ...) 
\ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); diff --git a/mm/Kconfig b/mm/Kconfig index 71e697e693df..c5124c2cb0b2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -136,7 +136,7 @@ config HAVE_MEMBLOCK_PHYS_MAP config HAVE_GENERIC_GUP bool -config ARCH_DISCARD_MEMBLOCK +config ARCH_KEEP_MEMBLOCK bool config MEMORY_ISOLATION diff --git a/mm/memblock.c b/mm/memblock.c index f315eca9f4a1..6bbad46f4d2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ * :c:func:`mem_init` function frees all the memory to the buddy page * allocator. * - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system * initialization compltes. */ @@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ @@ -1987,7 +1987,7 @@ unsigned long __init memblock_free_all(void) return pages; } -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbda9aea0bf5..f2f3fb4921d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1896,10 +1896,9 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3 From 19343b5bdd16ad4ae6b845ef829f68b683c4dfb5 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:23:11 -0700 Subject: mm/page-writeback: introduce tracepoint for wait_on_page_writeback() Recently there have been some hung tasks on our server due to wait_on_page_writeback(), and we want to know the details of this PG_writeback, i.e. this page is writing back to which device. But it is not so convenient to get the details. I think it would be better to introduce a tracepoint for diagnosing the writeback details. 
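To show how the new event can be consumed, here is a minimal sketch (an assumption, not part of the patch) of a probe module attached to it; register_trace_wait_on_page_writeback() and its unregister counterpart are the helpers generated by the DEFINE_EVENT() added below.

/*
 * Illustrative sketch: log the inode whose page is being waited on
 * whenever the new wait_on_page_writeback tracepoint fires.
 */
#include <linux/module.h>
#include <linux/pagemap.h>
#include <trace/events/writeback.h>

static void probe_wait_on_writeback(void *data, struct page *page,
                                    struct address_space *mapping)
{
        pr_info("waiting on writeback: ino %lu\n",
                mapping && mapping->host ? mapping->host->i_ino : 0UL);
}

static int __init wb_probe_init(void)
{
        return register_trace_wait_on_page_writeback(probe_wait_on_writeback,
                                                     NULL);
}
module_init(wb_probe_init);

static void __exit wb_probe_exit(void)
{
        unregister_trace_wait_on_page_writeback(probe_wait_on_writeback, NULL);
        tracepoint_synchronize_unregister();
}
module_exit(wb_probe_exit);
MODULE_LICENSE("GPL");
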
Link: http://lkml.kernel.org/r/1556274402-19018-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Andrew Morton Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 10 +--------- include/trace/events/writeback.h | 16 +++++++++++++++- mm/page-writeback.c | 12 ++++++++++++ 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2e8438a1216a..112f15bb5907 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -540,15 +540,7 @@ static inline int wait_on_page_locked_killable(struct page *page) extern void put_and_wait_on_page_locked(struct page *page); -/* - * Wait for a page to complete writeback - */ -static inline void wait_on_page_writeback(struct page *page) -{ - if (PageWriteback(page)) - wait_on_page_bit(page, PG_writeback); -} - +void wait_on_page_writeback(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 32db72c7c055..aa7f3aeac740 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -53,7 +53,7 @@ WB_WORK_REASON struct wb_writeback_work; -TRACE_EVENT(writeback_dirty_page, +DECLARE_EVENT_CLASS(writeback_page_template, TP_PROTO(struct page *page, struct address_space *mapping), @@ -79,6 +79,20 @@ TRACE_EVENT(writeback_dirty_page, ) ); +DEFINE_EVENT(writeback_page_template, writeback_dirty_page, + + TP_PROTO(struct page *page, struct address_space *mapping), + + TP_ARGS(page, mapping) +); + +DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, + + TP_PROTO(struct page *page, struct address_space *mapping), + + TP_ARGS(page, mapping) +); + DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f61dfec6a1f..07656485c0e6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2808,6 +2808,18 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); +/* + * Wait for a page to complete writeback + */ +void wait_on_page_writeback(struct page *page) +{ + if (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + wait_on_page_bit(page, PG_writeback); + } +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. -- cgit v1.2.3 From a1b8e6abf35b9903807eced67a4c26e440663620 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 13 May 2019 17:23:20 -0700 Subject: mm: delete find_get_entries_tag I removed the only user of this and hadn't noticed it was now unused. 
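For callers that still need tagged page-cache lookups after this removal, the remaining helpers cover the common case; a minimal sketch (an assumption, not part of the patch) using pagevec_lookup_range_tag(), which wraps find_get_pages_range_tag():

/*
 * Illustrative sketch: walk dirty-tagged pages with the surviving
 * pagevec helpers instead of the removed find_get_entries_tag().
 * Note that value entries (shadow/swap/DAX) are not returned here.
 */
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void example_walk_dirty_pages(struct address_space *mapping)
{
        struct pagevec pvec;
        pgoff_t index = 0;
        unsigned int i, nr;

        pagevec_init(&pvec);
        while ((nr = pagevec_lookup_range_tag(&pvec, mapping, &index,
                                              (pgoff_t)-1,
                                              PAGECACHE_TAG_DIRTY))) {
                for (i = 0; i < nr; i++) {
                        struct page *page = pvec.pages[i];

                        /* ... inspect or process the referenced page ... */
                        (void)page;
                }
                pagevec_release(&pvec);
                cond_resched();
        }
}
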
Link: http://lkml.kernel.org/r/20190430152929.21813-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 --- mm/filemap.c | 61 ------------------------------------------------- 2 files changed, 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 112f15bb5907..9ec3544baee2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -373,9 +373,6 @@ static inline unsigned find_get_pages_tag(struct address_space *mapping, return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, nr_pages, pages); } -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - xa_mark_t tag, unsigned int nr_entries, - struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/mm/filemap.c b/mm/filemap.c index 3ad18fa56057..c5af80c43d36 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1941,67 +1941,6 @@ out: } EXPORT_SYMBOL(find_get_pages_range_tag); -/** - * find_get_entries_tag - find and return entries that match @tag - * @mapping: the address_space to search - * @start: the starting page cache index - * @tag: the tag index - * @nr_entries: the maximum number of entries - * @entries: where the resulting entries are placed - * @indices: the cache indices corresponding to the entries in @entries - * - * Like find_get_entries, except we only return entries which are tagged with - * @tag. - * - * Return: the number of entries which were found. - */ -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - xa_mark_t tag, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) -{ - XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - - if (!nr_entries) - return 0; - - rcu_read_lock(); - xas_for_each_marked(&xas, page, ULONG_MAX, tag) { - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - page = find_subpage(page, xas.xa_index); - -export: - indices[ret] = xas.xa_index; - entries[ret] = page; - if (++ret == nr_entries) - break; - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); - } - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(find_get_entries_tag); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: -- cgit v1.2.3