From 7e27f22c9e40b66186e0675376f0495725ff1b0a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:50 -0700 Subject: mm,hwpoison: unexport get_hwpoison_page and make it static Since get_hwpoison_page is only used in memory-failure code now, let us un-export it and make it private to that code. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-5-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 620961e4f32b..1977c09afe7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); #define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -- cgit v1.2.3 From dd6e2402fad966290f35dc687294fb6049714aac Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:57 -0700 Subject: mm,hwpoison: kill put_hwpoison_page After commit 4e41a30c6d50 ("mm: hwpoison: adjust for new thp refcounting"), put_hwpoison_page got reduced to a put_page. Let us just use put_page instead. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-7-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/memory-failure.c | 30 +++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1977c09afe7a..ab038a3521b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 95d5d8f3496c..4a9f8dc01160 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1143,7 +1143,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_dec(); unlock_page(head); - put_hwpoison_page(head); + put_page(head); return 0; } @@ -1335,7 +1335,7 @@ int memory_failure(unsigned long pfn, int flags) pfn); if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - put_hwpoison_page(p); + put_page(p); return -EBUSY; } unlock_page(p); @@ -1388,14 +1388,14 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_dec(); unlock_page(p); - put_hwpoison_page(p); + put_page(p); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); - put_hwpoison_page(p); + put_page(p); return 0; } @@ -1629,9 +1629,9 @@ int unpoison_memory(unsigned long pfn) } unlock_page(page); - put_hwpoison_page(page); + put_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) - put_hwpoison_page(page); + put_page(page); return 0; } @@ -1692,7 +1692,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) /* * Try to free it. */ - put_hwpoison_page(page); + put_page(page); shake_page(page, 1); /* @@ -1701,7 +1701,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) ret = __get_any_page(page, pfn, 0); if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ - put_hwpoison_page(page); + put_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", pfn, page->flags, &page->flags); return -EIO; @@ -1724,7 +1724,7 @@ static int soft_offline_huge_page(struct page *page, int flags) lock_page(hpage); if (PageHWPoison(hpage)) { unlock_page(hpage); - put_hwpoison_page(hpage); + put_page(hpage); pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } @@ -1735,7 +1735,7 @@ static int soft_offline_huge_page(struct page *page, int flags) * get_any_page() and isolate_huge_page() takes a refcount each, * so need to drop one here. */ - put_hwpoison_page(hpage); + put_page(hpage); if (!ret) { pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); return -EBUSY; @@ -1784,7 +1784,7 @@ static int __soft_offline_page(struct page *page, int flags) wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); - put_hwpoison_page(page); + put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1799,7 +1799,7 @@ static int __soft_offline_page(struct page *page, int flags) * would need to fix isolation locking first. */ if (ret == 1) { - put_hwpoison_page(page); + put_page(page); pr_info("soft_offline: %#lx: invalidated\n", pfn); SetPageHWPoison(page); num_poisoned_pages_inc(); @@ -1819,7 +1819,7 @@ static int __soft_offline_page(struct page *page, int flags) * Drop page reference which is came from get_any_page() * successful isolate_lru_page() already took another one. */ - put_hwpoison_page(page); + put_page(page); if (!ret) { LIST_HEAD(pagelist); /* @@ -1863,7 +1863,7 @@ static int soft_offline_in_use_page(struct page *page, int flags) pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); else pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(page); + put_page(page); return -EBUSY; } unlock_page(page); @@ -1936,7 +1936,7 @@ int soft_offline_page(unsigned long pfn, int flags) if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); if (flags & MF_COUNT_INCREASED) - put_hwpoison_page(page); + put_page(page); return -EBUSY; } -- cgit v1.2.3 From 5d1fd5dc877bc1c670e7b1c174aa659b76c07de1 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:07:21 -0700 Subject: mm,hwpoison: introduce MF_MSG_UNSPLIT_THP memory_failure() is supposed to call action_result() when it handles a memory error event, but there's one missing case. So let's add it. I find that include/ras/ras_event.h has some other MF_MSG_* undefined, so this patch also adds them. Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-13-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/ras/ras_event.h | 3 +++ mm/memory-failure.c | 5 ++++- 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index ab038a3521b4..a9df46309e07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3064,6 +3064,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 36c5c5e38c1d..0bdbc0d17d2f 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -361,6 +361,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ @@ -373,6 +374,8 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ + EM ( MF_MSG_DAX, "dax page" ) \ + EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1fb290faf28e..f9fa9982b5d4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -582,6 +582,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", + [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1370,8 +1371,10 @@ int memory_failure(unsigned long pfn, int flags) } if (PageTransHuge(hpage)) { - if (try_to_split_thp_page(p, "Memory Failure") < 0) + if (try_to_split_thp_page(p, "Memory Failure") < 0) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); return -EBUSY; + } VM_BUG_ON_PAGE(!page_count(p), p); } -- cgit v1.2.3 From d882c0067d99d0f2add9a41628703cc99511a639 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:19 -0700 Subject: mm: pass migratetype into memmap_init_zone() and move_pfn_range_to_zone() On the memory onlining path, we want to start with MIGRATE_ISOLATE, to un-isolate the pages after memory onlining is complete. Let's allow passing in the migratetype. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Tony Luck Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: Dan Williams Cc: Mike Rapoport Cc: "Matthew Wilcox (Oracle)" Cc: Michel Lespinasse Cc: Charan Teja Reddy Cc: Mel Gorman Link: https://lkml.kernel.org/r/20200819175957.28465-10-david@redhat.com Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- include/linux/memory_hotplug.h | 3 ++- include/linux/mm.h | 2 +- mm/memory_hotplug.c | 11 ++++++++--- mm/memremap.c | 3 ++- mm/page_alloc.c | 21 ++++++++++++--------- 6 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d8686bf3ae2f..ef12e097f318 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -537,7 +537,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } @@ -547,7 +547,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, { if (!vmem_map) { memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 76b314031f09..51a877fec8da 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,7 +351,8 @@ extern int add_memory_resource(int nid, struct resource *resource); extern int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype); extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index a9df46309e07..61a2633fcc7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2440,7 +2440,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 113edf95b908..bb30e99b7383 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -701,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this * call, all affected pages are PG_reserved. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -728,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMINIT_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); } @@ -808,7 +813,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE); arg.start_pfn = pfn; arg.nr_pages = nr_pages; diff --git a/mm/memremap.c b/mm/memremap.c index 198083453182..73a206d0f645 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -266,7 +266,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), params->altmap); + PHYS_PFN(range_len(range)), params->altmap, + MIGRATE_MOVABLE); } mem_hotplug_done(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a99ed299443..f7f292f1d108 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5990,10 +5990,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum meminit_context context, - struct vmem_altmap *altmap) + unsigned long start_pfn, + enum meminit_context context, + struct vmem_altmap *altmap, int migratetype) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -6037,14 +6042,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, __SetPageReserved(page); /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, + * such that unmovable allocations won't be scattered all + * over the place during system boot. */ if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); cond_resched(); } pfn++; @@ -6144,7 +6147,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } } -- cgit v1.2.3 From 0726b01e70455f9900ab524117c7b520d197dc8c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:50 -0700 Subject: mm/madvise: pass mm to do_madvise Patch series "introduce memory hinting API for external process", v9. Now, we have MADV_PAGEOUT and MADV_COLD as madvise hinting API. With that, application could give hints to kernel what memory range are preferred to be reclaimed. However, in some platform(e.g., Android), the information required to make the hinting decision is not known to the app. Instead, it is known to a centralized userspace daemon(e.g., ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the concern, this patch introduces new syscall - process_madvise(2). Bascially, it's same with madvise(2) syscall but it has some differences. 1. It needs pidfd of target process to provide the hint 2. It supports only MADV_{COLD|PAGEOUT|MERGEABLE|UNMEREABLE} at this moment. Other hints in madvise will be opened when there are explicit requests from community to prevent unexpected bugs we couldn't support. 3. Only privileged processes can do something for other process's address space. For more detail of the new API, please see "mm: introduce external memory hinting API" description in this patchset. This patch (of 3): In upcoming patches, do_madvise will be called from external process context so we shouldn't asssume "current" is always hinted process's task_struct. Furthermore, we must not access mm_struct via task->mm, but obtain it via access_mm() once (in the following patch) and only use that pointer [1], so pass it to do_madvise() as well. Note the vma->vm_mm pointers are safe, so we can use them further down the call stack. And let's pass current->mm as arguments of do_madvise so it shouldn't change existing behavior but prepare next patch to make review easy. [vbabka@suse.cz: changelog tweak] [minchan@kernel.org: use current->mm for io_uring] Link: http://lkml.kernel.org/r/20200423145215.72666-1-minchan@kernel.org [akpm@linux-foundation.org: fix it for upstream changes] [akpm@linux-foundation.org: whoops] [rdunlap@infradead.org: add missing includes] Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Jens Axboe Cc: Jann Horn Cc: Tim Murray Cc: Daniel Colascione Cc: Sandeep Patil Cc: Sonny Rao Cc: Brian Geffon Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: John Dias Cc: Joel Fernandes Cc: Alexander Duyck Cc: SeongJae Park Cc: Christian Brauner Cc: Kirill Tkhai Cc: Oleksandr Natalenko Cc: SeongJae Park Cc: Christian Brauner Cc: Florian Weimer Cc: Link: https://lkml.kernel.org/r/20200901000633.1920247-1-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-1-minchan@kernel.org Link: http://lkml.kernel.org/r/20200302193630.68771-2-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-2-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-2-minchan@kernel.org Signed-off-by: Linus Torvalds --- fs/io_uring.c | 2 +- include/linux/mm.h | 2 +- mm/madvise.c | 32 ++++++++++++++++++-------------- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2e1dc354cd08..b58169240c77 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3989,7 +3989,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) if (force_nonblock) return -EAGAIN; - ret = do_madvise(ma->addr, ma->len, ma->advice); + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); diff --git a/include/linux/mm.h b/include/linux/mm.h index 61a2633fcc7f..ef360fe70aaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2579,7 +2579,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); -extern int do_madvise(unsigned long start, size_t len_in, int behavior); +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, diff --git a/mm/madvise.c b/mm/madvise.c index fd1f448b4e1d..d550ef045288 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -258,6 +258,7 @@ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { + struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; @@ -294,10 +295,10 @@ static long madvise_willneed(struct vm_area_struct *vma, get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); - mmap_read_lock(current->mm); + mmap_read_lock(mm); return 0; } @@ -766,6 +767,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, unsigned long start, unsigned long end, int behavior) { + struct mm_struct *mm = vma->vm_mm; + *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; @@ -773,8 +776,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ - mmap_read_lock(current->mm); - vma = find_vma(current->mm, start); + mmap_read_lock(mm); + vma = find_vma(mm, start); if (!vma) return -ENOMEM; if (start < vma->vm_start) { @@ -828,6 +831,7 @@ static long madvise_remove(struct vm_area_struct *vma, loff_t offset; int error; struct file *f; + struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ @@ -855,13 +859,13 @@ static long madvise_remove(struct vm_area_struct *vma, get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); - mmap_read_lock(current->mm); + mmap_read_lock(mm); return error; } @@ -1045,7 +1049,7 @@ madvise_behavior_valid(int behavior) * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -int do_madvise(unsigned long start, size_t len_in, int behavior) +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; @@ -1083,10 +1087,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) write = madvise_need_mmap_write(behavior); if (write) { - if (mmap_write_lock_killable(current->mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; } else { - mmap_read_lock(current->mm); + mmap_read_lock(mm); } /* @@ -1094,7 +1098,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; @@ -1131,19 +1135,19 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) if (prev) vma = prev->vm_next; else /* madvise_remove dropped mmap_lock */ - vma = find_vma(current->mm, start); + vma = find_vma(mm, start); } out: blk_finish_plug(&plug); if (write) - mmap_write_unlock(current->mm); + mmap_write_unlock(mm); else - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { - return do_madvise(start, len_in, behavior); + return do_madvise(current->mm, start, len_in, behavior); } -- cgit v1.2.3 From f8f6ae5d077a9bdaf5cbf2ac960a5d1a04b47482 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 1 Nov 2020 17:08:00 -0800 Subject: mm: always have io_remap_pfn_range() set pgprot_decrypted() The purpose of io_remap_pfn_range() is to map IO memory, such as a memory mapped IO exposed through a PCI BAR. IO devices do not understand encryption, so this memory must always be decrypted. Automatically call pgprot_decrypted() as part of the generic implementation. This fixes a bug where enabling AMD SME causes subsystems, such as RDMA, using io_remap_pfn_range() to expose BAR pages to user space to fail. The CPU will encrypt access to those BAR pages instead of passing unencrypted IO directly to the device. Places not mapping IO should use remap_pfn_range(). Fixes: aca20d546214 ("x86/mm: Add support to make use of Secure Memory Encryption") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Tom Lendacky Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brijesh Singh Cc: Jonathan Corbet Cc: Dmitry Vyukov Cc: "Dave Young" Cc: Alexander Potapenko Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Larry Woodman Cc: Matt Fleming Cc: Ingo Molnar Cc: "Michael S. Tsirkin" Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Toshimitsu Kani Cc: Link: https://lkml.kernel.org/r/0-v1-025d64bdf6c4+e-amd_sme_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ include/linux/pgtable.h | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..db6ae4d3fb4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2759,6 +2759,15 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 38c33eabea89..71125a4676c4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,10 +1427,6 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit v1.2.3