From 7190b3c8bd2b0cde483bd440cf91ba1c518b4261 Mon Sep 17 00:00:00 2001
From: Ignacio Moreno Gonzalez
Date: Wed, 7 May 2025 15:28:06 +0200
Subject: mm: mmap: map MAP_STACK to VM_NOHUGEPAGE only if THP is enabled

commit c4608d1bf7c6 ("mm: mmap: map MAP_STACK to VM_NOHUGEPAGE") maps the
mmap option MAP_STACK to VM_NOHUGEPAGE.  This is also done if
CONFIG_TRANSPARENT_HUGEPAGE is not defined.  But in that case,
VM_NOHUGEPAGE does not make sense.

I discovered this issue when trying to use the tool CRIU to checkpoint and
restore a container.  Our running kernel is compiled without
CONFIG_TRANSPARENT_HUGEPAGE.  CRIU parses the output of /proc/<pid>/smaps
and saves the "nh" flag.  When trying to restore the container, CRIU fails
to restore the "nh" mappings, since madvise() MADV_NOHUGEPAGE always
returns an error because CONFIG_TRANSPARENT_HUGEPAGE is not defined.

Link: https://lkml.kernel.org/r/20250507-map-map_stack-to-vm_nohugepage-only-if-thp-is-enabled-v5-1-c6c38cfefd6e@kuka.com
Fixes: c4608d1bf7c6 ("mm: mmap: map MAP_STACK to VM_NOHUGEPAGE")
Signed-off-by: Ignacio Moreno Gonzalez
Acked-by: David Hildenbrand
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Yang Shi
Reviewed-by: Liam R. Howlett
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/mman.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index bce214fece16..f4c6346a8fcd 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -155,7 +155,9 @@ calc_vm_flag_bits(struct file *file, unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	       _calc_vm_trans(flags, MAP_STACK,      VM_NOHUGEPAGE) |
+#endif
 	       arch_calc_vm_flag_bits(file, flags);
 }
 
-- cgit v1.2.3
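The user-visible symptom can be sketched from userspace.  The following is a
minimal illustration (not CRIU itself; the mapping size is arbitrary): it
creates a MAP_STACK mapping and then re-applies MADV_NOHUGEPAGE the way a
checkpoint/restore tool would when replaying the "nh" flag from smaps.  On a
kernel built without CONFIG_TRANSPARENT_HUGEPAGE the madvise() call fails
with EINVAL; the patch avoids this by no longer reporting "nh" on such
kernels in the first place.

/*
 * Minimal sketch (not CRIU) of the failure mode described above: a
 * MAP_STACK mapping is created, then MADV_NOHUGEPAGE is re-applied the way
 * a restore tool would replay the "nh" flag it saved from smaps.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;
	void *stack = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

	if (stack == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* What a checkpoint/restore tool does to re-apply the "nh" flag. */
	if (madvise(stack, len, MADV_NOHUGEPAGE))
		printf("madvise(MADV_NOHUGEPAGE): %s\n", strerror(errno));
	else
		printf("madvise(MADV_NOHUGEPAGE): ok\n");

	munmap(stack, len);
	return 0;
}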
Peter Anvin" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: Thomas Gleinxer Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf55206935c4..fdda6b16263b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -385,7 +385,7 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 38 +# define VM_UFFD_MINOR_BIT 41 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE -- cgit v1.2.3 From 97dfbbd135cb5e4426f37ca53a8fa87eaaa4e376 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 May 2025 18:06:02 +0100 Subject: highmem: add folio_test_partial_kmap() In commit c749d9b7ebbc ("iov_iter: fix copy_page_from_iter_atomic() if KMAP_LOCAL_FORCE_MAP"), Hugh correctly noted that if KMAP_LOCAL_FORCE_MAP is enabled, we must limit ourselves to PAGE_SIZE bytes per call to kmap_local(). The same problem exists in memcpy_from_folio(), memcpy_to_folio(), folio_zero_tail(), folio_fill_tail() and memcpy_from_file_folio(), so add folio_test_partial_kmap() to do this more succinctly. Link: https://lkml.kernel.org/r/20250514170607.3000994-2-willy@infradead.org Fixes: 00cdf76012ab ("mm: add memcpy_from_file_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- include/linux/highmem.h | 10 +++++----- include/linux/page-flags.h | 7 +++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 5c6bea81a90e..c698f8415675 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -461,7 +461,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, const char *from = kmap_local_folio(folio, offset); size_t chunk = len; - if (folio_test_highmem(folio) && + if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); @@ -489,7 +489,7 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset, char *to = kmap_local_folio(folio, offset); size_t chunk = len; - if (folio_test_highmem(folio) && + if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); @@ -522,7 +522,7 @@ static inline __must_check void *folio_zero_tail(struct folio *folio, { size_t len = folio_size(folio) - offset; - if (folio_test_highmem(folio)) { + if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { @@ -560,7 +560,7 @@ static inline void folio_fill_tail(struct folio *folio, size_t offset, VM_BUG_ON(offset + len > folio_size(folio)); - if (folio_test_highmem(folio)) { + if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { @@ -597,7 +597,7 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); - if (folio_test_highmem(folio)) { + if (folio_test_partial_kmap(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 
From 97dfbbd135cb5e4426f37ca53a8fa87eaaa4e376 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Wed, 14 May 2025 18:06:02 +0100
Subject: highmem: add folio_test_partial_kmap()

In commit c749d9b7ebbc ("iov_iter: fix copy_page_from_iter_atomic() if
KMAP_LOCAL_FORCE_MAP"), Hugh correctly noted that if KMAP_LOCAL_FORCE_MAP
is enabled, we must limit ourselves to PAGE_SIZE bytes per call to
kmap_local().  The same problem exists in memcpy_from_folio(),
memcpy_to_folio(), folio_zero_tail(), folio_fill_tail() and
memcpy_from_file_folio(), so add folio_test_partial_kmap() to do this
more succinctly.

Link: https://lkml.kernel.org/r/20250514170607.3000994-2-willy@infradead.org
Fixes: 00cdf76012ab ("mm: add memcpy_from_file_folio()")
Signed-off-by: Matthew Wilcox (Oracle)
Cc: Al Viro
Cc: Hugh Dickins
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/highmem.h    | 10 +++++-----
 include/linux/page-flags.h |  7 +++++++
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 5c6bea81a90e..c698f8415675 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -461,7 +461,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
 		const char *from = kmap_local_folio(folio, offset);
 		size_t chunk = len;
 
-		if (folio_test_highmem(folio) &&
+		if (folio_test_partial_kmap(folio) &&
 		    chunk > PAGE_SIZE - offset_in_page(offset))
 			chunk = PAGE_SIZE - offset_in_page(offset);
 		memcpy(to, from, chunk);
@@ -489,7 +489,7 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset,
 		char *to = kmap_local_folio(folio, offset);
 		size_t chunk = len;
 
-		if (folio_test_highmem(folio) &&
+		if (folio_test_partial_kmap(folio) &&
 		    chunk > PAGE_SIZE - offset_in_page(offset))
 			chunk = PAGE_SIZE - offset_in_page(offset);
 		memcpy(to, from, chunk);
@@ -522,7 +522,7 @@ static inline __must_check void *folio_zero_tail(struct folio *folio,
 {
 	size_t len = folio_size(folio) - offset;
 
-	if (folio_test_highmem(folio)) {
+	if (folio_test_partial_kmap(folio)) {
 		size_t max = PAGE_SIZE - offset_in_page(offset);
 
 		while (len > max) {
@@ -560,7 +560,7 @@ static inline void folio_fill_tail(struct folio *folio, size_t offset,
 
 	VM_BUG_ON(offset + len > folio_size(folio));
 
-	if (folio_test_highmem(folio)) {
+	if (folio_test_partial_kmap(folio)) {
 		size_t max = PAGE_SIZE - offset_in_page(offset);
 
 		while (len > max) {
@@ -597,7 +597,7 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio,
 	size_t offset = offset_in_folio(folio, pos);
 	char *from = kmap_local_folio(folio, offset);
 
-	if (folio_test_highmem(folio)) {
+	if (folio_test_partial_kmap(folio)) {
 		offset = offset_in_page(offset);
 		len = min_t(size_t, len, PAGE_SIZE - offset);
 	} else
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e6a21b62dcce..3b814ce08331 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -615,6 +615,13 @@ FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE)
 PAGEFLAG_FALSE(HighMem, highmem)
 #endif
 
+/* Does kmap_local_folio() only allow access to one page of the folio? */
+#ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+#define folio_test_partial_kmap(f)	true
+#else
+#define folio_test_partial_kmap(f)	folio_test_highmem(f)
+#endif
+
 #ifdef CONFIG_SWAP
 static __always_inline bool folio_test_swapcache(const struct folio *folio)
 {
-- cgit v1.2.3
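The pattern being centralised is the per-page chunking that
kmap_local_folio() requires when only one page of a folio can be mapped at
a time.  The following is a condensed, illustrative sketch of that loop
(essentially the memcpy_from_folio() shape visible in the hunks above; the
example_ name is not a real kernel helper):

/*
 * Illustrative sketch (kernel context assumed, not a drop-in helper): the
 * chunking that folio_test_partial_kmap() gates.  When only one page can
 * be mapped at a time, each kmap_local_folio() access is limited to the
 * remainder of the current page.
 */
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

void example_copy_from_folio(char *to, struct folio *folio,
			     size_t offset, size_t len)
{
	do {
		const char *from = kmap_local_folio(folio, offset);
		size_t chunk = len;

		/* Never cross a page boundary while the mapping is partial. */
		if (folio_test_partial_kmap(folio) &&
		    chunk > PAGE_SIZE - offset_in_page(offset))
			chunk = PAGE_SIZE - offset_in_page(offset);
		memcpy(to, from, chunk);
		kunmap_local(from);

		to += chunk;
		offset += chunk;
		len -= chunk;
	} while (len > 0);
}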
From 12ca42c237756182aad8ab04654c952765cb9061 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan
Date: Fri, 16 May 2025 17:07:39 -0700
Subject: alloc_tag: allocate percpu counters for module tags dynamically

When a module gets unloaded, we check whether any of its tags are still in
use and, if so, keep the memory containing the module's allocation tags
alive until all tags are unused.  However, the percpu counters referenced
by the tags are freed by free_module().  This leads to a use-after-free if
memory allocated by the module is accessed after the module was unloaded.

To fix this, allocate percpu counters for module allocation tags
dynamically and keep them alive for tags which are still in use after
module unloading.  This also removes the requirement of a larger
PERCPU_MODULE_RESERVE when memory allocation profiling is enabled, because
percpu memory for the counters does not need to be reserved anymore.

Link: https://lkml.kernel.org/r/20250517000739.5930-1-surenb@google.com
Fixes: 0db6f8d7820a ("alloc_tag: load module tags into separate contiguous memory")
Signed-off-by: Suren Baghdasaryan
Reported-by: David Wang <00107082@163.com>
Closes: https://lore.kernel.org/all/20250516131246.6244-1-00107082@163.com/
Tested-by: David Wang <00107082@163.com>
Cc: Christoph Lameter (Ampere)
Cc: Dennis Zhou
Cc: Kent Overstreet
Cc: Pasha Tatashin
Cc: Tejun Heo
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/alloc_tag.h | 12 +++++++
 include/linux/codetag.h   |  8 ++--
 include/linux/percpu.h    |  4 ---
 lib/alloc_tag.c           | 87 +++++++++++++++++++++++++++++++++----------
 lib/codetag.c             |  5 +--
 5 files changed, 88 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index a946e0203e6d..8f7931eb7d16 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -104,6 +104,16 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 
 #else /* ARCH_NEEDS_WEAK_PER_CPU */
 
+#ifdef MODULE
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)					\
+	static struct alloc_tag _alloc_tag __used __aligned(8)		\
+	__section(ALLOC_TAG_SECTION_NAME) = {				\
+		.ct = CODE_TAG_INIT,					\
+		.counters = NULL };
+
+#else /* MODULE */
+
 #define DEFINE_ALLOC_TAG(_alloc_tag)					\
 	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
 	static struct alloc_tag _alloc_tag __used __aligned(8)		\
@@ -111,6 +121,8 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 		.ct = CODE_TAG_INIT,					\
 		.counters = &_alloc_tag_cntr };
 
+#endif /* MODULE */
+
 #endif /* ARCH_NEEDS_WEAK_PER_CPU */
 
 DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index d14dbd26b370..0ee4c21c6dbc 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -36,10 +36,10 @@ union codetag_ref {
 struct codetag_type_desc {
 	const char *section;
 	size_t tag_size;
-	void (*module_load)(struct codetag_type *cttype,
-			    struct codetag_module *cmod);
-	void (*module_unload)(struct codetag_type *cttype,
-			      struct codetag_module *cmod);
+	void (*module_load)(struct module *mod,
+			    struct codetag *start, struct codetag *end);
+	void (*module_unload)(struct module *mod,
+			      struct codetag *start, struct codetag *end);
 #ifdef CONFIG_MODULES
 	void (*module_replaced)(struct module *mod, struct module *new_mod);
 	bool (*needs_section_mem)(struct module *mod, unsigned long size);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 52b5ea663b9f..85bf8dd9f087 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -15,11 +15,7 @@
 
 /* enough to cover all DEFINE_PER_CPUs in modules */
 #ifdef CONFIG_MODULES
-#ifdef CONFIG_MEM_ALLOC_PROFILING
-#define PERCPU_MODULE_RESERVE		(8 << 13)
-#else
 #define PERCPU_MODULE_RESERVE		(8 << 10)
-#endif
 #else
 #define PERCPU_MODULE_RESERVE		0
 #endif
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 25ecc1334b67..c7f602fa7b23 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -350,18 +350,28 @@ static bool needs_section_mem(struct module *mod, unsigned long size)
 	return size >= sizeof(struct alloc_tag);
 }
 
-static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to)
+static bool clean_unused_counters(struct alloc_tag *start_tag,
+				  struct alloc_tag *end_tag)
 {
-	while (from <= to) {
+	struct alloc_tag *tag;
+	bool ret = true;
+
+	for (tag = start_tag; tag <= end_tag; tag++) {
 		struct alloc_tag_counters counter;
 
-		counter = alloc_tag_read(from);
-		if (counter.bytes)
-			return from;
-		from++;
+		if (!tag->counters)
+			continue;
+
+		counter = alloc_tag_read(tag);
+		if (!counter.bytes) {
+			free_percpu(tag->counters);
+			tag->counters = NULL;
+		} else {
+			ret = false;
+		}
 	}
 
-	return NULL;
+	return ret;
 }
 
 /* Called with mod_area_mt locked */
@@ -371,12 +381,16 @@ static void clean_unused_module_areas_locked(void)
 	struct module *val;
 
 	mas_for_each(&mas, val, module_tags.size) {
+		struct alloc_tag *start_tag;
+		struct alloc_tag *end_tag;
+
 		if (val != &unloaded_mod)
 			continue;
 
 		/* Release area if all tags are unused */
-		if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
-				   (struct alloc_tag *)(module_tags.start_addr + mas.last)))
+		start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
+		end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
+		if (clean_unused_counters(start_tag, end_tag))
 			mas_erase(&mas);
 	}
 }
@@ -561,7 +575,8 @@ unlock:
 static void release_module_tags(struct module *mod, bool used)
 {
 	MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size);
-	struct alloc_tag *tag;
+	struct alloc_tag *start_tag;
+	struct alloc_tag *end_tag;
 	struct module *val;
 
 	mas_lock(&mas);
@@ -575,15 +590,22 @@ static void release_module_tags(struct module *mod, bool used)
 	if (!used)
 		goto release_area;
 
-	/* Find out if the area is used */
-	tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
-			    (struct alloc_tag *)(module_tags.start_addr + mas.last));
-	if (tag) {
-		struct alloc_tag_counters counter = alloc_tag_read(tag);
+	start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
+	end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
+	if (!clean_unused_counters(start_tag, end_tag)) {
+		struct alloc_tag *tag;
+
+		for (tag = start_tag; tag <= end_tag; tag++) {
+			struct alloc_tag_counters counter;
+
+			if (!tag->counters)
+				continue;
 
-		pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
-			tag->ct.filename, tag->ct.lineno, tag->ct.modname,
-			tag->ct.function, counter.bytes);
+			counter = alloc_tag_read(tag);
+			pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
+				tag->ct.filename, tag->ct.lineno, tag->ct.modname,
+				tag->ct.function, counter.bytes);
+		}
 	} else {
 		used = false;
 	}
@@ -596,6 +618,34 @@ out:
 	mas_unlock(&mas);
 }
 
+static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
+{
+	/* Allocate module alloc_tag percpu counters */
+	struct alloc_tag *start_tag;
+	struct alloc_tag *stop_tag;
+	struct alloc_tag *tag;
+
+	if (!mod)
+		return;
+
+	start_tag = ct_to_alloc_tag(start);
+	stop_tag = ct_to_alloc_tag(stop);
+	for (tag = start_tag; tag < stop_tag; tag++) {
+		WARN_ON(tag->counters);
+		tag->counters = alloc_percpu(struct alloc_tag_counters);
+		if (!tag->counters) {
+			while (--tag >= start_tag) {
+				free_percpu(tag->counters);
+				tag->counters = NULL;
+			}
+			shutdown_mem_profiling(true);
+			pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
+			       mod->name);
+			break;
+		}
+	}
+}
+
 static void replace_module(struct module *mod, struct module *new_mod)
 {
 	MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
@@ -757,6 +807,7 @@ static int __init alloc_tag_init(void)
 		.needs_section_mem	= needs_section_mem,
 		.alloc_section_mem	= reserve_module_tags,
 		.free_section_mem	= release_module_tags,
+		.module_load		= load_module,
 		.module_replaced	= replace_module,
 #endif
 	};
diff --git a/lib/codetag.c b/lib/codetag.c
index 42aadd6c1454..de332e98d6f5 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -194,7 +194,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 	if (err >= 0) {
 		cttype->count += range_size(cttype, &range);
 		if (cttype->desc.module_load)
-			cttype->desc.module_load(cttype, cmod);
+			cttype->desc.module_load(mod, range.start, range.stop);
 	}
 	up_write(&cttype->mod_lock);
 
@@ -333,7 +333,8 @@ void codetag_unload_module(struct module *mod)
 		}
 		if (found) {
 			if (cttype->desc.module_unload)
-				cttype->desc.module_unload(cttype, cmod);
+				cttype->desc.module_unload(cmod->mod,
+					cmod->range.start, cmod->range.stop);
 
 			cttype->count -= range_size(cttype, &cmod->range);
 			idr_remove(&cttype->mod_idr, mod_id);
-- cgit v1.2.3
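The lifecycle this patch introduces can be summarised in a short sketch:
percpu counters are allocated per tag when the module is loaded, and at
unload only the counters of unused tags are freed, so live tags never point
at percpu memory released by free_module().  The following is an
illustrative condensation of load_module()/clean_unused_counters() above,
not additional kernel code; the example_ helpers are hypothetical.

/*
 * Condensed, illustrative sketch of the counter lifecycle (kernel context
 * assumed): allocate one percpu counter per tag at module load, free only
 * the unused ones at unload.
 */
#include <linux/alloc_tag.h>
#include <linux/percpu.h>
#include <linux/errno.h>

int example_alloc_counters(struct alloc_tag *start, struct alloc_tag *stop)
{
	struct alloc_tag *tag;

	for (tag = start; tag < stop; tag++) {
		tag->counters = alloc_percpu(struct alloc_tag_counters);
		if (!tag->counters) {
			/* Roll back what was allocated so far. */
			while (--tag >= start) {
				free_percpu(tag->counters);
				tag->counters = NULL;
			}
			return -ENOMEM;
		}
	}
	return 0;
}

bool example_release_unused(struct alloc_tag *start, struct alloc_tag *stop)
{
	struct alloc_tag *tag;
	bool all_unused = true;

	for (tag = start; tag < stop; tag++) {
		if (!tag->counters)
			continue;
		if (alloc_tag_read(tag).bytes) {
			all_unused = false;	/* still in use: keep counter alive */
			continue;
		}
		free_percpu(tag->counters);
		tag->counters = NULL;
	}
	return all_unused;
}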
From ee40c9920ac286c5bfe7c811e66ff899266d2582 Mon Sep 17 00:00:00 2001
From: Ricardo Cañuelo Navarro
Date: Fri, 23 May 2025 14:19:10 +0200
Subject: mm: fix copy_vma() error handling for hugetlb mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If, during a mremap() operation for a hugetlb-backed memory mapping,
copy_vma() fails after the source vma has been duplicated and opened
(i.e., vma_link() fails), the error is handled by closing the new vma.
This updates the hugetlbfs reservation counter of the reservation map
which at this point is referenced by both the source vma and the new copy.
As a result, once the new vma has been freed and copy_vma() returns, the
reservation counter for the source vma will be incorrect.

This patch addresses this corner case by clearing the hugetlb private page
reservation reference for the new vma and decrementing the reference
before closing the vma, so that vma_close() won't update the reservation
counter.  This is also what copy_vma_and_data() does with the source vma
if copy_vma() succeeds, so a helper function has been added to do the
fixup in both functions.

The issue was reported by a private syzbot instance and can be reproduced
using the C reproducer in [1].  It's also a possible duplicate of public
syzbot report [2].  The WARNING report is:

============================================================
page_counter underflow: -1024 nr_pages=1024
WARNING: CPU: 0 PID: 3287 at mm/page_counter.c:61 page_counter_cancel+0xf6/0x120
Modules linked in:
CPU: 0 UID: 0 PID: 3287 Comm: repro__WARNING_ Not tainted 6.15.0-rc7+ #54 NONE
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-2-gc13ff2cd-prebuilt.qemu.org 04/01/2014
RIP: 0010:page_counter_cancel+0xf6/0x120
Code: ff 5b 41 5e 41 5f 5d c3 cc cc cc cc e8 f3 4f 8f ff c6 05 64 01 27 06 01 48 c7 c7 60 15 f8 85 48 89 de 4c 89 fa e8 2a a7 51 ff <0f> 0b e9 66 ff ff ff 44 89 f9 80 e1 07 38 c1 7c 9d 4c 81
RSP: 0018:ffffc900025df6a0 EFLAGS: 00010246
RAX: 2edfc409ebb44e00 RBX: fffffffffffffc00 RCX: ffff8880155f0000
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
RBP: dffffc0000000000 R08: ffffffff81c4a23c R09: 1ffff1100330482a
R10: dffffc0000000000 R11: ffffed100330482b R12: 0000000000000000
R13: ffff888058a882c0 R14: ffff888058a882c0 R15: 0000000000000400
FS:  0000000000000000(0000) GS:ffff88808fc53000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004b33e0 CR3: 00000000076d6000 CR4: 00000000000006f0
Call Trace:
 page_counter_uncharge+0x33/0x80
 hugetlb_cgroup_uncharge_counter+0xcb/0x120
 hugetlb_vm_op_close+0x579/0x960
 ? __pfx_hugetlb_vm_op_close+0x10/0x10
 remove_vma+0x88/0x130
 exit_mmap+0x71e/0xe00
 ? __pfx_exit_mmap+0x10/0x10
 ? __mutex_unlock_slowpath+0x22e/0x7f0
 ? __pfx_exit_aio+0x10/0x10
 ? __up_read+0x256/0x690
 ? uprobe_clear_state+0x274/0x290
 ? mm_update_next_owner+0xa9/0x810
 __mmput+0xc9/0x370
 exit_mm+0x203/0x2f0
 ? __pfx_exit_mm+0x10/0x10
 ? taskstats_exit+0x32b/0xa60
 do_exit+0x921/0x2740
 ? do_raw_spin_lock+0x155/0x3b0
 ? __pfx_do_exit+0x10/0x10
 ? __pfx_do_raw_spin_lock+0x10/0x10
 ? _raw_spin_lock_irq+0xc5/0x100
 do_group_exit+0x20c/0x2c0
 get_signal+0x168c/0x1720
 ? __pfx_get_signal+0x10/0x10
 ? schedule+0x165/0x360
 arch_do_signal_or_restart+0x8e/0x7d0
 ? __pfx_arch_do_signal_or_restart+0x10/0x10
 ? __pfx___se_sys_futex+0x10/0x10
 syscall_exit_to_user_mode+0xb8/0x2c0
 do_syscall_64+0x75/0x120
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x422dcd
Code: Unable to access opcode bytes at 0x422da3.
RSP: 002b:00007ff266cdb208 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: 0000000000000001 RBX: 00007ff266cdbcdc RCX: 0000000000422dcd
RDX: 00000000000f4240 RSI: 0000000000000081 RDI: 00000000004c7bec
RBP: 00007ff266cdb220 R08: 203a6362696c6720 R09: 203a6362696c6720
R10: 0000200000c00000 R11: 0000000000000246 R12: ffffffffffffffd0
R13: 0000000000000002 R14: 00007ffe1cb5f520 R15: 00007ff266cbb000
============================================================

Link: https://lkml.kernel.org/r/20250523-warning_in_page_counter_cancel-v2-1-b6df1a8cfefd@igalia.com
Link: https://people.igalia.com/rcn/kernel_logs/20250422__WARNING_in_page_counter_cancel__repro.c [1]
Link: https://lore.kernel.org/all/67000a50.050a0220.49194.048d.GAE@google.com/ [2]
Signed-off-by: Ricardo Cañuelo Navarro
Suggested-by: Lorenzo Stoakes
Reviewed-by: Liam R. Howlett
Cc: Florent Revest
Cc: Jann Horn
Cc: Oscar Salvador
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h |  5 +++++
 mm/hugetlb.c            | 16 +++++++++++++++-
 mm/mremap.c             |  3 +--
 mm/vma.c                |  1 +
 4 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8f3ac832ee7f..4861a7e304bb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -275,6 +275,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void fixup_hugetlb_reservations(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -468,6 +469,10 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 55e6796b24d0..6a3cf7935c14 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1250,7 +1250,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 /*
  * Reset and decrement one ref on hugepage private reservation.
  * Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
  * same sized vma. It should never come here with last ref on the
  * reservation.
  */
@@ -7939,3 +7939,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
 			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
 }
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		clear_vma_resv_huge_pages(vma);
+}
diff --git a/mm/mremap.c b/mm/mremap.c
index 7db9da609c84..0d4948b720e2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1188,8 +1188,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
 		mremap_userfaultfd_prep(new_vma, vrm->uf);
 	}
 
-	if (is_vm_hugetlb_page(vma))
-		clear_vma_resv_huge_pages(vma);
+	fixup_hugetlb_reservations(vma);
 
 	/* Tell pfnmap has moved from this vma */
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
diff --git a/mm/vma.c b/mm/vma.c
index 839d12f02c88..a468d4c29c0c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1834,6 +1834,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return new_vma;
 
 out_vma_link:
+	fixup_hugetlb_reservations(new_vma);
 	vma_close(new_vma);
 
 	if (new_vma->vm_file)
-- cgit v1.2.3
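For context, the operation involved looks like the userspace sketch below.
This is not the syzbot reproducer from [1] and it does not force the
copy_vma() failure path; it only shows the hugetlb-plus-mremap() combination
the fix covers.  It assumes a 2 MiB hugepage size and at least two
preallocated hugepages (vm.nr_hugepages).

/*
 * Simplified sketch: move a hugetlb-backed mapping with mremap(), which
 * goes through copy_vma() in the kernel.  Requires preallocated hugepages.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;	/* one 2 MiB hugepage */
	void *old, *target, *moved;

	old = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}

	/* Find a hugepage-aligned destination, then hand it to mremap(). */
	target = mmap(NULL, len, PROT_NONE,
		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (target == MAP_FAILED) {
		perror("mmap(destination)");
		return 1;
	}
	munmap(target, len);

	/* The move path duplicates the hugetlb VMA via copy_vma(). */
	moved = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, target);
	if (moved == MAP_FAILED) {
		perror("mremap");
		return 1;
	}

	printf("hugetlb mapping moved from %p to %p\n", old, moved);
	munmap(moved, len);
	return 0;
}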