From 55284f70134f01fdc9cc4c4905551cc1f37abd34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Apr 2025 18:29:02 +0200 Subject: mm: Add vmalloc_huge_node() To enable node specific hash-tables using huge pages if possible. [bigeasy: use __vmalloc_node_range_noprof(), add nommu bits, inline vmalloc_huge] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250416162921.513656-3-bigeasy@linutronix.de --- include/linux/vmalloc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux/vmalloc.h') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 31e9ffd936e3..de95794777ad 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -168,8 +168,13 @@ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_m int node, const void *caller) __alloc_size(1); #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); +#define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__)) + +static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) +{ + return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE); +} extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vmalloc_array(...) 
alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) -- cgit v1.2.3 From a0309faf1cb0622cac7c820150b7abf2024acff5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 17:11:07 -0700 Subject: mm: vmalloc: support more granular vrealloc() sizing Introduce struct vm_struct::requested_size so that the requested (re)allocation size is retained separately from the allocated area size. This means that KASAN will correctly poison the correct spans of requested bytes. This also means we can support growing the usable portion of an allocation that can already be supported by the existing area's existing allocation. Link: https://lkml.kernel.org/r/20250426001105.it.679-kees@kernel.org Fixes: 3ddc2fefe6f3 ("mm: vmalloc: implement vrealloc()") Signed-off-by: Kees Cook Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/20250408192503.6149a816@outsider.home/ Reviewed-by: Danilo Krummrich Cc: Michal Hocko Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 31 ++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/linux/vmalloc.h') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 31e9ffd936e3..5ca8d4dd149d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -61,6 +61,7 @@ struct vm_struct { unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; + unsigned long requested_size; }; struct vmap_area { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ed720a787ec..2d7511654831 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va_size(va); + vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -3133,6 +3133,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, area->flags = flags; area->caller = caller; + 
area->requested_size = requested_size; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { @@ -4063,6 +4064,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { + struct vm_struct *vm = NULL; + size_t alloced_size = 0; size_t old_size = 0; void *n; @@ -4072,15 +4075,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) } if (p) { - struct vm_struct *vm; - vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } - old_size = get_vm_area_size(vm); + alloced_size = get_vm_area_size(vm); + old_size = vm->requested_size; + if (WARN(alloced_size < old_size, + "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) + return NULL; } /* @@ -4088,14 +4093,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) + /* Zero out "freed" memory. */ + if (want_init_on_free()) memset((void *)p + size, 0, old_size - size); + vm->requested_size = size; kasan_poison_vmalloc(p + size, old_size - size); - kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); return (void *)p; } + /* + * We already have the bytes available in the allocation; use them. + */ + if (size <= alloced_size) { + kasan_unpoison_vmalloc(p + old_size, size - old_size, + KASAN_VMALLOC_PROT_NORMAL); + /* Zero out "alloced" memory. */ + if (want_init_on_alloc(flags)) + memset((void *)p + old_size, 0, size - old_size); + vm->requested_size = size; + } + /* TODO: Grow the vm_area, i.e. allocate and map additional pages. 
*/ n = __vmalloc_noprof(size, flags); if (!n) -- cgit v1.2.3 From 2fba13371fe80b4d0d533a502e460ce0e936d024 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 22 Apr 2025 09:18:16 +0100 Subject: mm/vmalloc: Gracefully unmap huge ptes Commit f7ee1f13d606 ("mm/vmalloc: enable mapping of huge pages at pte level in vmap") added its support by reusing the set_huge_pte_at() API, which is otherwise only used for user mappings. But when unmapping those huge ptes, it continued to call ptep_get_and_clear(), which is a layering violation. To date, the only arch to implement this support is powerpc, and it all happens to work OK for it. But arm64's implementation of ptep_get_and_clear() cannot be safely used to clear a previous set_huge_pte_at(). So let's introduce a new arch opt-in function, arch_vmap_pte_range_unmap_size(), which can provide the size of a (present) pte. Then we can call huge_ptep_get_and_clear() to tear it down properly. Note that if vunmap_range() is called with a range that starts in the middle of a huge pte-mapped page, we must unmap the entire huge page so the behaviour is consistent with pmd and pud block mappings. In this case emit a warning just like we do for pmd/pud mappings. 
Reviewed-by: Anshuman Khandual Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Tested-by: Luiz Capitulino Link: https://lore.kernel.org/r/20250422081822.1836315-9-ryan.roberts@arm.com Signed-off-by: Will Deacon --- include/linux/vmalloc.h | 8 ++++++++ mm/vmalloc.c | 18 ++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux/vmalloc.h') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 31e9ffd936e3..16dd4cba64f2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -113,6 +113,14 @@ static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, uns } #endif +#ifndef arch_vmap_pte_range_unmap_size +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, + pte_t *ptep) +{ + return PAGE_SIZE; +} +#endif + #ifndef arch_vmap_pte_supported_shift static inline int arch_vmap_pte_supported_shift(unsigned long size) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d60d3a29d149..fe2e2cc8da94 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -350,12 +350,26 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; + pte_t ptent; + unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); do { - pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); +#ifdef CONFIG_HUGETLB_PAGE + size = arch_vmap_pte_range_unmap_size(addr, pte); + if (size != PAGE_SIZE) { + if (WARN_ON(!IS_ALIGNED(addr, size))) { + addr = ALIGN_DOWN(addr, size); + pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT)); + } + ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size); + if (WARN_ON(end - addr < size)) + size = end - addr; + } else +#endif + ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); *mask |= 
PGTBL_PTE_MODIFIED; } -- cgit v1.2.3