From fb5eda0dfe2256b468fc4e95207a4df88457274f Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 11 Sep 2024 14:45:31 +0800 Subject: mm/kasan: Don't store metadata inside kmalloc object when slub_debug_orig_size is on For a kmalloc object, when both kasan and slub redzone sanity check are enabled, they could both manipulate its data space like storing kasan free meta data and setting up kmalloc redzone, and may affect accuracy of that object's 'orig_size'. As an accurate 'orig_size' will be needed by some function like krealloc() soon, save kasan's free meta data in slub's metadata area instead of inside object when 'orig_size' is enabled. This will make it easier to maintain/understand the code. Size wise, when these two options are both enabled, the slub meta data space is already huge, and this just slightly increase the overall size. Signed-off-by: Feng Tang Acked-by: Andrey Konovalov Signed-off-by: Vlastimil Babka --- mm/kasan/generic.c | 7 +++++-- mm/slab.h | 6 ++++++ mm/slub.c | 17 ----------------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 6310a180278b..8b9e348113b1 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -392,9 +392,12 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can * be touched after it was freed, or * 2. Object has a constructor, which means it's expected to - * retain its content until the next allocation. + * retain its content until the next allocation, or + * 3. It is from a kmalloc cache which enables the debug option + * to store original size. */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor) { + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + slub_debug_orig_size(cache)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); goto free_meta_added; diff --git a/mm/slab.h b/mm/slab.h index 6c6fe6d630ce..298567019485 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -695,6 +695,12 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +static inline bool slub_debug_orig_size(struct kmem_cache *s) +{ + return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && + (s->flags & SLAB_KMALLOC)); +} + #ifdef CONFIG_SLUB_DEBUG void skip_orig_size_check(struct kmem_cache *s, const void *object); #endif diff --git a/mm/slub.c b/mm/slub.c index 5b832512044e..83579b637578 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -230,12 +230,6 @@ static inline bool kmem_cache_debug(struct kmem_cache *s) return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } -static inline bool slub_debug_orig_size(struct kmem_cache *s) -{ - return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && - (s->flags & SLAB_KMALLOC)); -} - void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -760,21 +754,10 @@ static inline void set_orig_size(struct kmem_cache *s, void *object, unsigned int orig_size) { void *p = kasan_reset_tag(object); - unsigned int kasan_meta_size; if (!slub_debug_orig_size(s)) return; - /* - * KASAN can save its free meta data inside of the object at offset 0. - * If this meta data size is larger than 'orig_size', it will overlap - * the data redzone in [orig_size+1, object_size]. Thus, we adjust - * 'orig_size' to be as at least as big as KASAN's meta data. - */ - kasan_meta_size = kasan_metadata_size(s, true); - if (kasan_meta_size > orig_size) - orig_size = kasan_meta_size; - p += get_info_end(s); p += sizeof(struct track) * 2; -- cgit v1.2.3 From 1e4df1859ec2d09fdfe184e7a92a476f01f64e34 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 11 Sep 2024 14:45:33 +0800 Subject: mm/slub: Move krealloc() and related code to slub.c This is a preparation for the following refactoring of krealloc(), for more efficient function calling as it will call some internal functions defined in slub.c. Signed-off-by: Feng Tang Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 84 -------------------------------------------------------- mm/slub.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 552b92dfdac7..f1f90491df03 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1190,90 +1190,6 @@ module_init(slab_proc_init); #endif /* CONFIG_SLUB_DEBUG */ -static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - size_t ks; - - /* Check for double-free before calling ksize. */ - if (likely(!ZERO_OR_NULL_PTR(p))) { - if (!kasan_check_byte(p)) - return NULL; - ks = ksize(p); - } else - ks = 0; - - /* If the object still fits, repoison it precisely. */ - if (ks >= new_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) { - kasan_disable_current(); - memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); - kasan_enable_current(); - } - - p = kasan_krealloc((void *)p, new_size, flags); - return (void *)p; - } - - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); - if (ret && p) { - /* Disable KASAN checks as the object's redzone is accessed. */ - kasan_disable_current(); - memcpy(ret, kasan_reset_tag(p), ks); - kasan_enable_current(); - } - - return ret; -} - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size - * is 0 and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * This is the case, since krealloc() only knows about the bucket size of an - * allocation (but not the exact size it was allocated with) and hence - * implements the following semantics for shrinking and growing buffers with - * __GFP_ZERO. - * - * new bucket - * 0 size size - * |--------|----------------| - * | keep | zero | - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc_noprof); - /** * kfree_sensitive - Clear sensitive information in memory before freeing * @p: object to free memory of diff --git a/mm/slub.c b/mm/slub.c index 83579b637578..2ccb016c7f99 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4711,6 +4711,90 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks; + + /* Check for double-free before calling ksize. */ + if (likely(!ZERO_OR_NULL_PTR(p))) { + if (!kasan_check_byte(p)) + return NULL; + ks = ksize(p); + } else + ks = 0; + + /* If the object still fits, repoison it precisely. */ + if (ks >= new_size) { + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); + kasan_enable_current(); + } + + p = kasan_krealloc((void *)p, new_size, flags); + return (void *)p; + } + + ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), ks); + kasan_enable_current(); + } + + return ret; +} + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * This is the case, since krealloc() only knows about the bucket size of an + * allocation (but not the exact size it was allocated with) and hence + * implements the following semantics for shrinking and growing buffers with + * __GFP_ZERO. + * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; -- cgit v1.2.3 From b4b797d87745f79e1d3c945dc0db4093c9ae9904 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 9 Oct 2024 11:54:55 +0800 Subject: mm/slab: remove duplicate check in create_cache() The WARN_ON() check in static function create_cache() is done by its only parent __kmem_cache_create_args() before calling it. if (... || WARN_ON(... || object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; ... s = create_cache(cache_name, object_size, args, flags); Therefore, the WARN_ON() check in create_cache() can be safely removed. Signed-off-by: Zhen Lei Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index f1f90491df03..9e22d1266f6a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -222,9 +222,6 @@ static struct kmem_cache *create_cache(const char *name, struct kmem_cache *s; int err; - if (WARN_ON(args->useroffset + args->usersize > object_size)) - args->useroffset = args->usersize = 0; - /* If a custom freelist pointer is requested make sure it's sane. */ err = -EINVAL; if (args->use_freeptr_offset && -- cgit v1.2.3 From b6da940130579769a42605b2c7f529b6f14ef1f8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 9 Oct 2024 16:29:37 +0200 Subject: mm, slab: add kerneldocs for common SLAB_ flags We have many SLAB_ flags but many are used only internally, by kunit tests or debugging subsystems cooperating with slab, or are set according to slab_debug boot parameter. Create kerneldocs for the commonly used flags that may be passed to kmem_cache_create(). SLAB_TYPESAFE_BY_RCU already had a detailed description, so turn it to a kerneldoc. Add some details for SLAB_ACCOUNT, SLAB_RECLAIM_ACCOUNT and SLAB_HWCACHE_ALIGN. Reference them from the __kmem_cache_create_args() kerneldoc. Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 60 +++++++++++++++++++++++++++++++++++----------------- mm/slab_common.c | 14 +++++++++++- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index b35e2db7eb0e..49e9fb93e864 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -77,7 +77,17 @@ enum _slab_flag_bits { #define SLAB_POISON __SLAB_FLAG_BIT(_SLAB_POISON) /* Indicate a kmalloc slab */ #define SLAB_KMALLOC __SLAB_FLAG_BIT(_SLAB_KMALLOC) -/* Align objs on cache lines */ +/** + * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries. + * + * Sufficiently large objects are aligned on cache line boundary. For object + * size smaller than a half of cache line size, the alignment is on the half of + * cache line size. In general, if object size is smaller than 1/2^n of cache + * line size, the alignment is adjusted to 1/2^n. + * + * If explicit alignment is also requested by the respective + * &struct kmem_cache_args field, the greater of both is alignments is applied. + */ #define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) /* Use GFP_DMA memory */ #define SLAB_CACHE_DMA __SLAB_FLAG_BIT(_SLAB_CACHE_DMA) @@ -87,8 +97,8 @@ enum _slab_flag_bits { #define SLAB_STORE_USER __SLAB_FLAG_BIT(_SLAB_STORE_USER) /* Panic if kmem_cache_create() fails */ #define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) -/* - * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! +/** + * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -99,20 +109,22 @@ enum _slab_flag_bits { * stays valid, the trick to using this is relying on an independent * object validation pass. Something like: * - * begin: - * rcu_read_lock(); - * obj = lockless_lookup(key); - * if (obj) { - * if (!try_get_ref(obj)) // might fail for free objects - * rcu_read_unlock(); - * goto begin; + * :: + * + * begin: + * rcu_read_lock(); + * obj = lockless_lookup(key); + * if (obj) { + * if (!try_get_ref(obj)) // might fail for free objects + * rcu_read_unlock(); + * goto begin; * - * if (obj->key != key) { // not the object we expected - * put_ref(obj); - * rcu_read_unlock(); - * goto begin; - * } - * } + * if (obj->key != key) { // not the object we expected + * put_ref(obj); + * rcu_read_unlock(); + * goto begin; + * } + * } * rcu_read_unlock(); * * This is useful if we need to approach a kernel structure obliquely, @@ -137,7 +149,6 @@ enum _slab_flag_bits { * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -/* Defer freeing slabs to RCU */ #define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) /* Trace allocations and frees */ #define SLAB_TRACE __SLAB_FLAG_BIT(_SLAB_TRACE) @@ -170,7 +181,12 @@ enum _slab_flag_bits { #else # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif -/* Account to memcg */ +/** + * define SLAB_ACCOUNT - Account allocations to memcg. + * + * All object allocations from this cache will be memcg accounted, regardless of + * __GFP_ACCOUNT being or not being passed to individual allocations. + */ #ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else @@ -197,7 +213,13 @@ enum _slab_flag_bits { #endif /* The following flags affect the page allocator grouping pages by mobility */ -/* Objects are reclaimable */ +/** + * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. + * + * Use this flag for caches that have an associated shrinker. As a result, slab + * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by + * mobility, and are accounted in SReclaimable counter in /proc/meminfo + */ #ifndef CONFIG_SLUB_TINY #define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT) #else diff --git a/mm/slab_common.c b/mm/slab_common.c index 9e22d1266f6a..62878edb0a81 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -254,11 +254,23 @@ out: * @object_size: The size of objects to be created in this cache. * @args: Additional arguments for the cache creation (see * &struct kmem_cache_args). - * @flags: See %SLAB_* flags for an explanation of individual @flags. + * @flags: See the desriptions of individual flags. The common ones are listed + * in the description below. * * Not to be called directly, use the kmem_cache_create() wrapper with the same * parameters. * + * Commonly used @flags: + * + * &SLAB_ACCOUNT - Account allocations to memcg. + * + * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries. + * + * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. + * + * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed + * by a grace period - see the full description before using. + * * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. -- cgit v1.2.3 From f7c80fad6c2b64cf73361772dbd30493879e85f4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 1 Oct 2024 12:08:06 -0700 Subject: SLUB: Add support for per object memory policies The old SLAB allocator used to support memory policies on a per allocation bases. In SLUB the memory policies are applied on a per page frame / folio bases. Doing so avoids having to check memory policies in critical code paths for kmalloc and friends. This worked on general well on Intel/AMD/PowerPC because the interconnect technology is mature and can minimize the latencies through intelligent caching even if a small object is not placed optimally. However, on ARM we have an emergence of new NUMA interconnect technology based more on embedded devices. Caching of remote content can currently be ineffective using the standard building blocks / mesh available on that platform. Such architectures benefit if each slab object is individually placed according to memory policies and other restrictions. This patch adds another kernel parameter slab_strict_numa If that is set then a static branch is activated that will cause the hotpaths of the allocator to evaluate the current memory allocation policy. Each object will be properly placed by paying the price of extra processing and SLUB will no longer defer to the page allocator to apply memory policies at the folio level. This patch improves performance of memcached running on Ampere Altra 2P system (ARM Neoverse N1 processor) by 3.6% due to accurate placement of small kernel objects. Tested-by: Huang Shijie Signed-off-by: Christoph Lameter (Ampere) Signed-off-by: Vlastimil Babka --- Documentation/admin-guide/kernel-parameters.txt | 10 ++++++ Documentation/mm/slub.rst | 9 ++++++ mm/slub.c | 43 +++++++++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..9be54e9a55d3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6147,6 +6147,16 @@ For more information see Documentation/mm/slub.rst. (slub_nomerge legacy name also accepted for now) + slab_strict_numa [MM] + Support memory policies on a per object level + in the slab allocator. The default is for memory + policies to be applied at the folio level when + a new folio is needed or a partial folio is + retrieved from the lists. Increases overhead + in the slab fastpaths but gains more accurate + NUMA kernel object placement which helps with slow + interconnects in NUMA systems. + slram= [HW,MTD] smart2= [HW] diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 60d350d08362..84ca1dc94e5e 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -175,6 +175,15 @@ can be influenced by kernel parameters: ``slab_max_order`` to 0, what cause minimum possible order of slabs allocation. +``slab_strict_numa`` + Enables the application of memory policies on each + allocation. This results in more accurate placement of + objects which may result in the reduction of accesses + to remote nodes. The default is to only apply memory + policies at the folio level when a new folio is acquired + or a folio is retrieved from the lists. Enabling this + option reduces the fastpath performance of the slab allocator. + SLUB Debug output ================= diff --git a/mm/slub.c b/mm/slub.c index 5b832512044e..d4b1680ef17a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -218,6 +218,10 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +#ifdef CONFIG_NUMA +static DEFINE_STATIC_KEY_FALSE(strict_numa); +#endif + /* Structure holding parameters for get_partial() call chain */ struct partial_context { gfp_t flags; @@ -3956,6 +3960,28 @@ redo: object = c->freelist; slab = c->slab; +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If existing slab + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || !slab || + !node_isset(slab_nid(slab), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); @@ -5602,6 +5628,23 @@ static int __init setup_slub_min_objects(char *str) __setup("slab_min_objects=", setup_slub_min_objects); __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); +#ifdef CONFIG_NUMA +static int __init setup_slab_strict_numa(char *str) +{ + if (nr_node_ids > 1) { + static_branch_enable(&strict_numa); + pr_info("SLUB: Strict NUMA enabled.\n"); + } else { + pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); + } + + return 1; +} + +__setup("slab_strict_numa", setup_slab_strict_numa); +#endif + + #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied -- cgit v1.2.3 From 9ef8568bd7cddf59e1ff6ee1b7d799539e331b73 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 16 Oct 2024 23:41:50 +0800 Subject: mm/slub: Consider kfence case for get_orig_size() When 'orig_size' of kmalloc object is enabled by debug option, it should either contains the actual requested size or the cache's 'object_size'. But it's not true if that object is a kfence-allocated one, and the data at 'orig_size' offset of metadata could be zero or other values. This is not a big issue for current 'orig_size' usage, as init_object() and check_object() during alloc/free process will be skipped for kfence addresses. But it could cause trouble for other usage in future. Use the existing kfence helper kfence_ksize() which can return the real original request size. Signed-off-by: Feng Tang Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 2ccb016c7f99..e186e7a67597 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -768,6 +768,9 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) { void *p = kasan_reset_tag(object); + if (is_kfence_address(object)) + return kfence_ksize(object); + if (!slub_debug_orig_size(s)) return s->object_size; -- cgit v1.2.3 From 5474d33ca48e7764ccc38b2ac089863cca83280a Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 16 Oct 2024 23:41:51 +0800 Subject: mm/slub: Improve redzone check and zeroing for krealloc() For current krealloc(), one problem is its caller doesn't pass the old request size, say the object is 64 bytes kmalloc one, but caller may only requested 48 bytes. Then when krealloc() shrinks or grows in the same object, or allocate a new bigger object, it lacks this 'original size' information to do accurate data preserving or zeroing (when __GFP_ZERO is set). Thus with slub debug redzone and object tracking enabled, parts of the object after krealloc() might contain redzone data instead of zeroes, which is violating the __GFP_ZERO guarantees. Good thing is in this case, kmalloc caches do have this 'orig_size' feature. So solve the problem by utilize 'org_size' to do accurate data zeroing and preserving. [Thanks to syzbot and V, Narasimhan for discovering kfence and big kmalloc related issues in early patch version] Suggested-by: Vlastimil Babka Signed-off-by: Feng Tang Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 80 +++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e186e7a67597..4284cbe41d0d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4718,34 +4718,66 @@ static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; - size_t ks; + size_t ks = 0; + int orig_size = 0; + struct kmem_cache *s = NULL; - /* Check for double-free before calling ksize. */ - if (likely(!ZERO_OR_NULL_PTR(p))) { - if (!kasan_check_byte(p)) - return NULL; - ks = ksize(p); - } else - ks = 0; + if (unlikely(ZERO_OR_NULL_PTR(p))) + goto alloc_new; - /* If the object still fits, repoison it precisely. */ - if (ks >= new_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) { - kasan_disable_current(); - memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); - kasan_enable_current(); + /* Check for double-free. */ + if (!kasan_check_byte(p)) + return NULL; + + if (is_kfence_address(p)) { + ks = orig_size = kfence_ksize(p); + } else { + struct folio *folio; + + folio = virt_to_folio(p); + if (unlikely(!folio_test_slab(folio))) { + /* Big kmalloc object */ + WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != folio_address(folio)); + ks = folio_size(folio); + } else { + s = folio_slab(folio)->slab_cache; + orig_size = get_orig_size(s, (void *)p); + ks = s->object_size; } + } - p = kasan_krealloc((void *)p, new_size, flags); - return (void *)p; + /* If the old object doesn't fit, allocate a bigger one */ + if (new_size > ks) + goto alloc_new; + + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + if (orig_size && orig_size < new_size) + memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size); + else + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); + kasan_enable_current(); } + /* Setup kmalloc redzone when needed */ + if (s && slub_debug_orig_size(s)) { + set_orig_size(s, (void *)p, new_size); + if (s->flags & SLAB_RED_ZONE && new_size < ks) + memset_no_sanitize_memory(kasan_reset_tag(p) + new_size, + SLUB_RED_ACTIVE, ks - new_size); + } + + p = kasan_krealloc(p, new_size, flags); + return (void *)p; + +alloc_new: ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); - memcpy(ret, kasan_reset_tag(p), ks); + memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); kasan_enable_current(); } @@ -4766,16 +4798,20 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * - * This is the case, since krealloc() only knows about the bucket size of an - * allocation (but not the exact size it was allocated with) and hence - * implements the following semantics for shrinking and growing buffers with - * __GFP_ZERO. + * When slub_debug_orig_size() is off, krealloc() only knows about the bucket + * size of an allocation (but not the exact size it was allocated with) and + * hence implements the following semantics for shrinking and growing buffers + * with __GFP_ZERO. * * new bucket * 0 size size * |--------|----------------| * | keep | zero | * + * Otherwise, the original allocation size 'orig_size' could be used to + * precisely clear the requested size, and the new size will also be stored + * as the new 'orig_size'. + * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * -- cgit v1.2.3 From 080c8579c37ec9c11cfb8b9eb25b74912b33dc06 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 16 Oct 2024 23:41:52 +0800 Subject: mm/slub, kunit: Add testcase for krealloc redzone and zeroing Danilo Krummrich raised issue about krealloc+GFP_ZERO [1], and Vlastimil suggested to add some test case which can sanity test the kmalloc-redzone and zeroing by utilizing the kmalloc's 'orig_size' debug feature. It covers the grow and shrink case of krealloc() re-using current kmalloc object, and the case of re-allocating a new bigger object. [1]. https://lore.kernel.org/lkml/20240812223707.32049-1-dakr@kernel.org/ Suggested-by: Vlastimil Babka Signed-off-by: Feng Tang Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index 33564f965958..f11691315c2f 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -192,6 +192,47 @@ static void test_leak_destroy(struct kunit *test) KUNIT_EXPECT_EQ(test, 2, slab_errors); } +static void test_krealloc_redzone_zeroing(struct kunit *test) +{ + u8 *p; + int i; + struct kmem_cache *s = test_kmem_cache_create("TestSlub_krealloc", 64, + SLAB_KMALLOC|SLAB_STORE_USER|SLAB_RED_ZONE); + + p = alloc_hooks(__kmalloc_cache_noprof(s, GFP_KERNEL, 48)); + memset(p, 0xff, 48); + + kasan_disable_current(); + OPTIMIZER_HIDE_VAR(p); + + /* Test shrink */ + p = krealloc(p, 40, GFP_KERNEL | __GFP_ZERO); + for (i = 40; i < 64; i++) + KUNIT_EXPECT_EQ(test, p[i], SLUB_RED_ACTIVE); + + /* Test grow within the same 64B kmalloc object */ + p = krealloc(p, 56, GFP_KERNEL | __GFP_ZERO); + for (i = 40; i < 56; i++) + KUNIT_EXPECT_EQ(test, p[i], 0); + for (i = 56; i < 64; i++) + KUNIT_EXPECT_EQ(test, p[i], SLUB_RED_ACTIVE); + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 0, slab_errors); + + memset(p, 0xff, 56); + /* Test grow with allocating a bigger 128B object */ + p = krealloc(p, 112, GFP_KERNEL | __GFP_ZERO); + for (i = 0; i < 56; i++) + KUNIT_EXPECT_EQ(test, p[i], 0xff); + for (i = 56; i < 112; i++) + KUNIT_EXPECT_EQ(test, p[i], 0); + + kfree(p); + kasan_enable_current(); + kmem_cache_destroy(s); +} + static int test_init(struct kunit *test) { slab_errors = 0; @@ -214,6 +255,7 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_kmalloc_redzone_access), KUNIT_CASE(test_kfree_rcu), KUNIT_CASE(test_leak_destroy), + KUNIT_CASE(test_krealloc_redzone_zeroing), {} }; -- cgit v1.2.3 From dbc16915279a548a204154368da23d402c141c81 Mon Sep 17 00:00:00 2001 From: "yuan.gao" Date: Fri, 18 Oct 2024 14:44:35 +0800 Subject: mm/slub: Avoid list corruption when removing a slab from the full list Boot with slub_debug=UFPZ. If allocated object failed in alloc_consistency_checks, all objects of the slab will be marked as used, and then the slab will be removed from the partial list. When an object belonging to the slab got freed later, the remove_full() function is called. Because the slab is neither on the partial list nor on the full list, it eventually lead to a list corruption (actually a list poison being detected). So we need to mark and isolate the slab page with metadata corruption, do not put it back in circulation. Because the debug caches avoid all the fastpaths, reusing the frozen bit to mark slab page with metadata corruption seems to be fine. [ 4277.385669] list_del corruption, ffffea00044b3e50->next is LIST_POISON1 (dead000000000100) [ 4277.387023] ------------[ cut here ]------------ [ 4277.387880] kernel BUG at lib/list_debug.c:56! [ 4277.388680] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 4277.389562] CPU: 5 PID: 90 Comm: kworker/5:1 Kdump: loaded Tainted: G OE 6.6.1-1 #1 [ 4277.392113] Workqueue: xfs-inodegc/vda1 xfs_inodegc_worker [xfs] [ 4277.393551] RIP: 0010:__list_del_entry_valid_or_report+0x7b/0xc0 [ 4277.394518] Code: 48 91 82 e8 37 f9 9a ff 0f 0b 48 89 fe 48 c7 c7 28 49 91 82 e8 26 f9 9a ff 0f 0b 48 89 fe 48 c7 c7 58 49 91 [ 4277.397292] RSP: 0018:ffffc90000333b38 EFLAGS: 00010082 [ 4277.398202] RAX: 000000000000004e RBX: ffffea00044b3e50 RCX: 0000000000000000 [ 4277.399340] RDX: 0000000000000002 RSI: ffffffff828f8715 RDI: 00000000ffffffff [ 4277.400545] RBP: ffffea00044b3e40 R08: 0000000000000000 R09: ffffc900003339f0 [ 4277.401710] R10: 0000000000000003 R11: ffffffff82d44088 R12: ffff888112cf9910 [ 4277.402887] R13: 0000000000000001 R14: 0000000000000001 R15: ffff8881000424c0 [ 4277.404049] FS: 0000000000000000(0000) GS:ffff88842fd40000(0000) knlGS:0000000000000000 [ 4277.405357] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4277.406389] CR2: 00007f2ad0b24000 CR3: 0000000102a3a006 CR4: 00000000007706e0 [ 4277.407589] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4277.408780] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4277.410000] PKRU: 55555554 [ 4277.410645] Call Trace: [ 4277.411234] [ 4277.411777] ? die+0x32/0x80 [ 4277.412439] ? do_trap+0xd6/0x100 [ 4277.413150] ? __list_del_entry_valid_or_report+0x7b/0xc0 [ 4277.414158] ? do_error_trap+0x6a/0x90 [ 4277.414948] ? __list_del_entry_valid_or_report+0x7b/0xc0 [ 4277.415915] ? exc_invalid_op+0x4c/0x60 [ 4277.416710] ? __list_del_entry_valid_or_report+0x7b/0xc0 [ 4277.417675] ? asm_exc_invalid_op+0x16/0x20 [ 4277.418482] ? __list_del_entry_valid_or_report+0x7b/0xc0 [ 4277.419466] ? __list_del_entry_valid_or_report+0x7b/0xc0 [ 4277.420410] free_to_partial_list+0x515/0x5e0 [ 4277.421242] ? xfs_iext_remove+0x41a/0xa10 [xfs] [ 4277.422298] xfs_iext_remove+0x41a/0xa10 [xfs] [ 4277.423316] ? xfs_inodegc_worker+0xb4/0x1a0 [xfs] [ 4277.424383] xfs_bmap_del_extent_delay+0x4fe/0x7d0 [xfs] [ 4277.425490] __xfs_bunmapi+0x50d/0x840 [xfs] [ 4277.426445] xfs_itruncate_extents_flags+0x13a/0x490 [xfs] [ 4277.427553] xfs_inactive_truncate+0xa3/0x120 [xfs] [ 4277.428567] xfs_inactive+0x22d/0x290 [xfs] [ 4277.429500] xfs_inodegc_worker+0xb4/0x1a0 [xfs] [ 4277.430479] process_one_work+0x171/0x340 [ 4277.431227] worker_thread+0x277/0x390 [ 4277.431962] ? __pfx_worker_thread+0x10/0x10 [ 4277.432752] kthread+0xf0/0x120 [ 4277.433382] ? __pfx_kthread+0x10/0x10 [ 4277.434134] ret_from_fork+0x2d/0x50 [ 4277.434837] ? __pfx_kthread+0x10/0x10 [ 4277.435566] ret_from_fork_asm+0x1b/0x30 [ 4277.436280] Fixes: 643b113849d8 ("slub: enable tracking of full slabs") Suggested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Suggested-by: Vlastimil Babka Cc: Signed-off-by: yuan.gao Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Christoph Lameter Signed-off-by: Vlastimil Babka --- mm/slab.h | 5 +++++ mm/slub.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index 298567019485..632fedd71fea 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -73,6 +73,11 @@ struct slab { struct { unsigned inuse:16; unsigned objects:15; + /* + * If slab debugging is enabled then the + * frozen bit can be reused to indicate + * that the slab was corrupted + */ unsigned frozen:1; }; }; diff --git a/mm/slub.c b/mm/slub.c index 4284cbe41d0d..ccbdd7eb37a8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1409,6 +1409,11 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) slab->inuse, slab->objects); return 0; } + if (slab->frozen) { + slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed"); + return 0; + } + /* Slab_pad_check fixes things up after itself */ slab_pad_check(s, slab); return 1; @@ -1589,6 +1594,7 @@ bad: slab_fix(s, "Marking all objects used"); slab->inuse = slab->objects; slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ } return false; } @@ -2730,7 +2736,8 @@ static void *alloc_single_from_partial(struct kmem_cache *s, slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - remove_partial(n, slab); + if (folio_test_slab(slab_folio(slab))) + remove_partial(n, slab); return NULL; } -- cgit v1.2.3 From 2420baa8e0460b1c35008d6bf21b4e6bff023867 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Fri, 1 Nov 2024 22:08:44 +0900 Subject: mm/slab: Allow cache creation to proceed even if sysfs registration fails When kobject_init_and_add() fails during cache creation, kobj->name can be leaked because SLUB does not call kobject_put(), which should be invoked per the kobject API documentation. This has a bit of historical context, though; SLUB does not call kobject_put() to avoid double-free for struct kmem_cache because 1) simply calling it would free all resources related to the cache, and 2) struct kmem_cache descriptor is always freed by cache_cache()'s error handling path, causing struct kmem_cache to be freed twice. This issue can be reproduced by creating new slab caches while applying failslab for kernfs_node_cache. This makes kobject_add_varg() succeed, but causes kobject_add_internal() to fail in kobject_init_and_add() during cache creation. Historically, this issue has attracted developers' attention several times. Each time a fix addressed either the leak or the double-free, it caused the other issue. Let's summarize a bit of history here: The leak has existed since the early days of SLUB. Commit 54b6a731025f ("slub: fix leak of 'name' in sysfs_slab_add") introduced a double-free bug while fixing the leak. Commit 80da026a8e5d ("mm/slub: fix slab double-free in case of duplicate sysfs filename") re-introduced the leak while fixing the double-free error. Commit dde3c6b72a16 ("mm/slub: fix a memory leak in sysfs_slab_add()") fixed the memory leak, but it was later reverted by commit 757fed1d0898 ("Revert "mm/slub: fix a memory leak in sysfs_slab_add()"") to avoid the double-free error. This is where we are now: we've chosen a memory leak over a double-free. To resolve this memory leak, skip creating sysfs files if it fails and continue with cache creation regardless (as suggested by Christoph). This resolves the memory leak because both the cache and the kobject remain alive on kobject_init_and_add() failure. If SLUB tries to create an alias for a cache without sysfs files, its symbolic link will not be generated. Since a slab cache might not have associated sysfs files, call kobject_del() only if such files exist. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ccbdd7eb37a8..b41bc989f205 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6073,7 +6073,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, s = find_mergeable(size, align, flags, name, ctor); if (s) { if (sysfs_slab_alias(s, name)) - return NULL; + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); s->refcount++; @@ -6155,15 +6156,18 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + err = 0; + /* Mutex is not taken during early boot */ - if (slab_state <= UP) { - err = 0; + if (slab_state <= UP) goto out; - } - err = sysfs_slab_add(s); - if (err) - goto out; + /* + * Failing to create sysfs files is not critical to SLUB functionality. + * If it fails, proceed with cache creation without these files. + */ + if (sysfs_slab_add(s)) + pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name); if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); @@ -7233,7 +7237,8 @@ out_del_kobj: void sysfs_slab_unlink(struct kmem_cache *s) { - kobject_del(&s->kobj); + if (s->kobj.state_in_sysfs) + kobject_del(&s->kobj); } void sysfs_slab_release(struct kmem_cache *s) @@ -7262,6 +7267,11 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) * If we have a leftover link then remove it. */ sysfs_remove_link(&slab_kset->kobj, name); + /* + * The original cache may have failed to generate sysfs file. + * In that case, sysfs_create_link() returns -ENOENT and + * symbolic link creation is skipped. + */ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); } -- cgit v1.2.3 From 9008fe8fad8255edfdbecea32d7eb0485d939d0d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 20 Nov 2024 13:46:21 +0100 Subject: slab: Fix too strict alignment check in create_cache() On m68k, where the minimum alignment of unsigned long is 2 bytes: Kernel panic - not syncing: __kmem_cache_create_args: Failed to create slab 'io_kiocb'. Error -22 CPU: 0 UID: 0 PID: 1 Comm: swapper Not tainted 6.12.0-atari-03776-g7eaa1f99261a #1783 Stack from 0102fe5c: 0102fe5c 00514a2b 00514a2b ffffff00 00000001 0051f5ed 00425e78 00514a2b 0041eb74 ffffffea 00000310 0051f5ed ffffffea ffffffea 00601f60 00000044 0102ff20 000e7a68 0051ab8e 004383b8 0051f5ed ffffffea 000000b8 00000007 01020c00 00000000 000e77f0 0041e5f0 005f67c0 0051f5ed 000000b6 0102fef4 00000310 0102fef4 00000000 00000016 005f676c 0060a34c 00000010 00000004 00000038 0000009a 01000000 000000b8 005f668e 0102e000 00001372 0102ff88 Call Trace: [<00425e78>] dump_stack+0xc/0x10 [<0041eb74>] panic+0xd8/0x26c [<000e7a68>] __kmem_cache_create_args+0x278/0x2e8 [<000e77f0>] __kmem_cache_create_args+0x0/0x2e8 [<0041e5f0>] memset+0x0/0x8c [<005f67c0>] io_uring_init+0x54/0xd2 The minimal alignment of an integral type may differ from its size, hence is not safe to assume that an arbitrary freeptr_t (which is basically an unsigned long) is always aligned to 4 or 8 bytes. As nothing seems to require the additional alignment, it is safe to fix this by relaxing the check to the actual minimum alignment of freeptr_t. Fixes: aaa736b186239b7d ("io_uring: specify freeptr usage for SLAB_TYPESAFE_BY_RCU io_kiocb cache") Fixes: d345bd2e9834e2da ("mm: add kmem_cache_create_rcu()") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/37c588d4-2c32-4aad-a19e-642961f200d7@roeck-us.net Cc: Signed-off-by: Geert Uytterhoeven Tested-by: Guenter Roeck Reviewed-by: Jens Axboe Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 62878edb0a81..23a5d05e27c9 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -227,7 +227,7 @@ static struct kmem_cache *create_cache(const char *name, if (args->use_freeptr_offset && (args->freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || - !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)))) + !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t)))) goto out; err = -ENOMEM; -- cgit v1.2.3