diff options
Diffstat (limited to 'mm/slub.c')
-rw-r--r-- | mm/slub.c | 2394 |
1 files changed, 2250 insertions, 144 deletions
diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..a585d0ac45d4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include <kunit/test.h> #include <kunit/test-bug.h> #include <linux/sort.h> - +#include <linux/irq_work.h> +#include <linux/kprobes.h> #include <linux/debugfs.h> #include <trace/events/kmem.h> @@ -363,8 +364,12 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -389,6 +394,19 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ NR_SLUB_STAT_ITEMS }; @@ -409,7 +427,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -435,6 +453,37 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v) #endif } +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; + }; + struct kmem_cache *cache; + unsigned int size; + int node; /* only used for rcu_sheaf */ + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ +}; + /* * The slab lists for all objects. */ @@ -447,6 +496,7 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif + struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -454,6 +504,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) return s->node[node]; } +/* Get the barn of the current cpu's memory node */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + return get_node(s, numa_mem_id())->barn; +} + /* * Iterator over all nodes. The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) @@ -470,12 +526,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) */ static nodemask_t slab_nodes; -#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; -#endif + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); /******************************************************************** * Core slab cache functions @@ -822,6 +885,16 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) } #ifdef CONFIG_SLUB_DEBUG + +/* + * For debugging context when we want to check if the struct slab pointer + * appears to be valid. + */ +static inline bool validate_slab_ptr(struct slab *slab) +{ + return PageSlab(slab_page(slab)); +} + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -962,19 +1035,19 @@ static struct track *get_track(struct kmem_cache *s, void *object, } #ifdef CONFIG_STACKDEPOT -static noinline depot_stack_handle_t set_track_prepare(void) +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { depot_stack_handle_t handle; unsigned long entries[TRACK_ADDRS_COUNT]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); - handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + handle = stack_depot_save(entries, nr_entries, gfp_flags); return handle; } #else -static inline depot_stack_handle_t set_track_prepare(void) +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } @@ -996,9 +1069,9 @@ static void set_track_update(struct kmem_cache *s, void *object, } static __always_inline void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) { - depot_stack_handle_t handle = set_track_prepare(); + depot_stack_handle_t handle = set_track_prepare(gfp_flags); set_track_update(s, object, alloc, addr, handle); } @@ -1140,7 +1213,12 @@ static void object_err(struct kmem_cache *s, struct slab *slab, return; slab_bug(s, reason); - print_trailer(s, slab, object); + if (!object || !check_valid_pointer(s, slab, object)) { + print_slab_info(slab); + pr_err("Invalid pointer 0x%p\n", object); + } else { + print_trailer(s, slab, object); + } add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); WARN_ON(1); @@ -1444,15 +1522,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, return ret; } +/* + * Checks if the slab state looks sane. Assumes the struct slab pointer + * was either obtained in a way that ensures it's valid, or validated + * by validate_slab_ptr() + */ static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Not a valid slab page"); - return 0; - } - maxobj = order_objects(slab_order(slab), s->size); if (slab->objects > maxobj) { slab_err(s, slab, "objects %u > max %u", @@ -1648,17 +1726,15 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, return true; bad: - if (folio_test_slab(slab_folio(slab))) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. Marking all objects - * as used avoids touching the remaining objects. - */ - slab_fix(s, "Marking all objects used"); - slab->inuse = slab->objects; - slab->freelist = NULL; - slab->frozen = 1; /* mark consistency-failed slab as frozen */ - } + /* + * Let's do the best we can to avoid issues in the future. Marking all + * objects as used avoids touching the remaining objects. + */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ + return false; } @@ -1679,10 +1755,7 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 0; if (unlikely(s != slab->slab_cache)) { - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", - object); - } else if (!slab->slab_cache) { + if (!slab->slab_cache) { slab_err(NULL, slab, "No slab cache for object 0x%p", object); } else { @@ -1921,9 +1994,9 @@ static inline bool free_debug_processing(struct kmem_cache *s, static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } -static inline depot_stack_handle_t set_track_prepare(void) { return 0; } +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } static inline void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) {} + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1984,7 +2057,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, * objects with no tag reference. Mark all references in this * vector as empty to avoid warnings later on. */ - if (obj_exts & OBJEXTS_ALLOC_FAIL) { + if (obj_exts == OBJEXTS_ALLOC_FAIL) { unsigned int i; for (i = 0; i < objects; i++) @@ -2017,6 +2090,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2025,17 +2099,32 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ - if (new_slab) - mark_failed_objexts_alloc(slab); + mark_failed_objexts_alloc(slab); return -ENOMEM; } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2056,7 +2145,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2080,7 +2172,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. */ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2414,7 +2509,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2473,17 +2568,463 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, + s->sheaf_capacity), gfp); + + if (unlikely(!sheaf)) + return NULL; + + sheaf->cache = s; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. + * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. + */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +static void __rcu_free_sheaf_prepare(struct kmem_cache *s, + struct slab_sheaf *sheaf) +{ + bool init = slab_want_init_on_free(s); + void **p = &sheaf->objects[0]; + unsigned int i = 0; + + while (i < sheaf->size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, true))) { + p[i] = p[--sheaf->size]; + continue; + } + + i++; + } +} + +static void rcu_free_sheaf_nobarn(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + s = sheaf->cache; + + __rcu_free_sheaf_prepare(s, sheaf); + + sheaf_flush_unused(s, sheaf); + + free_empty_sheaf(s, sheaf); +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare, *rcu_free; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } + + if (pcs->rcu_free) { + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); + pcs->rcu_free = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + WARN_ON(pcs->rcu_free); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + if (!data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. + */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_full)) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. + */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + /* we don't repeat this check under barn->lock as it's not critical */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES) + return ERR_PTR(-E2BIG); + if (!data_race(barn->nr_empty)) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } else { + empty = ERR_PTR(-ENOMEM); + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + struct list_head empty_list; + struct list_head full_list; + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + INIT_LIST_HEAD(&empty_list); + INIT_LIST_HEAD(&full_list); + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + /* * Slab allocation and freeing */ static inline struct slab *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) + struct kmem_cache_order_objects oo, + bool allow_spin) { struct folio *folio; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) + if (unlikely(!allow_spin)) + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) folio = (struct folio *)alloc_frozen_pages(flags, order); else folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); @@ -2633,6 +3174,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -2652,7 +3194,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -2660,7 +3206,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - slab = alloc_slab_page(alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) return NULL; stat(s, ORDER_FALLBACK); @@ -2811,13 +3357,21 @@ static void *alloc_single_from_partial(struct kmem_cache *s, lockdep_assert_held(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return NULL; + } + } +#endif + object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - if (folio_test_slab(slab_folio(slab))) - remove_partial(n, slab); + remove_partial(n, slab); return NULL; } @@ -2829,33 +3383,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. */ -static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab, int orig_size) +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); unsigned long flags; void *object; + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + slab->frozen = 1; + defer_deactivate_slab(slab, NULL); + return NULL; + } object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object, orig_size)) + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory * corruption in theory could cause that. + * Leak memory of allocated slab. */ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); return NULL; + } - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); if (slab->inuse == slab->objects) add_full(s, n, slab); @@ -2896,7 +3464,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { if (!pfmemalloc_match(slab, pc->flags)) continue; @@ -3064,30 +3635,46 @@ static inline void note_cmpxchg_failure(const char *n, pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPTION - if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + if (IS_ENABLED(CONFIG_PREEMPTION) && + tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); - else -#endif - if (tid_to_event(tid) != tid_to_event(actual_tid)) + } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); - else + } else { pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); + } #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } static void init_kmem_cache_cpus(struct kmem_cache *s) { +#ifdef CONFIG_PREEMPT_RT + /* + * Register lockdep key for non-boot kmem caches to avoid + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() + */ + bool finegrain_lockdep = !init_section_contains(s, 1); +#else + /* + * Don't bother with different lockdep classes for each + * kmem_cache, since we only use local_trylock_irqsave(). + */ + bool finegrain_lockdep = false; +#endif int cpu; struct kmem_cache_cpu *c; + if (finegrain_lockdep) + lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); + if (finegrain_lockdep) + lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); } } @@ -3178,6 +3765,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. + * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { @@ -3262,7 +3890,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -3287,7 +3915,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); if (slab_to_put) { __put_partials(s, slab_to_put); @@ -3344,11 +3972,40 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) put_partials_cpu(s, c); } -struct slub_flush_work { - struct work_struct work; - struct kmem_cache *s; - bool skip; -}; +static inline void flush_this_cpu_slab(struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + put_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +#else /* CONFIG_SLUB_TINY */ +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } +static inline void flush_this_cpu_slab(struct kmem_cache *s) { } +#endif /* CONFIG_SLUB_TINY */ + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->rcu_free || pcs->main->size); +} /* * Flush cpu slab. @@ -3358,30 +4015,18 @@ struct slub_flush_work { static void flush_cpu_slab(struct work_struct *w) { struct kmem_cache *s; - struct kmem_cache_cpu *c; struct slub_flush_work *sfw; sfw = container_of(w, struct slub_flush_work, work); s = sfw->s; - c = this_cpu_ptr(s->cpu_slab); - if (c->slab) - flush_slab(s, c); + if (s->cpu_sheaves) + pcs_flush_all(s); - put_partials(s); + flush_this_cpu_slab(s); } -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->slab || slub_percpu_partial(c); -} - -static DEFINE_MUTEX(flush_lock); -static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); - static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; @@ -3392,7 +4037,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { sfw->skip = true; continue; } @@ -3419,6 +4064,74 @@ static void flush_all(struct kmem_cache *s) cpus_read_unlock(); } +static void flush_rcu_sheaf(struct work_struct *w) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_free; + struct slub_flush_work *sfw; + struct kmem_cache *s; + + sfw = container_of(w, struct slub_flush_work, work); + s = sfw->s; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); +} + + +/* needed for kvfree_rcu_barrier() */ +void flush_all_rcu_sheaves(void) +{ + struct slub_flush_work *sfw; + struct kmem_cache *s; + unsigned int cpu; + + cpus_read_lock(); + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!s->cpu_sheaves) + continue; + + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + + /* + * we don't check if rcu_free sheaf exists - racing + * __kfree_rcu_sheaf() might have just removed it. + * by executing flush_rcu_sheaf() on the cpu we make + * sure the __kfree_rcu_sheaf() finished its call_rcu() + */ + + INIT_WORK(&sfw->work, flush_rcu_sheaf); + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); + } + + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + + rcu_barrier(); +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. @@ -3428,19 +4141,15 @@ static int slub_cpu_dead(unsigned int cpu) struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) + list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } mutex_unlock(&slab_mutex); return 0; } -#else /* CONFIG_SLUB_TINY */ -static inline void flush_all_cpus_locked(struct kmem_cache *s) { } -static inline void flush_all(struct kmem_cache *s) { } -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline int slub_cpu_dead(unsigned int cpu) { return 0; } -#endif /* CONFIG_SLUB_TINY */ - /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -3721,6 +4430,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; @@ -3746,9 +4456,21 @@ reread_slab: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. */ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); @@ -3761,13 +4483,14 @@ reread_slab: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -3779,7 +4502,7 @@ reread_slab: if (!freelist) { c->slab = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -3798,34 +4521,34 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: #ifdef CONFIG_SLUB_CPU_PARTIAL while (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } @@ -3834,7 +4557,8 @@ new_slab: slub_set_percpu_partial(c, slab); if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags))) { + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { c->slab = slab; freelist = get_freelist(s, slab); VM_BUG_ON(!freelist); @@ -3842,7 +4566,7 @@ new_slab: goto load_freelist; } - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); slab->next = NULL; __put_partials(s, slab); @@ -3864,8 +4588,13 @@ new_objects: * allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) - pc.flags = GFP_NOWAIT | __GFP_THISNODE; + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } pc.orig_size = orig_size; slab = get_partial(s, node, &pc); @@ -3876,9 +4605,14 @@ new_objects: * For debug caches here we had to go through * alloc_single_from_partial() so just store the * tracking info and return the object. + * + * Due to disabled preemption we need to disallow + * blocking. The flags are further adjusted by + * gfp_nested_mask() in stack_depot itself. */ if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); return freelist; } @@ -3904,13 +4638,14 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size); + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) goto new_objects; if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); return freelist; } @@ -3926,7 +4661,7 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. @@ -3937,7 +4672,7 @@ new_objects: retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -3946,9 +4681,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -3958,6 +4698,19 @@ retry_load_slab: goto load_freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. + */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -3977,8 +4730,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -4102,7 +4866,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, return NULL; } - object = alloc_single_from_new_slab(s, slab, orig_size); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); return object; } @@ -4181,8 +4945,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, init_flags); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); alloc_tagging_slab_alloc_hook(s, p[i], flags); } @@ -4191,6 +4956,251 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, } /* + * Replace the empty main sheaf with a (at least partially) full sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) +{ + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. + */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) +{ + struct slub_percpu_sheaves *pcs; + bool node_requested; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If the local node + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + + node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE; + + /* + * We assume the percpu sheaves contain only local objects although it's + * not completely guaranteed, so we verify later. + */ + if (unlikely(node_requested && node != numa_mem_id())) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[pcs->main->size - 1]; + + if (unlikely(node_requested)) { + /* + * Verify that the object was from the node we want. This could + * be false because of cpu migration during an unlocked part of + * the current allocation or previous freeing process. + */ + if (folio_nid(virt_to_folio(object)) != node) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + } + + pcs->main->size--; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); + + return object; +} + +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + full = barn_replace_empty_sheaf(get_barn(s), pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. + */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. @@ -4214,7 +5224,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (s->cpu_sheaves) + object = alloc_from_pcs(s, gfpflags, node); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -4287,6 +5301,228 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); /* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. + */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + stat(s, SHEAF_PREFILL_SLOW); + sheaf = barn_get_full_or_empty_sheaf(get_barn(s)); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf && sheaf->size < size) { + if (refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + if (sheaf) + sheaf->capacity = s->sheaf_capacity; + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. + * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. + */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? + */ + if (!sheafp || !(*sheafp)) + return -EINVAL; + + sheaf = *sheafp; + if (sheaf->size >= size) + return 0; + + if (likely(sheaf->capacity >= size)) { + if (likely(sheaf->capacity == s->sheaf_capacity)) + return refill_sheaf(s, sheaf, gfp); + + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, + &sheaf->objects[sheaf->size])) { + return -ENOMEM; + } + sheaf->size = sheaf->capacity; + + return 0; + } + + /* + * We had a regular sized sheaf and need an oversize one, or we had an + * oversize one already but need a larger one now. + * This should be a very rare path so let's not complicate it. + */ + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; +} + +/* + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() + * + * Guaranteed not to fail as many allocations as was the requested size. + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. + * + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT + * memcg charging is forced over limit if necessary, to avoid failure. + */ +void * +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *ret = NULL; + bool init; + + if (sheaf->size == 0) + goto out; + + ret = sheaf->objects[--sheaf->size]; + + init = slab_want_init_on_alloc(gfp, s); + + /* add __GFP_NOFAIL to force successful memcg charging */ + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); +out: + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); + + return ret; +} + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} +/* * To avoid unnecessary overhead, we pass through large allocation requests * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. @@ -4378,6 +5614,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc_noprof); +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. + */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ +#ifndef CONFIG_SLUB_TINY + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) +#endif + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) { @@ -4421,8 +5747,12 @@ static noinline void free_to_partial_list( unsigned long flags; depot_stack_handle_t handle = 0; + /* + * We cannot use GFP_NOWAIT as there are callsites where waking up + * kswapd could deadlock + */ if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); + handle = set_track_prepare(__GFP_NOWARN); spin_lock_irqsave(&n->list_lock, flags); @@ -4591,6 +5921,537 @@ slab_empty: discard_slab(s, slab); } +/* + * pcs is locked. We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. + * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. + * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty) +{ + struct node_barn *barn; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + barn = get_barn(s); + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. + */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +static void rcu_free_sheaf(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct node_barn *barn; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + + s = sheaf->cache; + + /* + * This may remove some objects due to slab_free_hook() returning false, + * so that the sheaf might no longer be completely full. But it's easier + * to handle it as full (unless it became completely empty), as the code + * handles it fine. The only downside is that sheaf will serve fewer + * allocations when reused. It only happens due to debugging, which is a + * performance hit anyway. + */ + __rcu_free_sheaf_prepare(s, sheaf); + + barn = get_node(s, sheaf->node)->barn; + + /* due to slab_free_hook() */ + if (unlikely(sheaf->size == 0)) + goto empty; + + /* + * Checking nr_full/nr_empty outside lock avoids contention in case the + * barn is at the respective limit. Due to the race we might go over the + * limit but that should be rare and harmless. + */ + + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { + stat(s, BARN_PUT); + barn_put_full_sheaf(barn, sheaf); + return; + } + + stat(s, BARN_PUT_FAIL); + sheaf_flush_unused(s, sheaf); + +empty: + if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { + barn_put_empty_sheaf(barn, sheaf); + return; + } + + free_empty_sheaf(s, sheaf); +} + +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_sheaf; + + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fail; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(!pcs->rcu_free)) { + + struct slab_sheaf *empty; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size == 0) { + pcs->rcu_free = pcs->spare; + pcs->spare = NULL; + goto do_free; + } + + barn = get_barn(s); + + empty = barn_get_empty_sheaf(barn); + + if (empty) { + pcs->rcu_free = empty; + goto do_free; + } + + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + + if (!empty) + goto fail; + + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + goto fail; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->rcu_free)) + barn_put_empty_sheaf(barn, empty); + else + pcs->rcu_free = empty; + } + +do_free: + + rcu_sheaf = pcs->rcu_free; + + /* + * Since we flush immediately when size reaches capacity, we never reach + * this with size already at capacity, so no OOB write is possible. + */ + rcu_sheaf->objects[rcu_sheaf->size++] = obj; + + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { + rcu_sheaf = NULL; + } else { + pcs->rcu_free = NULL; + rcu_sheaf->node = numa_mem_id(); + } + + /* + * we flush before local_unlock to make sure a racing + * flush_all_rcu_sheaves() doesn't miss this sheaf + */ + if (rcu_sheaf) + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_RCU_SHEAF); + return true; + +fail: + stat(s, FREE_RCU_SHEAF_FAIL); + return false; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. + */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + void *remote_objects[PCS_BATCH_MAX]; + unsigned int remote_nr = 0; + int node = numa_mem_id(); + +next_remote_batch: + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + if (!size) + goto flush_remote; + continue; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { + remote_objects[remote_nr] = p[i]; + p[i] = p[--size]; + if (++remote_nr >= PCS_BATCH_MAX) + goto flush_remote; + continue; + } + + i++; + } + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); + +flush_remote: + if (remote_nr) { + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + if (i < size) { + remote_nr = 0; + goto next_remote_batch; + } + } +} + +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). + */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + *(void **)x = NULL; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + +#ifdef CONFIG_SLUB_TINY + discard_slab(slab->slab_cache, slab); +#else + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); +#endif + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + slab->flush_freelist = flush_freelist; + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -4611,6 +6472,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -4629,10 +6492,29 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail, cnt, addr); + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } return; } + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ + } + if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); @@ -4643,11 +6525,13 @@ redo: goto redo; } } else { + __maybe_unused unsigned long flags = 0; + /* Update the free list under the local lock */ - local_lock(&s->cpu_slab->lock); + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; @@ -4657,7 +6541,7 @@ redo: c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); } stat_add(s, FREE_FASTPATH, cnt); } @@ -4677,8 +6561,16 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, slab, object, object, 1, addr); + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, object))) + return; + } + + do_slab_free(s, slab, object, object, 1, addr); } #ifdef CONFIG_MEMCG @@ -4880,6 +6772,71 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. + */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); +#ifndef CONFIG_SLUB_TINY + do_slab_free(s, slab, x, x, 0, _RET_IP_); +#else + defer_free(s, x); +#endif +} +EXPORT_SYMBOL_GPL(kfree_nolock); + static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) { @@ -5273,6 +7230,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!size) return; + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); + return; + } + do { struct detached_freelist df; @@ -5391,7 +7357,7 @@ error: int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - int i; + unsigned int i = 0; if (!size) return 0; @@ -5400,9 +7366,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p); - if (unlikely(i == 0)) - return 0; + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + return 0; + } + } /* * memcg and kmem_cache debug support and memory initialization. @@ -5412,11 +7389,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s), s->object_size))) { return 0; } - return i; + + return size; } EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -5550,7 +7527,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -5560,6 +7537,9 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } #ifndef CONFIG_SLUB_TINY @@ -5590,6 +7570,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) } #endif /* CONFIG_SLUB_TINY */ +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -5625,7 +7625,7 @@ static void early_kmem_cache_node_alloc(int node) slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; kmem_cache_node->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, NULL); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -5641,6 +7641,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } + s->node[node] = NULL; kmem_cache_free(kmem_cache_node, n); } @@ -5649,7 +7656,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); #ifndef CONFIG_SLUB_TINY +#ifdef CONFIG_PREEMPT_RT + lockdep_unregister_key(&s->lock_key); +#endif free_percpu(s->cpu_slab); #endif free_kmem_cache_nodes(s); @@ -5661,20 +7673,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - free_kmem_cache_nodes(s); + kfree(barn); return 0; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, barn); + s->node[node] = n; } return 1; @@ -5929,8 +7950,15 @@ int __kmem_cache_shutdown(struct kmem_cache *s) struct kmem_cache_node *n; flush_all_cpus_locked(s); + + /* we might have rcu sheaves in flight */ + if (s->cpu_sheaves) + rcu_barrier(); + /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -6134,6 +8162,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); + if (n->barn) + barn_shrink(s, n->barn); + spin_lock_irqsave(&n->list_lock, flags); /* @@ -6213,12 +8244,24 @@ static int slab_mem_going_online_callback(int nid) */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + /* * The structure may already exist if the node was previously * onlined and offlined. */ if (get_node(s, nid)) continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -6226,10 +8269,13 @@ static int slab_mem_going_online_callback(int nid) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } /* @@ -6442,6 +8488,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? + s->sheaf_capacity = args->sheaf_capacity; + } + #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -6458,6 +8515,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; + } + err = 0; /* Mutex is not taken during early boot */ @@ -6499,6 +8562,11 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, void *p; void *addr = slab_address(slab); + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return; + } + if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) return; @@ -6910,6 +8978,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(order); +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); +} +SLAB_ATTR_RO(sheaf_capacity); + static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%lu\n", s->min_partial); @@ -7257,8 +9331,12 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -7283,6 +9361,19 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); +STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); +STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); +STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); +STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); +STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); +STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -7313,6 +9404,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, &objects_partial_attr.attr, @@ -7344,8 +9436,12 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, + &free_rcu_sheaf_attr.attr, + &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -7370,6 +9466,19 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, + &barn_put_fail_attr.attr, + &sheaf_prefill_fast_attr.attr, + &sheaf_prefill_slow_attr.attr, + &sheaf_prefill_oversize_attr.attr, + &sheaf_return_fast_attr.attr, + &sheaf_return_slow_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -7711,15 +9820,12 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) return NULL; } -static int cmp_loc_by_count(const void *a, const void *b, const void *data) +static int cmp_loc_by_count(const void *a, const void *b) { struct location *loc1 = (struct location *)a; struct location *loc2 = (struct location *)b; - if (loc1->count > loc2->count) - return -1; - else - return 1; + return cmp_int(loc2->count, loc1->count); } static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) @@ -7781,8 +9887,8 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) } /* Sort locations by count */ - sort_r(t->loc, t->count, sizeof(struct location), - cmp_loc_by_count, NULL, NULL); + sort(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL); bitmap_free(obj_map); return 0; |