/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Arena-based task data scheduler. This is a variation of scx_simple
 * that uses a combined allocator and indexing structure to organize
 * task data. Task context allocation is done when a task enters the
 * scheduler, while freeing is done when it exits. Task contexts are
 * retrieved from task-local storage, pointing to the allocated memory.
 *
 * The main purpose of this scheduler is to demonstrate arena memory
 * management.
 *
 * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024-2025 Emil Tsalapatis
 * Copyright (c) 2024-2025 Tejun Heo
 *
 */
#include	/* NOTE(review): header name lost in extraction -- presumably <scx/common.bpf.h>; confirm against upstream */
#include	/* NOTE(review): header name lost in extraction -- confirm against upstream */
#include "scx_sdt.h"

char _license[] SEC("license") = "GPL";

/* Exit info reported back to userspace; recorded in sdt_exit(). */
UEI_DEFINE(uei);

/*
 * Arena map backing all allocator memory in this file. The mmap() base
 * address and maximum page count differ per architecture; arm64 gets a
 * smaller region at a lower base.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
	__uint(max_entries, 1 << 16);		/* number of pages */
	__ulong(map_extra, (1ull << 32));	/* start of mmap() region */
#else
	__uint(max_entries, 1 << 20);		/* number of pages */
	__ulong(map_extra, (1ull << 44));	/* start of mmap() region */
#endif
} arena __weak SEC(".maps");

/* ID of the single shared DSQ that all tasks are enqueued on. */
#define SHARED_DSQ 0

/*
 * For each stat "metric", define:
 *   - stat_inc_##metric(): bump the per-task counter living in arena memory;
 *   - stat_##metric: a global aggregate, folded in by scx_stat_global_update().
 */
#define DEFINE_SDT_STAT(metric)						\
static inline void							\
stat_inc_##metric(struct scx_stats __arena *stats)			\
{									\
	cast_kern(stats);						\
	stats->metric += 1;						\
}									\
__u64 stat_##metric;							\

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);

/*
 * Necessary for cond_break/can_loop's semantics. According to kernel commit
 * 011832b, the loop counter variable must be seen as imprecise and bounded
 * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
 * makes it precise and prevents may_goto from helping with converging the
 * loop. For these loops we must initialize the loop counter from a variable
 * whose value the verifier cannot reason about when checking the program, so
 * that the loop counter's value is imprecise.
 */
static __u64 zero = 0;

/*
 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
 * As of 6.12-rc5, The verifier associates arenas with programs by
 * checking LD.IMM instruction operands for an arena and populating
 * the program state with the first instance it finds. This requires
 * accessing our global arena variable, but scx methods do not necessarily
 * do so while still using pointers from that arena. Insert a bpf_printk
 * statement that triggers at most once to generate an LD.IMM instruction
 * to access the arena and help the verifier.
 */
static volatile bool scx_arena_verify_once;

/* Touch the global arena once so the verifier binds it to this program. */
__hidden void scx_arena_subprog_init(void)
{
	if (scx_arena_verify_once)
		return;

	bpf_printk("%s: arena pointer %p", __func__, &arena);
	scx_arena_verify_once = true;
}

/* Serializes all allocator index/pool mutations below. */
private(LOCK) struct bpf_spin_lock alloc_lock;
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* allocation pools */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;

/*
 * Allocate an element from the pool. Must be called with the pool's lock
 * held. Carves fixed-size elements out of an arena slab; grabs a fresh
 * slab from the arena when the current one is exhausted.
 */
static void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
{
	__u64 elem_size, max_elems;
	void __arena *slab;
	void __arena *ptr;

	elem_size = pool->elem_size;
	max_elems = pool->max_elems;

	/* If the chunk is spent, get a new one. */
	if (pool->idx >= max_elems) {
		slab = bpf_arena_alloc_pages(&arena, NULL,
			div_round_up(max_elems * elem_size, PAGE_SIZE),
			NUMA_NO_NODE, 0);
		if (!slab)
			return NULL;

		pool->slab = slab;
		pool->idx = 0;
	}

	/* Hand out the next free element by index into the slab. */
	ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
	pool->idx += 1;

	return ptr;
}

/*
 * Alloc desc and associated chunk. Called with the allocator spinlock
 * held.
 */
static sdt_desc_t *scx_alloc_chunk(void)
{
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	sdt_desc_t *out;

	chunk = scx_alloc_from_pool(&chunk_pool);
	if (!chunk)
		return NULL;

	desc = scx_alloc_from_pool(&desc_pool);
	if (!desc) {
		/*
		 * Effectively frees the previous chunk allocation.
		 * Index cannot be 0, so decrementing is always
		 * valid.
		 */
		chunk_pool.idx -= 1;
		return NULL;
	}

	out = desc;

	/* Fresh chunk: every entry is free. */
	desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
	desc->chunk = chunk;

	alloc_stats.chunk_allocs += 1;

	return out;
}

/*
 * Configure @pool to carve (PAGE_SIZE * nr_pages)-sized slabs into
 * data_size-byte elements. data_size must be a nonzero multiple of 8.
 */
static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
	if (unlikely(data_size % 8))
		return -EINVAL;

	if (unlikely(nr_pages == 0))
		return -EINVAL;

	pool->elem_size = data_size;
	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
	/* Populate the pool slab on the first allocation. */
	pool->idx = pool->max_elems;

	return 0;
}

/* Initialize both the base pool allocators and the root chunk of the index. */
__hidden int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
	size_t min_chunk_size;
	int ret;

	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
		"chunk size must fit into a page");

	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
	if (ret != 0)
		return ret;

	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
	if (ret != 0)
		return ret;

	/* Wrap data into a descriptor and word align. */
	data_size += sizeof(struct sdt_data);
	data_size = round_up(data_size, 8);

	/*
	 * Ensure we allocate large enough chunks from the arena to avoid
	 * excessive internal fragmentation when turning chunks into structs.
	 */
	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size,
		PAGE_SIZE);
	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
	if (ret != 0)
		return ret;

	bpf_spin_lock(&alloc_lock);
	alloc->root = scx_alloc_chunk();
	bpf_spin_unlock(&alloc_lock);
	if (!alloc->root)
		return -ENOMEM;

	return 0;
}

/*
 * Set (state == true) or clear (state == false) the allocation bit for
 * slot @pos in @desc's bitmap. Returns -EINVAL when @pos is out of range.
 */
static int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
{
	__u64 __arena *allocated = desc->allocated;
	__u64 bit;

	if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
		return -EINVAL;

	bit = (__u64)1 << (pos % 64);

	if (state)
		allocated[pos / 64] |= bit;
	else
		allocated[pos / 64] &= ~bit;

	return 0;
}

/*
 * Walk the recorded path bottom-up (leaf to root), clearing each level's
 * allocation bit and bumping its free count. Propagation stops at the
 * first ancestor that already had another free slot (nr_free > 1).
 */
static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
	sdt_desc_t *desc;
	__u64 u, level;
	int ret;

	/* Counter seeded from 'zero' so the verifier treats it as imprecise. */
	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
		level = SDT_TASK_LEVELS - 1 - u;

		/* Only propagate upwards if we are the parent's only free chunk. */
		desc = lv_desc[level];
		ret = set_idx_state(desc, lv_pos[level], false);
		if (unlikely(ret != 0))
			return ret;

		desc->nr_free += 1;
		if (desc->nr_free > 1)
			return 0;
	}

	return 0;
}

/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken.
 */
__hidden int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	struct sdt_data __arena *data;
	__u64 level, shift, pos;
	__u64 lv_pos[SDT_TASK_LEVELS];
	int ret;
	int i;

	if (!alloc)
		return 0;

	desc = alloc->root;
	if (unlikely(!desc))
		return -EINVAL;

	/* To appease the verifier. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		lv_desc[level] = NULL;
		lv_pos[level] = 0;
	}

	/* Find the leaf node containing the index. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		/* Each level consumes SDT_TASK_ENTS_PER_PAGE_SHIFT bits of idx. */
		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
		pos = (idx >> shift) & mask;

		/* Record the path so mark_nodes_avail() can walk it back up. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		if (level == SDT_TASK_LEVELS - 1)
			break;

		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];

		if (unlikely(!desc))
			return -EINVAL;
	}

	chunk = desc->chunk;

	pos = idx & mask;
	data = chunk->data[pos];
	if (likely(data)) {
		/* Bump the slot's generation (tid.genn), then scrub the payload. */
		*data = (struct sdt_data) {
			.tid.genn = data->tid.genn + 1,
		};

		/* Zero out one word at a time. */
		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
			data->payload[i] = 0;
		}
	}

	ret = mark_nodes_avail(lv_desc, lv_pos);
	if (unlikely(ret != 0))
		return ret;

	alloc_stats.active_allocs -= 1;
	alloc_stats.free_ops += 1;

	return 0;
}

/*
 * Position of the least significant set bit in @word, as a 0-based index.
 * NOTE(review): despite the name, this is 0-based, unlike POSIX ffs();
 * callers here only pass nonzero words (see chunk_find_empty()).
 */
static inline int ffs(__u64 word)
{
	unsigned int num = 0;

	/* Binary search over halves of the word. */
	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}

	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}

	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}

	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}

	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}

	if ((word & 0x1) == 0) {
		num += 1;
		word >>= 1;
	}

	return num;
}

/* find the first empty slot */
__hidden __u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
{
	__u64 freeslots;
	__u64 i;

	for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
		/* Invert so free (0) bits become set bits we can scan for. */
		freeslots = ~desc->allocated[i];
		if (freeslots == (__u64)0)
			continue;

		return (i * 64) + ffs(freeslots);
	}

	/* Chunk is completely full. */
	return SDT_TASK_ENTS_PER_CHUNK;
}

/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		/* Accumulate the per-level slot into the flat index. */
		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;
		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;

		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		if (ret != 0)
			break;

		tmp->nr_free -= 1;
		/* Stop propagating once an ancestor still has free slots. */
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}

/*
 * Allocate one element, returning a pointer to its arena-resident
 * sdt_data. Takes and releases alloc_lock internally; returns NULL when
 * the index tree or the backing pool is exhausted.
 */
__hidden void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			/* Roll back the index reservation made above. */
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}

/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id		tid;
	__u64			tptr;
	struct sdt_data __arena	*data;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

/* Single allocator instance backing all per-task contexts. */
static struct scx_allocator scx_task_allocator;

/*
 * Allocate per-task arena data for @p, record it in @p's task-local
 * storage, and return a pointer to the payload area. Returns NULL on
 * storage or arena allocation failure.
 */
__hidden void __arena *scx_task_alloc(struct task_struct *p)
{
	struct sdt_data __arena *data = NULL;
	struct scx_task_map_val *mval;

	mval = bpf_task_storage_get(&scx_task_map, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!mval)
		return NULL;

	data = scx_alloc(&scx_task_allocator);
	if (unlikely(!data))
		return NULL;

	mval->tid = data->tid;
	mval->tptr = (__u64) p;
	mval->data = data;

	return (void __arena *)data->payload;
}

/* Initialize the global task allocator for data_size-byte payloads. */
__hidden int scx_task_init(__u64 data_size)
{
	return scx_alloc_init(&scx_task_allocator, data_size);
}

/*
 * Look up @p's previously allocated payload via task-local storage.
 * NOTE(review): mval->data is used without a NULL check; tasks that never
 * passed through scx_task_alloc() rely on the storage lookup failing first.
 */
__hidden void __arena *scx_task_data(struct task_struct *p)
{
	struct sdt_data __arena *data;
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return NULL;

	data = mval->data;

	return (void __arena *)data->payload;
}

/* Release @p's arena allocation and drop its task-local storage entry. */
__hidden void scx_task_free(struct task_struct *p)
{
	struct scx_task_map_val *mval;

	scx_arena_subprog_init();

	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
	if (!mval)
		return;

	bpf_spin_lock(&alloc_lock);
	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
	bpf_spin_unlock(&alloc_lock);

	bpf_task_storage_delete(&scx_task_map, p);
}

/* Fold a task's per-task counters into the global stat_* aggregates. */
static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}

s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu,
		   u64 wake_flags)
{
	struct scx_stats __arena *stats;
	bool is_idle = false;
	s32 cpu;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return 0;
	}

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		/* Idle CPU found: dispatch directly, skipping the enqueue path. */
		stat_inc_select_idle_cpu(stats);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	} else {
		stat_inc_select_busy_cpu(stats);
	}

	return cpu;
}

void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}
	stat_inc_enqueue(stats);

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	/* Pull the next task from the shared queue onto this CPU. */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
			     struct scx_init_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_alloc(p);
	if (!stats) {
		scx_bpf_error("arena allocator out of memory");
		return -ENOMEM;
	}

	stats->pid = p->pid;
	stat_inc_init(stats);

	return 0;
}

void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
		    struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}
	stat_inc_exit(stats);

	/* Preserve the counters globally before the task data is freed. */
	scx_stat_global_update(stats);

	scx_task_free(p);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
	int ret;

	ret = scx_task_init(sizeof(struct scx_stats));
	if (ret < 0) {
		scx_bpf_error("%s: failed with %d", __func__, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
		return ret;
	}

	return 0;
}

void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(sdt_ops,
	       .select_cpu	= (void *)sdt_select_cpu,
	       .enqueue		= (void *)sdt_enqueue,
	       .dispatch	= (void *)sdt_dispatch,
	       .init_task	= (void *)sdt_init_task,
	       .exit_task	= (void *)sdt_exit_task,
	       .init		= (void *)sdt_init,
	       .exit		= (void *)sdt_exit,
	       .name		= "sdt");