diff options
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- | kernel/sched/ext.c | 1562 |
1 files changed, 314 insertions, 1248 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd..2b0e88206d07 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,1040 +9,6 @@ #include <linux/btf_ids.h> #include "ext_idle.h" -#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. - */ - SCX_TASK_ITER_BATCH = 32, -}; - -enum scx_exit_kind { - SCX_EXIT_NONE, - SCX_EXIT_DONE, - - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ - - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -}; - -/* - * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), - * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes - * are 64bit of the format: - * - * Bits: [63 .. 48 47 .. 32 31 .. 0] - * [ SYS ACT ] [ SYS RSN ] [ USR ] - * - * SYS ACT: System-defined exit actions - * SYS RSN: System-defined exit reasons - * USR : User-defined exit codes and reasons - * - * Using the above, users may communicate intention and context by ORing system - * actions and/or system reasons with a user-defined exit code. - */ -enum scx_exit_code { - /* Reasons */ - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, - - /* Actions */ - SCX_ECODE_ACT_RESTART = 1LLU << 48, -}; - -/* - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is - * being disabled. - */ -struct scx_exit_info { - /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_kind kind; - - /* exit code if gracefully exiting */ - s64 exit_code; - - /* textual representation of the above */ - const char *reason; - - /* backtrace if exiting due to an error */ - unsigned long *bt; - u32 bt_len; - - /* informational message */ - char *msg; - - /* debug dump */ - char *dump; -}; - -/* sched_ext_ops.flags */ -enum scx_ops_flags { - /* - * Keep built-in idle tracking even if ops.update_idle() is implemented. - */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, - - /* - * By default, if there are no other task to run on the CPU, ext core - * keeps running the current task even after its slice expires. If this - * flag is specified, such tasks are passed to ops.enqueue() with - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. - */ - SCX_OPS_ENQ_LAST = 1LLU << 1, - - /* - * An exiting task may schedule after PF_EXITING is set. In such cases, - * bpf_task_from_pid() may not be able to find the task and if the BPF - * scheduler depends on pid lookup for dispatching, the task will be - * lost leading to various issues including RCU grace period stalls. - * - * To mask this problem, by default, unhashed tasks are automatically - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't - * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. - */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, - - /* - * If set, only tasks with policy set to SCHED_EXT are attached to - * sched_ext. If clear, SCHED_NORMAL tasks are also included. - */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, - - /* - * A migration disabled task can only execute on its current CPU. By - * default, such tasks are automatically put on the CPU's local DSQ with - * the default slice on enqueue. If this ops flag is set, they also go - * through ops.enqueue(). - * - * A migration disabled task never invokes ops.select_cpu() as it can - * only select the current CPU. Also, p->cpus_ptr will only contain its - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr - * and thus may disagree with cpumask_weight(p->cpus_ptr). - */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, - - /* - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes - * ops.enqueue() on the ops.select_cpu() selected or the wakee's - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline - * transfers. When this optimization is enabled, ops.select_cpu() is - * skipped in some cases (when racing against the wakee switching out). - * As the BPF scheduler may depend on ops.select_cpu() being invoked - * during wakeups, queued wakeup is disabled by default. - * - * If this ops flag is set, queued wakeup optimization is enabled and - * the BPF scheduler must be able to handle ops.enqueue() invoked on the - * wakee's CPU without preceding ops.select_cpu() even for tasks which - * may be executed on multiple CPUs. - */ - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, - - /* - * If set, enable per-node idle cpumasks. If clear, use a single global - * flat idle cpumask. - */ - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, - - /* - * CPU cgroup support flags - */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, - - /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ - __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, - - SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, -}; - -/* argument container for ops.init_task() */ -struct scx_init_task_args { - /* - * Set if ops.init_task() is being invoked on the fork path, as opposed - * to the scheduler transition path. - */ - bool fork; -#ifdef CONFIG_EXT_GROUP_SCHED - /* the cgroup the task is joining */ - struct cgroup *cgroup; -#endif -}; - -/* argument container for ops.exit_task() */ -struct scx_exit_task_args { - /* Whether the task exited before running on sched_ext. */ - bool cancelled; -}; - -/* argument container for ops->cgroup_init() */ -struct scx_cgroup_init_args { - /* the weight of the cgroup [1..10000] */ - u32 weight; - - /* bandwidth control parameters from cpu.max and cpu.max.burst */ - u64 bw_period_us; - u64 bw_quota_us; - u64 bw_burst_us; -}; - -enum scx_cpu_preempt_reason { - /* next task is being scheduled by &sched_class_rt */ - SCX_CPU_PREEMPT_RT, - /* next task is being scheduled by &sched_class_dl */ - SCX_CPU_PREEMPT_DL, - /* next task is being scheduled by &sched_class_stop */ - SCX_CPU_PREEMPT_STOP, - /* unknown reason for SCX being preempted */ - SCX_CPU_PREEMPT_UNKNOWN, -}; - -/* - * Argument container for ops->cpu_acquire(). Currently empty, but may be - * expanded in the future. - */ -struct scx_cpu_acquire_args {}; - -/* argument container for ops->cpu_release() */ -struct scx_cpu_release_args { - /* the reason the CPU was preempted */ - enum scx_cpu_preempt_reason reason; - - /* the task that's going to be scheduled on the CPU */ - struct task_struct *task; -}; - -/* - * Informational context provided to dump operations. - */ -struct scx_dump_ctx { - enum scx_exit_kind kind; - s64 exit_code; - const char *reason; - u64 at_ns; - u64 at_jiffies; -}; - -/** - * struct sched_ext_ops - Operation table for BPF scheduler implementation - * - * A BPF scheduler can implement an arbitrary scheduling policy by - * implementing and loading operations in this table. Note that a userland - * scheduling policy can also be implemented using the BPF scheduler - * as a shim layer. - */ -struct sched_ext_ops { - /** - * @select_cpu: Pick the target CPU for a task which is being woken up - * @p: task being woken up - * @prev_cpu: the cpu @p was on before sleeping - * @wake_flags: SCX_WAKE_* - * - * Decision made here isn't final. @p may be moved to any CPU while it - * is getting dispatched for execution later. However, as @p is not on - * the rq at this point, getting the eventual execution CPU right here - * saves a small bit of overhead down the line. - * - * If an idle CPU is returned, the CPU is kicked and will try to - * dispatch. While an explicit custom mechanism can be added, - * select_cpu() serves as the default way to wake up idle CPUs. - * - * @p may be inserted into a DSQ directly by calling - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ - * of the CPU returned by this operation. - * - * Note that select_cpu() is never called for tasks that can only run - * on a single CPU or tasks with migration disabled, as they don't have - * the option to select a different CPU. See select_task_rq() for - * details. - */ - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); - - /** - * @enqueue: Enqueue a task on the BPF scheduler - * @p: task being enqueued - * @enq_flags: %SCX_ENQ_* - * - * @p is ready to run. Insert directly into a DSQ by calling - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, - * the task will stall. - * - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is - * skipped. - */ - void (*enqueue)(struct task_struct *p, u64 enq_flags); - - /** - * @dequeue: Remove a task from the BPF scheduler - * @p: task being dequeued - * @deq_flags: %SCX_DEQ_* - * - * Remove @p from the BPF scheduler. This is usually called to isolate - * the task while updating its scheduling properties (e.g. priority). - * - * The ext core keeps track of whether the BPF side owns a given task or - * not and can gracefully ignore spurious dispatches from BPF side, - * which makes it safe to not implement this method. However, depending - * on the scheduling logic, this can lead to confusing behaviors - e.g. - * scheduling position not being updated across a priority change. - */ - void (*dequeue)(struct task_struct *p, u64 deq_flags); - - /** - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs - * @cpu: CPU to dispatch tasks for - * @prev: previous task being switched out - * - * Called when a CPU's local dsq is empty. The operation should dispatch - * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ - * using scx_bpf_dsq_move_to_local(). - * - * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_dsq_move_to_local() is specified by - * ops.dispatch_max_batch. See the comments on top of the two functions - * for more details. - * - * When not %NULL, @prev is an SCX task with its slice depleted. If - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in - * @prev->scx.flags, it is not enqueued yet and will be enqueued after - * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. - */ - void (*dispatch)(s32 cpu, struct task_struct *prev); - - /** - * @tick: Periodic tick - * @p: task running currently - * - * This operation is called every 1/HZ seconds on CPUs which are - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an - * immediate dispatch cycle on the CPU. - */ - void (*tick)(struct task_struct *p); - - /** - * @runnable: A task is becoming runnable on its associated CPU - * @p: task becoming runnable - * @enq_flags: %SCX_ENQ_* - * - * This and the following three functions can be used to track a task's - * execution state transitions. A task becomes ->runnable() on a CPU, - * and then goes through one or more ->running() and ->stopping() pairs - * as it runs on the CPU, and eventually becomes ->quiescent() when it's - * done running on the CPU. - * - * @p is becoming runnable on the CPU because it's - * - * - waking up (%SCX_ENQ_WAKEUP) - * - being moved from another CPU - * - being restored after temporarily taken off the queue for an - * attribute change. - * - * This and ->enqueue() are related but not coupled. This operation - * notifies @p's state transition and may not be followed by ->enqueue() - * e.g. when @p is being dispatched to a remote CPU, or when @p is - * being enqueued on a CPU experiencing a hotplug event. Likewise, a - * task may be ->enqueue()'d without being preceded by this operation - * e.g. after exhausting its slice. - */ - void (*runnable)(struct task_struct *p, u64 enq_flags); - - /** - * @running: A task is starting to run on its associated CPU - * @p: task starting to run - * - * Note that this callback may be called from a CPU other than the - * one the task is going to run on. This can happen when a task - * property is changed (i.e., affinity), since scx_next_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to determine the - * target CPU the task is going to use. - * - * See ->runnable() for explanation on the task state notifiers. - */ - void (*running)(struct task_struct *p); - - /** - * @stopping: A task is stopping execution - * @p: task stopping to run - * @runnable: is task @p still runnable? - * - * Note that this callback may be called from a CPU other than the - * one the task was running on. This can happen when a task - * property is changed (i.e., affinity), since dequeue_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU - * the task was running on. - * - * See ->runnable() for explanation on the task state notifiers. If - * !@runnable, ->quiescent() will be invoked after this operation - * returns. - */ - void (*stopping)(struct task_struct *p, bool runnable); - - /** - * @quiescent: A task is becoming not runnable on its associated CPU - * @p: task becoming not runnable - * @deq_flags: %SCX_DEQ_* - * - * See ->runnable() for explanation on the task state notifiers. - * - * @p is becoming quiescent on the CPU because it's - * - * - sleeping (%SCX_DEQ_SLEEP) - * - being moved to another CPU - * - being temporarily taken off the queue for an attribute change - * (%SCX_DEQ_SAVE) - * - * This and ->dequeue() are related but not coupled. This operation - * notifies @p's state transition and may not be preceded by ->dequeue() - * e.g. when @p is being dispatched to a remote CPU. - */ - void (*quiescent)(struct task_struct *p, u64 deq_flags); - - /** - * @yield: Yield CPU - * @from: yielding task - * @to: optional yield target task - * - * If @to is NULL, @from is yielding the CPU to other runnable tasks. - * The BPF scheduler should ensure that other available tasks are - * dispatched before the yielding task. Return value is ignored in this - * case. - * - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf - * scheduler can implement the request, return %true; otherwise, %false. - */ - bool (*yield)(struct task_struct *from, struct task_struct *to); - - /** - * @core_sched_before: Task ordering for core-sched - * @a: task A - * @b: task B - * - * Used by core-sched to determine the ordering between two tasks. See - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on - * core-sched. - * - * Both @a and @b are runnable and may or may not currently be queued on - * the BPF scheduler. Should return %true if @a should run before @b. - * %false if there's no required ordering or @b should run before @a. - * - * If not specified, the default is ordering them according to when they - * became runnable. - */ - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); - - /** - * @set_weight: Set task weight - * @p: task to set weight for - * @weight: new weight [1..10000] - * - * Update @p's weight to @weight. - */ - void (*set_weight)(struct task_struct *p, u32 weight); - - /** - * @set_cpumask: Set CPU affinity - * @p: task to set CPU affinity for - * @cpumask: cpumask of cpus that @p can run on - * - * Update @p's CPU affinity to @cpumask. - */ - void (*set_cpumask)(struct task_struct *p, - const struct cpumask *cpumask); - - /** - * @update_idle: Update the idle state of a CPU - * @cpu: CPU to update the idle state for - * @idle: whether entering or exiting the idle state - * - * This operation is called when @rq's CPU goes or leaves the idle - * state. By default, implementing this operation disables the built-in - * idle CPU tracking and the following helpers become unavailable: - * - * - scx_bpf_select_cpu_dfl() - * - scx_bpf_select_cpu_and() - * - scx_bpf_test_and_clear_cpu_idle() - * - scx_bpf_pick_idle_cpu() - * - * The user also must implement ops.select_cpu() as the default - * implementation relies on scx_bpf_select_cpu_dfl(). - * - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle - * tracking. - */ - void (*update_idle)(s32 cpu, bool idle); - - /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** - * @init_task: Initialize a task to run in a BPF scheduler - * @p: task to initialize for BPF scheduling - * @args: init arguments, see the struct definition - * - * Either we're loading a BPF scheduler or a new task is being forked. - * Initialize @p for BPF scheduling. This operation may block and can - * be used for allocations, and is called exactly once for a task. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, it - * will abort that specific fork. - */ - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); - - /** - * @exit_task: Exit a previously-running task from the system - * @p: task to exit - * @args: exit arguments, see the struct definition - * - * @p is exiting or the BPF scheduler is being unloaded. Perform any - * necessary cleanup for @p. - */ - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); - - /** - * @enable: Enable BPF scheduling for a task - * @p: task to enable BPF scheduling for - * - * Enable @p for BPF scheduling. enable() is called on @p any time it - * enters SCX, and is always paired with a matching disable(). - */ - void (*enable)(struct task_struct *p); - - /** - * @disable: Disable BPF scheduling for a task - * @p: task to disable BPF scheduling for - * - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. A disable() call is always matched - * with a prior enable() call. - */ - void (*disable)(struct task_struct *p); - - /** - * @dump: Dump BPF scheduler state on error - * @ctx: debug dump context - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. - */ - void (*dump)(struct scx_dump_ctx *ctx); - - /** - * @dump_cpu: Dump BPF scheduler state for a CPU on error - * @ctx: debug dump context - * @cpu: CPU to generate debug dump for - * @idle: @cpu is currently idle without any runnable tasks - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @cpu. If @idle is %true and this operation doesn't produce any - * output, @cpu is skipped for dump. - */ - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); - - /** - * @dump_task: Dump BPF scheduler state for a runnable task on error - * @ctx: debug dump context - * @p: runnable task to generate debug dump for - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @p. - */ - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); - -#ifdef CONFIG_EXT_GROUP_SCHED - /** - * @cgroup_init: Initialize a cgroup - * @cgrp: cgroup being initialized - * @args: init arguments, see the struct definition - * - * Either the BPF scheduler is being loaded or @cgrp created, initialize - * @cgrp for sched_ext. This operation may block. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During cgroup - * creation, it will abort the specific cgroup creation. - */ - s32 (*cgroup_init)(struct cgroup *cgrp, - struct scx_cgroup_init_args *args); - - /** - * @cgroup_exit: Exit a cgroup - * @cgrp: cgroup being exited - * - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit - * @cgrp for sched_ext. This operation my block. - */ - void (*cgroup_exit)(struct cgroup *cgrp); - - /** - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Prepare @p for move from cgroup @from to @to. This operation may - * block and can be used for allocations. - * - * Return 0 for success, -errno for failure. An error return aborts the - * migration. - */ - s32 (*cgroup_prep_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_move: Commit cgroup move - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Commit the move. @p is dequeued during this operation. - */ - void (*cgroup_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_cancel_move: Cancel cgroup move - * @p: task whose cgroup move is being canceled - * @from: cgroup @p was being moved from - * @to: cgroup @p was being moved to - * - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). - * Undo the preparation. - */ - void (*cgroup_cancel_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_set_weight: A cgroup's weight is being changed - * @cgrp: cgroup whose weight is being updated - * @weight: new weight [1..10000] - * - * Update @cgrp's weight to @weight. - */ - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); - - /** - * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed - * @cgrp: cgroup whose bandwidth is being updated - * @period_us: bandwidth control period - * @quota_us: bandwidth control quota - * @burst_us: bandwidth control burst - * - * Update @cgrp's bandwidth control parameters. This is from the cpu.max - * cgroup interface. - * - * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled - * to. For example, if @period_us is 1_000_000 and @quota_us is - * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be - * interpreted in the same fashion and specifies how much @cgrp can - * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF - * scheduler. - */ - void (*cgroup_set_bandwidth)(struct cgroup *cgrp, - u64 period_us, u64 quota_us, u64 burst_us); - -#endif /* CONFIG_EXT_GROUP_SCHED */ - - /* - * All online ops must come before ops.cpu_online(). - */ - - /** - * @cpu_online: A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * @cpu_offline: A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - - /* - * All CPU hotplug ops must come before ops.init(). - */ - - /** - * @init: Initialize the BPF scheduler - */ - s32 (*init)(void); - - /** - * @exit: Clean up after the BPF scheduler - * @info: Exit info - * - * ops.exit() is also called on ops.init() failure, which is a bit - * unusual. This is to allow rich reporting through @info on how - * ops.init() failed. - */ - void (*exit)(struct scx_exit_info *info); - - /** - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch - */ - u32 dispatch_max_batch; - - /** - * @flags: %SCX_OPS_* flags - */ - u64 flags; - - /** - * @timeout_ms: The maximum amount of time, in milliseconds, that a - * runnable task should be able to wait before being scheduled. The - * maximum timeout may not exceed the default timeout of 30 seconds. - * - * Defaults to the maximum allowed timeout value of 30 seconds. - */ - u32 timeout_ms; - - /** - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default - * value of 32768 is used. - */ - u32 exit_dump_len; - - /** - * @hotplug_seq: A sequence number that may be set by the scheduler to - * detect when a hotplug event has occurred during the loading process. - * If 0, no detection occurs. Otherwise, the scheduler will fail to - * load if the sequence number does not match @scx_hotplug_seq on the - * enable path. - */ - u64 hotplug_seq; - - /** - * @name: BPF scheduler's name - * - * Must be a non-zero valid BPF object name including only isalnum(), - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the - * BPF scheduler is enabled. - */ - char name[SCX_OPS_NAME_LEN]; - - /* internal use only, must be NULL */ - void *priv; -}; - -enum scx_opi { - SCX_OPI_BEGIN = 0, - SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), - SCX_OPI_END = SCX_OP_IDX(init), -}; - -/* - * Collection of event counters. Event types are placed in descending order. - */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. - */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * Total number of times a task's time slice was refilled with the - * default value (SCX_SLICE_DFL). - */ - s64 SCX_EV_REFILL_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -struct scx_sched { - struct sched_ext_ops ops; - DECLARE_BITMAP(has_op, SCX_OPI_END); - - /* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. - * This is to avoid live-locking in bypass mode where all tasks are - * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If - * per-node split isn't sufficient, it can be further split. - */ - struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; - - /* - * The event counters are in a per-CPU variable to minimize the - * accounting overhead. A system-wide view on the event counter is - * constructed when requested by scx_bpf_events(). - */ - struct scx_event_stats __percpu *event_stats_cpu; - - bool warned_zero_slice; - - atomic_t exit_kind; - struct scx_exit_info *exit_info; - - struct kobject kobj; - - struct kthread_worker *helper; - struct irq_work error_irq_work; - struct kthread_work disable_work; - struct rcu_work rcu_work; -}; - -enum scx_wake_flags { - /* expose select WF_* flags as enums */ - SCX_WAKE_FORK = WF_FORK, - SCX_WAKE_TTWU = WF_TTWU, - SCX_WAKE_SYNC = WF_SYNC, -}; - -enum scx_enq_flags { - /* expose select ENQUEUE_* flags as enums */ - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, - SCX_ENQ_HEAD = ENQUEUE_HEAD, - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, - - /* high 32bits are SCX specific */ - - /* - * Set the following to trigger preemption when calling - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the - * current task is cleared to zero and the CPU is kicked into the - * scheduling path. Implies %SCX_ENQ_HEAD. - */ - SCX_ENQ_PREEMPT = 1LLU << 32, - - /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. - */ - SCX_ENQ_REENQ = 1LLU << 40, - - /* - * The task being enqueued is the only task available for the cpu. By - * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the - * %SCX_ENQ_LAST flag set. - * - * The BPF scheduler is responsible for triggering a follow-up - * scheduling event. Otherwise, Execution may stall. - */ - SCX_ENQ_LAST = 1LLU << 41, - - /* high 8 bits are internal */ - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, - - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -}; - -enum scx_deq_flags { - /* expose select DEQUEUE_* flags as enums */ - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, - - /* high 32bits are SCX specific */ - - /* - * The generic core-sched layer decided to execute the task even though - * it hasn't been dispatched yet. Dequeue from the BPF side. - */ - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -}; - -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ -}; - -enum scx_kick_flags { - /* - * Kick the target CPU if idle. Guarantees that the target CPU goes - * through at least one full scheduling cycle before going idle. If the - * target CPU can be determined to be currently not idle and going to go - * through a scheduling cycle before going idle, noop. - */ - SCX_KICK_IDLE = 1LLU << 0, - - /* - * Preempt the current task and execute the dispatch path. If the - * current task of the target CPU is an SCX task, its ->scx.slice is - * cleared to zero before the scheduling path is invoked so that the - * task expires and the dispatch path is invoked. - */ - SCX_KICK_PREEMPT = 1LLU << 1, - - /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. - */ - SCX_KICK_WAIT = 1LLU << 2, -}; - -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, -}; - -enum scx_enable_state { - SCX_ENABLING, - SCX_ENABLED, - SCX_DISABLING, - SCX_DISABLED, -}; - -static const char *scx_enable_state_str[] = { - [SCX_ENABLING] = "enabling", - [SCX_ENABLED] = "enabled", - [SCX_DISABLING] = "disabling", - [SCX_DISABLED] = "disabled", -}; - -/* - * sched_ext_entity->ops_state - * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: - * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ - * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. - * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. - */ -enum scx_ops_state { - SCX_OPSS_NONE, /* owned by the SCX core */ - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ - - /* - * QSEQ brands each QUEUED instance so that, when dispatch races - * dequeue/requeue, the dispatcher can tell whether it still has a claim - * on the task being dispatched. - * - * As some 32bit archs can't do 64bit store_release/load_acquire, - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on - * 32bit machines. The dispatch race window QSEQ protects is very narrow - * and runs with IRQ disabled. 30 bits should be sufficient. - */ - SCX_OPSS_QSEQ_SHIFT = 2, -}; - -/* Use macros to ensure that the type is unsigned long for the masks */ -#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) - /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -1170,7 +136,7 @@ static struct kset *scx_kset; #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); -static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch, va_end(args); } -static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, - const char *fmt, ...) -{ - struct scx_sched *sch; - va_list args; - - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); - va_end(args); - } - rcu_read_unlock(); -} - #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } -static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } @@ -1363,11 +311,11 @@ do { \ }) /* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(u32 mask) +static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_kf_error("cpu_release kfunc called from a nested operation"); + scx_error(sch, "cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_kf_error("dispatch kfunc called from a nested operation"); + scx_error(sch, "dispatch kfunc called from a nested operation"); return false; } @@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask) } /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, +static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, + u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(mask)) + if (!scx_kf_allowed(sch, mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_kf_error("called on a task not being operated on"); + scx_error(sch, "called on a task not being operated on"); return false; } @@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq { */ struct scx_task_iter { struct sched_ext_entity cursor; - struct task_struct *locked; + struct task_struct *locked_task; struct rq *rq; struct rq_flags rf; u32 cnt; + bool list_locked; }; /** @@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter) iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked = NULL; + iter->locked_task = NULL; iter->cnt = 0; + iter->list_locked = true; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; + if (iter->locked_task) { + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); + iter->locked_task = NULL; } } @@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited in addition to scx_tasks_lock. Unlock both. - * This function can be safely called anytime during an iteration. + * This function can be safely called anytime during an iteration. The next + * iterator operation will automatically restore the necessary locking. */ static void scx_task_iter_unlock(struct scx_task_iter *iter) { __scx_task_iter_rq_unlock(iter); - spin_unlock_irq(&scx_tasks_lock); + if (iter->list_locked) { + iter->list_locked = false; + spin_unlock_irq(&scx_tasks_lock); + } } -/** - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() - * @iter: iterator to re-lock - * - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it - * doesn't re-lock the rq lock. Must be called before other iterator operations. - */ -static void scx_task_iter_relock(struct scx_task_iter *iter) +static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { - spin_lock_irq(&scx_tasks_lock); + if (!iter->list_locked) { + spin_lock_irq(&scx_tasks_lock); + iter->list_locked = true; + } } /** @@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { + __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } @@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; + __scx_task_iter_maybe_relock(iter); + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - scx_task_iter_relock(iter); + __scx_task_iter_maybe_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { @@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return NULL; iter->rq = task_rq_lock(p, &iter->rf); - iter->locked = p; + iter->locked_task = p; return p; } @@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This can be used when preemption is not disabled. */ #define scx_add_event(sch, name, cnt) do { \ - this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, (cnt)); \ } while(0) @@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This should be used only when preemption is disabled. */ #define __scx_add_event(sch, name, cnt) do { \ - __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) } /** - * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args - * @cpu: cpu number which came from a BPF ops - * @where: extra information reported on error - * - * The same as ops_cpu_valid() but @sch is implicit. - */ -static bool kf_cpu_valid(u32 cpu, const char *where) -{ - if (__cpu_valid(cpu)) { - return true; - } else { - scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); - return false; - } -} - -/** * ops_sanitize_err - Sanitize a -errno value * @sch: scx_sched to error out on error * @ops_name: operation to blame on failure @@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void refill_task_slice_dfl(struct task_struct *p) +static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, @@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); raw_spin_lock(&dsq->lock); } } @@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(p); + return find_global_dsq(sch, p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return find_global_dsq(p); + return find_global_dsq(sch, p); } return dsq; } -static void mark_direct_dispatch(struct task_struct *ddsp_task, +static void mark_direct_dispatch(struct scx_sched *sch, + struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id, u64 enq_flags) { @@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_kf_error("%s[%d] already direct-dispatched", + scx_error(sch, "%s[%d] already direct-dispatched", p->comm, p->pid); else - scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ddsp_task->comm, ddsp_task->pid, p->comm, p->pid); return; @@ -2333,15 +1271,15 @@ local: * higher priority it becomes from scx_prio_less()'s POV. */ touch_core_sched(rq, p); - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); local_norefill: dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(p); - dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(sch, p); + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(scx_root, - SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(p); + dst_dsq = find_global_dsq(sch, p); dst_rq = src_rq; } } else { @@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(p), p, + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_bpf_kick_cpu() for deferred kicking. + * scx_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_bpf_kick_cpu(cpu_of(rq), 0); + scx_kick_cpu(sch, cpu_of(rq), 0); break; } } while (dspc->nr_tasks); @@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(p); + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); if (!p) { if (kick_idle) - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + scx_kick_cpu(rcu_dereference_sched(scx_root), + cpu_of(rq), SCX_KICK_IDLE); return NULL; } if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = rcu_dereference_sched(scx_root); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; } - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); } } @@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } else { cpu = prev_cpu; @@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED -DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) @@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg) WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = @@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg) tg->scx.flags |= SCX_TG_ONLINE; } - percpu_up_read(&scx_cgroup_rwsem); return ret; } @@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg) WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); - - percpu_up_read(&scx_cgroup_rwsem); } int scx_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *p; int ret; - /* released in scx_finish/cancel_attach() */ - percpu_down_read(&scx_cgroup_rwsem); - if (!scx_cgroup_enabled) return 0; @@ -4192,7 +3120,6 @@ err: p->scx.cgrp_moving_from = NULL; } - percpu_up_read(&scx_cgroup_rwsem); return ops_sanitize_err(sch, "cgroup_prep_move", ret); } @@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p) p->scx.cgrp_moving_from = NULL; } -void scx_cgroup_finish_attach(void) -{ - percpu_up_read(&scx_cgroup_rwsem); -} - void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { struct scx_sched *sch = scx_root; @@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct task_struct *p; if (!scx_cgroup_enabled) - goto out_unlock; + return; cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && @@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } -out_unlock: - percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_weight(struct task_group *tg, unsigned long weight) { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) tg->scx.weight = weight; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) @@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg, { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg, tg->scx.bw_quota_us = quota_us; tg->scx.bw_burst_us = burst_us; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } static void scx_cgroup_lock(void) { - percpu_down_write(&scx_cgroup_rwsem); + percpu_down_write(&scx_cgroup_ops_rwsem); + cgroup_lock(); } static void scx_cgroup_unlock(void) { - percpu_up_write(&scx_cgroup_rwsem); + cgroup_unlock(); + percpu_up_write(&scx_cgroup_ops_rwsem); } #else /* CONFIG_EXT_GROUP_SCHED */ -static inline void scx_cgroup_lock(void) {} -static inline void scx_cgroup_unlock(void) {} +static void scx_cgroup_lock(void) {} +static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - scx_cgroup_enabled = false; /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and exit all the inited ones, all online cgroups are exited. */ - rcu_read_lock(); css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); @@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); } static int scx_cgroup_init(struct scx_sched *sch) @@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch) struct cgroup_subsys_state *css; int ret; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and init, all online cgroups are initialized. */ - rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { @@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { @@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch) return ret; } tg->scx.flags |= SCX_TG_INITED; - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); WARN_ON_ONCE(scx_cgroup_enabled); scx_cgroup_enabled = true; @@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; kthread_stop(sch->helper->task); - free_percpu(sch->event_stats_cpu); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -4671,9 +3571,22 @@ bool task_should_scx(int policy) bool scx_allow_ttwu_queue(const struct task_struct *p) { - return !scx_enabled() || - (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || - p->sched_class != &ext_sched_class; + struct scx_sched *sch; + + if (!scx_enabled()) + return true; + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return true; + + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + return true; + + if (unlikely(p->sched_class != &ext_sched_class)) + return true; + + return false; } /** @@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void) * * - pick_next_task() suppresses zero slice warning. * - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order. @@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", p->scx.dsq_vtime, p->scx.slice, p->scx.weight); - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), + p->migration_disabled); if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); @@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->global_dsqs[node] = dsq; } - sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); - if (!sch->event_stats_cpu) + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + if (!sch->pcpu) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); if (!sch->helper) - goto err_free_event_stats; + goto err_free_pcpu; sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) err_stop_helper: kthread_stop(sch->helper->task); -err_free_event_stats: - free_percpu(sch->event_stats_cpu); +err_free_pcpu: + free_percpu(sch->pcpu); err_free_gdsqs: for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) @@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); @@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - scx_task_iter_relock(&sti); } scx_task_iter_stop(&sti); scx_cgroup_unlock(); @@ -5795,7 +4708,7 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_bypass(false); + /* we'll soon enter disable path, keep bypass on */ err_disable: mutex_unlock(&scx_enable_mutex); /* @@ -6328,40 +5241,41 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ -static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_kf_error("called with NULL task"); + scx_error(sch, "called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_kf_error("invalid enq_flags 0x%llx", enq_flags); + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); return false; } return true; } -static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); return; } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_kf_error("dispatch buffer overflow"); + scx_error(sch, "dispatch buffer overflow"); return; } @@ -6413,7 +5327,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6421,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice else p->scx.slice = p->scx.slice ?: 1; - scx_dsq_insert_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); } /** @@ -6448,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6458,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } __bpf_kfunc_end_defs(); @@ -6483,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed_if_unlocked() && + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; /* @@ -6568,7 +5497,15 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) { - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return 0; return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); @@ -6583,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) __bpf_kfunc void scx_bpf_dispatch_cancel(void) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_sched *sch; + + guard(rcu)(); - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return; if (dspc->cursor > 0) dspc->cursor--; else - scx_kf_error("dispatch buffer underflow"); + scx_error(sch, "dispatch buffer underflow"); } /** @@ -6609,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { - struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; + struct scx_sched *sch; + + guard(rcu)(); - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; flush_dispatch_buf(sch, dspc->rq); @@ -6760,12 +5710,18 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) { + struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; struct rq *rq; struct task_struct *p, *n; - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) return 0; rq = cpu_rq(smp_processor_id()); @@ -6788,12 +5744,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); @@ -6881,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); -/** - * scx_bpf_kick_cpu - Trigger reschedule on a CPU - * @cpu: cpu to kick - * @flags: %SCX_KICK_* flags - * - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or - * trigger rescheduling on a busy CPU. This can be called from any online - * scx_ops operation and the actual kicking is performed asynchronously through - * an irq work. - */ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; unsigned long irq_flags; - if (!kf_cpu_valid(cpu, NULL)) + if (!ops_cpu_valid(sch, cpu, NULL)) return; local_irq_save(irq_flags); @@ -6920,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6945,6 +5887,26 @@ out: } /** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_kick_cpu(sch, cpu, flags); +} + +/** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ * @@ -7124,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __bpf_kfunc_end_defs(); -static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, - char *fmt, unsigned long long *data, u32 data__sz) +static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, + size_t line_size, char *fmt, unsigned long long *data, + u32 data__sz) { struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; s32 ret; if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_kf_error("failed to read data fields (%d)", ret); + scx_error(sch, "failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_kf_error("format preparation failed (%d)", ret); + scx_error(sch, "format preparation failed (%d)", ret); return ret; } @@ -7153,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } return ret; } -static s32 bstr_format(struct scx_bstr_buf *buf, +static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { - return __bstr_format(buf->data, buf->line, sizeof(buf->line), + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), fmt, data, data__sz); } @@ -7182,11 +6145,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7202,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7225,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; struct scx_bstr_buf *buf = &dd->buf; s32 ret; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (raw_smp_processor_id() != dd->cpu) { - scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); return; } /* append the formatted string to the line buf */ - ret = __bstr_format(buf->data, buf->line + dd->cursor, + ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, sizeof(buf->line) - dd->cursor, fmt, data, data__sz); if (ret < 0) { dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", @@ -7271,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, */ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) { - if (kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7293,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) */ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) { - if (kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7315,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) */ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(sch); + if (unlikely(!sch)) + return; + if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); return; } - if (kf_cpu_valid(cpu, NULL)) { + if (ops_cpu_valid(sch, cpu, NULL)) { struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); struct rq_flags rf; @@ -7329,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) * to the corresponding CPU to prevent ABBA deadlocks. */ if (locked_rq && rq != locked_rq) { - scx_kf_error("Invalid target CPU %d", cpu); + scx_error(sch, "Invalid target CPU %d", cpu); return; } @@ -7424,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return NULL; + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + if (!sch->warned_deprecated_rq) { + printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " + "use scx_bpf_locked_rq() when holding rq lock " + "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); + sch->warned_deprecated_rq = true; + } + return cpu_rq(cpu); } /** + * scx_bpf_locked_rq - Return the rq currently locked by SCX + * + * Returns the rq if a rq lock is currently held by SCX. + * Otherwise emits an error and returns NULL. + */ +__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(preempt)(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return NULL; + + rq = scx_locked_rq(); + if (!rq) { + scx_error(sch, "accessing rq without holding rq lock"); + return NULL; + } + + return rq; +} + +/** + * scx_bpf_cpu_curr - Return remote CPU's curr task + * @cpu: CPU of interest + * + * Callers must hold RCU read lock (KF_RCU). + */ +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + return rcu_dereference(cpu_rq(cpu)->curr); +} + +/** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest * @@ -7446,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out; - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) goto out; cgrp = tg_cgrp(tg); @@ -7528,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event /* Aggregate per-CPU event counters into @events. */ memset(events, 0, sizeof(*events)); for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); @@ -7594,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif |