Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--   kernel/sched/ext.c   126
1 file changed, 111 insertions, 15 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e88206d07..ecb251e883ea 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -67,8 +67,19 @@
 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
 static struct delayed_work scx_watchdog_work;
 
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
+ * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
+ * lazily when enabling and freed when disabling to avoid waste when sched_ext
+ * isn't active.
+ */
+struct scx_kick_pseqs {
+	struct rcu_head rcu;
+	unsigned long seqs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
 
 /*
  * Direct dispatch marker.
@@ -780,13 +791,23 @@ static void schedule_deferred(struct rq *rq)
 	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
 		return;
 
+	/* Don't do anything if there already is a deferred operation. */
+	if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+		return;
+
 	/*
 	 * If in balance, the balance callbacks will be called before rq lock is
 	 * released. Schedule one.
+	 *
+	 *
+	 * We can't directly insert the callback into the
+	 * rq's list: The call can drop its lock and make the pending balance
+	 * callback visible to unrelated code paths that call rq_pin_lock().
+	 *
+	 * Just let balance_one() know that it must do it itself.
 	 */
 	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
-		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
-				       deferred_bal_cb_workfn);
+		rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
 		return;
 	}
 
@@ -2003,6 +2024,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
 	dspc->cursor = 0;
 }
 
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+	if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+		return;
+
+	queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+			       deferred_bal_cb_workfn);
+
+	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
 	struct scx_sched *sch = scx_root;
@@ -2150,6 +2184,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
 #endif
 	rq_repin_lock(rq, rf);
 
+	maybe_queue_balance_callback(rq);
+
 	return ret;
 }
 
@@ -3471,7 +3507,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_dispatch_q *dsq;
 	int node;
 
+	irq_work_sync(&sch->error_irq_work);
 	kthread_stop(sch->helper->task);
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -3850,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 	}
 }
 
+static void free_kick_pseqs_rcu(struct rcu_head *rcu)
+{
+	struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
+
+	kvfree(pseqs);
+}
+
+static void free_kick_pseqs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *to_free;
+
+		to_free = rcu_replace_pointer(*pseqs, NULL, true);
+		if (to_free)
+			call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+	}
+}
+
 static void scx_disable_workfn(struct kthread_work *work)
 {
 	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
@@ -3986,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 	free_percpu(scx_dsp_ctx);
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
+	free_kick_pseqs();
 
 	mutex_unlock(&scx_enable_mutex);
 
@@ -4348,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch,
 	irq_work_queue(&sch->error_irq_work);
 }
 
+static int alloc_kick_pseqs(void)
+{
+	int cpu;
+
+	/*
+	 * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+	 * can exceed percpu allocator limits on large machines.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+		struct scx_kick_pseqs *new_pseqs;
+
+		WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+
+		new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+					  GFP_KERNEL, cpu_to_node(cpu));
+		if (!new_pseqs) {
+			free_kick_pseqs();
+			return -ENOMEM;
+		}
+
+		rcu_assign_pointer(*pseqs, new_pseqs);
+	}
+
+	return 0;
+}
+
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 {
 	struct scx_sched *sch;
@@ -4495,10 +4582,14 @@
 		goto err_unlock;
 	}
 
+	ret = alloc_kick_pseqs();
+	if (ret)
+		goto err_unlock;
+
 	sch = scx_alloc_and_add_sched(ops);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
-		goto err_unlock;
+		goto err_free_pseqs;
 	}
 
 	/*
@@ -4701,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	return 0;
 
+err_free_pseqs:
+	free_kick_pseqs();
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
 }
@@ -5082,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 {
 	struct rq *this_rq = this_rq();
 	struct scx_rq *this_scx = &this_rq->scx;
-	unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+	struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
 	bool should_wait = false;
+	unsigned long *pseqs;
 	s32 cpu;
 
+	if (unlikely(!pseqs_pcpu)) {
+		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+		return;
+	}
+
+	pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
 		should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
@@ -5208,11 +5309,6 @@ void __init init_sched_ext_class(void)
 
 	scx_idle_init_masks();
 
-	scx_kick_cpus_pnt_seqs =
-		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
-			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
-	BUG_ON(!scx_kick_cpus_pnt_seqs);
-
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
 		int n = cpu_to_node(cpu);
@@ -5688,8 +5784,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
@@ -5820,8 +5916,8 @@ __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
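The two BTF hunks above tag scx_bpf_dsq_move_set_slice() and scx_bpf_dsq_move_set_vtime() with KF_RCU, the flag already carried by scx_bpf_dsq_move() and scx_bpf_dsq_move_vtime() that they are normally paired with during DSQ iteration. The sketch below shows that call pattern from a BPF scheduler's dispatch path; it is illustrative only and is not part of the patch. It assumes the helper macros from tools/sched_ext/include/scx/common.bpf.h (BPF_STRUCT_OPS, bpf_for_each, BPF_FOR_EACH_ITER) and a scheduler-defined SHARED_DSQ.

/* Illustrative sketch, not from the patch; assumes scx common.bpf.h helpers. */
void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;

	/* Tasks yielded by the DSQ iterator are RCU-protected pointers. */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		/* Override the slice the task will get once moved ... */
		scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, 5000000 /* 5ms */);
		/* ... then move it to the local DSQ and stop after one task. */
		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0))
			break;
	}
}

With the KF_RCU tagging, the verifier-side requirements on the two setters presumably line up with those of the move kfuncs they accompany in loops like the one above.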

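For reference, the pointer lifecycle that the scx_kick_pseqs hunks implement (lazily kvzalloc()'d per-CPU buffers published and retired under RCU) boils down to the generic pattern below. This is a condensed, hypothetical sketch with made-up names (my_buf, my_bufs_alloc, my_bufs_free), not code from the patch.

#include <linux/container_of.h>
#include <linux/cpumask.h>
#include <linux/overflow.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/topology.h>

struct my_buf {
	struct rcu_head rcu;
	unsigned long data[];
};

static DEFINE_PER_CPU(struct my_buf __rcu *, my_bufs);

static void my_buf_free_rcu(struct rcu_head *rcu)
{
	kvfree(container_of(rcu, struct my_buf, rcu));
}

/* Retire: swap in NULL, free each old buffer after a grace period. */
static void my_bufs_free(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct my_buf *old;

		old = rcu_replace_pointer(*per_cpu_ptr(&my_bufs, cpu), NULL, true);
		if (old)
			call_rcu(&old->rcu, my_buf_free_rcu);
	}
}

/* Allocate with kvzalloc_node() (size may exceed percpu allocator limits) and publish. */
static int my_bufs_alloc(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct my_buf *buf;

		buf = kvzalloc_node(struct_size(buf, data, nr_cpu_ids),
				    GFP_KERNEL, cpu_to_node(cpu));
		if (!buf) {
			my_bufs_free();
			return -ENOMEM;
		}
		rcu_assign_pointer(*per_cpu_ptr(&my_bufs, cpu), buf);
	}
	return 0;
}

Readers then fetch the per-CPU pointer and dereference it inside an RCU-protected section, which is what the kick_cpus_irq_workfn() hunk does with __this_cpu_read() followed by rcu_dereference_bh().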