From 8823eaef45da7f156a1396f40d53b985c511edef Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Thu, 5 Mar 2026 08:15:40 -0800
Subject: workqueue: Show all busy workers in stall diagnostics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

show_cpu_pool_hog() only prints workers whose task is currently running
on the CPU (task_is_running()). This misses workers that are busy
processing a work item but are sleeping or blocked — for example, a
worker that clears PF_WQ_WORKER and enters wait_event_idle(). Such a
worker still occupies a pool slot and prevents progress, yet produces
an empty backtrace section in the watchdog output.

This is happening on real arm64 systems, where
toggle_allocation_gate() IPIs every single CPU in the machine (which
lacks NMI), causing workqueue stalls that show empty backtraces because
toggle_allocation_gate() is sleeping in wait_event_idle().

Remove the task_is_running() filter so every in-flight worker in the
pool's busy_hash is dumped. The busy_hash is protected by pool->lock,
which is already held.

Signed-off-by: Breno Leitao
Acked-by: Song Liu
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 56d8af13843f..09b9ad78d566 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7583,9 +7583,9 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds
 
 /*
  * Show workers that might prevent the processing of pending work items.
- * The only candidates are CPU-bound workers in the running state.
- * Pending work items should be handled by another idle worker
- * in all other situations.
+ * A busy worker that is not running on the CPU (e.g. sleeping in
+ * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as
+ * effectively as a CPU-bound one, so dump every in-flight worker.
  */
 static void show_cpu_pool_hog(struct worker_pool *pool)
 {
@@ -7596,19 +7596,17 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
 	raw_spin_lock_irqsave(&pool->lock, irq_flags);
 
 	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
-		if (task_is_running(worker->task)) {
-			/*
-			 * Defer printing to avoid deadlocks in console
-			 * drivers that queue work while holding locks
-			 * also taken in their write paths.
-			 */
-			printk_deferred_enter();
+		/*
+		 * Defer printing to avoid deadlocks in console
+		 * drivers that queue work while holding locks
+		 * also taken in their write paths.
+		 */
+		printk_deferred_enter();
 
-			pr_info("pool %d:\n", pool->id);
-			sched_show_task(worker->task);
+		pr_info("pool %d:\n", pool->id);
+		sched_show_task(worker->task);
 
-			printk_deferred_exit();
-		}
+		printk_deferred_exit();
 	}
 
 	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
@@ -7619,7 +7617,7 @@ static void show_cpu_pools_hogs(void)
 	struct worker_pool *pool;
 	int pi;
 
-	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+	pr_info("Showing backtraces of busy workers in stalled CPU-bound worker pools:\n");
 
 	rcu_read_lock();
 
-- 
cgit v1.2.3