From 255019537cfd63d6adc16a55bcbfd79530d5937e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 29 Dec 2025 11:16:15 -0800
Subject: rcu: Make expedited RCU CPU stall warnings detect stall-end races

If an expedited RCU CPU stall ends just at the stall-warning timeout,
the current code will print an expedited stall-warning message, but one
that doesn't identify any CPUs or tasks causing the stall. This is most
likely to happen for short-timeout stalls, for example, the
20-millisecond timeouts that are sometimes used for small embedded
devices. Needless to say, these semi-empty stall-warning messages can
be rather confusing.

One option would be to suppress the stall-warning message entirely in
this case, but the near-miss information can be quite valuable. Instead,
detect this race condition and emit an "INFO: Expedited stall ended
before state dump start" message to clarify matters.

[boqun: Apply feedback from Borislav]

Reported-by: Borislav Petkov
Signed-off-by: Paul E. McKenney
Acked-by: Borislav Petkov (AMD)
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree_exp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 96c49c56fc14..82cada459e5d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -589,7 +589,12 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
 	pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
 		j - jiffies_start, rcu_state.expedited_sequence,
 		data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]);
-	if (ndetected) {
+	if (!ndetected) {
+		// This is invoked from the grace-period worker, so
+		// a new grace period cannot have started. And if this
+		// worker were stalled, we would not get here. ;-)
+		pr_err("INFO: Expedited stall ended before state dump start\n");
+	} else {
 		pr_err("blocking rcu_node structures (internal RCU debug):");
 		rcu_for_each_node_breadth_first(rnp) {
 			if (rnp == rnp_root)
-- cgit v1.2.3

From 37d9b475077b5096d41ebdc416a9019bd4fcfdb9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 29 Dec 2025 11:16:16 -0800
Subject: rcutorture: Correctly compute probability to invoke ->exp_current()

Lack of parentheses causes the ->exp_current() function, for example,
srcu_expedite_current(), to be called only once in four billion times
instead of the intended once in 256 times. This commit therefore adds
the needed parentheses.

Reported-by: Chris Mason
Reported-by: Joel Fernandes
Fixes: 950063c6e897 ("rcutorture: Test srcu_expedite_current()")
Signed-off-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 07e51974b06b..83934402a287 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1750,7 +1750,7 @@ rcu_torture_writer(void *arg)
 				ulo[i] = cur_ops->get_comp_state();
 			gp_snap = cur_ops->start_gp_poll();
 			rcu_torture_writer_state = RTWS_POLL_WAIT;
-			if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+			if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
 				cur_ops->exp_current();
 			while (!cur_ops->poll_gp_state(gp_snap)) {
 				gp_snap1 = cur_ops->get_gp_state();
@@ -1772,7 +1772,7 @@ rcu_torture_writer(void *arg)
 				cur_ops->get_comp_state_full(&rgo[i]);
 			cur_ops->start_gp_poll_full(&gp_snap_full);
 			rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
-			if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+			if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
 				cur_ops->exp_current();
 			while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
 				cur_ops->get_gp_state_full(&gp_snap1_full);
-- cgit v1.2.3
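As a quick illustration of the precedence problem described above, here is
a standalone userspace sketch (not part of the patch): because logical NOT
binds more tightly than '%', "!x % 0xff" parses as "(!x) % 0xff" and is
nonzero only when x == 0, roughly once per 2^32 draws, whereas the intended
"!(x & 0xff)" is nonzero about once per 256 draws.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long buggy_hits = 0, fixed_hits = 0;
	const unsigned long n = 10UL * 1000 * 1000;

	srandom(42);
	for (unsigned long i = 0; i < n; i++) {
		unsigned int x = (unsigned int)random();

		if (!x % 0xff)		/* parses as (!x) % 0xff: fires only when x == 0 */
			buggy_hits++;
		if (!(x & 0xff))	/* intended test: low 8 bits all zero, ~1/256 */
			fixed_hits++;
	}
	/* Expect buggy_hits to be (almost always) 0 and fixed_hits to be ~n/256. */
	printf("buggy: %lu hits, fixed: %lu hits out of %lu draws\n",
	       buggy_hits, fixed_hits, n);
	return 0;
}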
From d41e37f26b3157b3f1d10223863519a943aa239b Mon Sep 17 00:00:00 2001
From: Yao Kai
Date: Thu, 1 Jan 2026 11:34:10 -0500
Subject: rcu: Fix rcu_read_unlock() deadloop due to softirq

Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
__rcu_read_unlock()") removed the recursion-protection code from
__rcu_read_unlock(). As a result, with ftrace enabled, the call to
raise_softirq_irqoff() in rcu_read_unlock_special() can recurse
without bound, as in the following splat:

WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
Modules linked in: my_irq_work(O)
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
Tainted: [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 __is_insn_slot_addr+0x54/0x70
 kernel_text_address+0x48/0xc0
 __kernel_text_address+0xd/0x40
 unwind_get_return_address+0x1e/0x40
 arch_stack_walk+0x9c/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 __raise_softirq_irqoff+0x61/0x80
 __flush_smp_call_function_queue+0x115/0x420
 __sysvec_call_function_single+0x17/0xb0
 sysvec_call_function_single+0x8c/0xc0

Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ
work") fixed the infinite loop in rcu_read_unlock_special() for IRQ work
by setting a flag before calling irq_work_queue_on(). Fix this issue by
setting the same flag before calling raise_softirq_irqoff(), and rename
the flag to defer_qs_pending to reflect that it now covers both the
IRQ-work and softirq cases.

Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
Reported-by: Tengda Wu
Signed-off-by: Yao Kai
Reviewed-by: Joel Fernandes
Tested-by: Paul E. McKenney
Signed-off-by: Joel Fernandes
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..2265b9c2906e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -203,7 +203,7 @@ struct rcu_data {
 					/* during and after the last grace */
 					/* period it is aware of. */
 	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
-	int defer_qs_iw_pending;	/* Scheduler attention pending? */
+	int defer_qs_pending;		/* irqwork or softirq pending? */
 	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
 
 	/* 2) batch handling */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dbe2d02be824..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	union rcu_special special;
 
 	rdp = this_cpu_ptr(&rcu_data);
-	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
-		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+	if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+		rdp->defer_qs_pending = DEFER_QS_IDLE;
 
 	/*
 	 * If RCU core is waiting for this CPU to exit its critical section,
@@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 	 * 5. Deferred QS reporting does not happen.
 	 */
 	if (rcu_preempt_depth() > 0)
-		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+		WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
 }
 
 /*
@@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
-			raise_softirq_irqoff(RCU_SOFTIRQ);
+			if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
+				raise_softirq_irqoff(RCU_SOFTIRQ);
+			}
 		} else {
 			// Enabling BH or preempt does reschedule, so...
 			// Also if no expediting and no possible deboosting,
@@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// tick enabled.
 			set_need_resched_current();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
 		}
-- cgit v1.2.3
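The fix applies the same guard pattern that commit b41642c87716 used for
IRQ work: mark the deferred-QS request as pending before calling the
function that tracing can turn into a re-entrant call. A minimal
standalone userspace sketch of that pattern, using hypothetical names
rather than the kernel's, is shown below; the re-entrant call returns
immediately because it sees the flag already set.

#include <stdio.h>

enum { DEFER_IDLE, DEFER_PENDING };

static int defer_pending = DEFER_IDLE;

static void request_deferred_qs(void);

/* Stand-in for raise_softirq_irqoff(): with ftrace enabled, its tracepoint
 * can re-enter request_deferred_qs(); simulate that directly. */
static void raise_deferred_work(void)
{
	request_deferred_qs();
}

static void request_deferred_qs(void)
{
	if (defer_pending == DEFER_PENDING)
		return;				/* already requested: recursion stops here */
	defer_pending = DEFER_PENDING;		/* set the flag first ... */
	raise_deferred_work();			/* ... then make the possibly re-entrant call */
}

int main(void)
{
	request_deferred_qs();
	printf("returned without looping (pending=%d)\n", defer_pending);
	return 0;
}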
From cee2557ae3b19e0cdfa09695a4d6ba420cc1fd41 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Thu, 1 Jan 2026 11:34:11 -0500
Subject: srcu: Use suitable gfp_flags for the init_srcu_struct_nodes()

When init_srcu_struct*() is used to initialize an srcu_struct, the
structure's ->srcu_sup and ->sda are allocated with GFP_KERNEL.
Similarly, when SRCU_SIZING_INIT is in effect, the srcu_sup's ->node
array can also be allocated with GFP_KERNEL in that case; there is no
need to use GFP_ATOMIC unconditionally. Only the static-initialization
path still requires GFP_ATOMIC.

Signed-off-by: Zqiang
Reviewed-by: Joel Fernandes
Tested-by: Paul E. McKenney
Signed-off-by: Joel Fernandes
Signed-off-by: Boqun Feng
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ea3f128de06f..c469c708fdd6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -262,7 +262,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
 	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
-		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+		if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL))
 			goto err_free_sda;
 		WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 	}
-- cgit v1.2.3
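The change follows the usual convention of letting the caller's context
choose the allocation flags instead of hard-coding GFP_ATOMIC. A
hypothetical helper showing that convention (illustrative only; the SRCU
code keys the decision off its is_static argument rather than a
may_sleep flag) might look like:

#include <linux/slab.h>
#include <linux/gfp.h>

struct demo_node {
	int value;
};

/* Callers that may sleep pass may_sleep = true and get GFP_KERNEL;
 * only callers that cannot sleep fall back to GFP_ATOMIC. */
static struct demo_node *demo_alloc_node(bool may_sleep)
{
	gfp_t gfp = may_sleep ? GFP_KERNEL : GFP_ATOMIC;

	return kzalloc(sizeof(struct demo_node), gfp);
}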
From bc3705e20988778791a4a5e9e2700fbc22cc942d Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 1 Jan 2026 11:34:15 -0500
Subject: rcu: Reduce synchronize_rcu() latency by reporting GP kthread's CPU QS early

The RCU grace period mechanism uses a two-phase FQS (Force Quiescent
State) design where the first FQS saves dyntick-idle snapshots and the
second FQS compares them. This results in long and unnecessary latency
for synchronize_rcu() on idle systems (two FQS waits of ~3ms each with
1000HZ) in cases where one FQS wait would have sufficed.

Investigation showed that the GP kthread's CPU is often the holdout CPU
after the first FQS, because it cannot be detected as "idle" while it is
actively running the FQS scan in the GP kthread.

Therefore, at the end of rcu_gp_init(), immediately report a quiescent
state for the GP kthread's CPU using rcu_qs() + rcu_report_qs_rdp().
The GP kthread cannot be in an RCU read-side critical section while
running GP initialization, so this is safe and results in significant
latency improvements.
The following tests were performed:

(1) synchronize_rcu() benchmarking

100 synchronize_rcu() calls with 32 CPUs, 10 runs each (default fqs
jiffies settings):

Baseline (without fix):

| Run | Mean      | Min      | Max       |
|-----|-----------|----------|-----------|
| 1   | 10.088 ms | 9.989 ms | 18.848 ms |
| 2   | 10.064 ms | 9.982 ms | 16.470 ms |
| 3   | 10.051 ms | 9.988 ms | 15.113 ms |
| 4   | 10.125 ms | 9.929 ms | 22.411 ms |
| 5   |  8.695 ms | 5.996 ms | 15.471 ms |
| 6   | 10.157 ms | 9.977 ms | 25.723 ms |
| 7   | 10.102 ms | 9.990 ms | 20.224 ms |
| 8   |  8.050 ms | 5.985 ms | 10.007 ms |
| 9   | 10.059 ms | 9.978 ms | 15.934 ms |
| 10  | 10.077 ms | 9.984 ms | 17.703 ms |

With fix:

| Run | Mean     | Min      | Max       |
|-----|----------|----------|-----------|
| 1   | 6.027 ms | 5.915 ms |  8.589 ms |
| 2   | 6.032 ms | 5.984 ms |  9.241 ms |
| 3   | 6.010 ms | 5.986 ms |  7.004 ms |
| 4   | 6.076 ms | 5.993 ms | 10.001 ms |
| 5   | 6.084 ms | 5.893 ms | 10.250 ms |
| 6   | 6.034 ms | 5.908 ms |  9.456 ms |
| 7   | 6.051 ms | 5.993 ms | 10.000 ms |
| 8   | 6.057 ms | 5.941 ms | 10.001 ms |
| 9   | 6.016 ms | 5.927 ms |  7.540 ms |
| 10  | 6.036 ms | 5.993 ms |  9.579 ms |

Summary:
- Mean latency: 9.75 ms -> 6.04 ms (38% improvement)
- Max latency: 25.72 ms -> 10.25 ms (60% improvement)

(2) Bridge setup/teardown latency (Uladzislau Rezki)

x86_64 with 64 CPUs, 100 iterations of bridge add/configure/delete:

                                  real time
  1 - default:                    24.221s
  2 - this patch:                 20.754s (14% faster)
  3 - this patch + wake_from_gp:  15.895s (34% faster)
  4 - wake_from_gp only:          18.947s (22% faster)

Per-synchronize_rcu() latency (in usec):

               1         2        3       4
  median:   37249.5   31540.5   15765   22480
  min:       7881      7918      9803    7857
  max:      63651     55639     31861   32040

This patch combined with rcu_normal_wake_from_gp reduces bridge
setup/teardown time from 24 seconds to 16 seconds.

(3) CPU overhead verification (Uladzislau Rezki)

System CPU time across 5 runs showed no measurable increase:

  default:    1.698s - 1.937s
  this patch: 1.667s - 1.930s

Conclusion: variations are within noise, no CPU overhead regression.

(4) rcutorture

Tested TREE and SRCU configurations - no regressions.

Reviewed-by: "Paul E. McKenney"
Tested-by: Uladzislau Rezki (Sony)
Tested-by: Paul E. McKenney
Tested-by: Samir M
Signed-off-by: Joel Fernandes
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 293bbd9ac3f4..2c1c9759e278 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -160,6 +160,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 			      unsigned long gps, unsigned long flags);
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
+static void rcu_report_qs_rdp(struct rcu_data *rdp);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
 static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -1983,6 +1984,17 @@ static noinline_for_stack bool rcu_gp_init(void)
 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
 		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
 
+	/*
+	 * Immediately report QS for the GP kthread's CPU. The GP kthread
+	 * cannot be in an RCU read-side critical section while running
+	 * the FQS scan. This eliminates the need for a second FQS wait
+	 * when all CPUs are idle.
+	 */
+	preempt_disable();
+	rcu_qs();
+	rcu_report_qs_rdp(this_cpu_ptr(&rcu_data));
+	preempt_enable();
+
 	return true;
 }
 
-- cgit v1.2.3
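For reference, the synchronize_rcu() numbers above come from timing
repeated back-to-back calls. A minimal sketch of that kind of measurement
as a throwaway kernel module (hypothetical; not the harness that produced
the tables above) could be:

#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/rcupdate.h>

static int __init sync_rcu_bench_init(void)
{
	u64 min = U64_MAX, max = 0, total = 0;
	int i;

	/* Time 100 back-to-back synchronize_rcu() calls. */
	for (i = 0; i < 100; i++) {
		ktime_t start = ktime_get();
		u64 delta;

		synchronize_rcu();
		delta = ktime_to_us(ktime_sub(ktime_get(), start));
		total += delta;
		if (delta < min)
			min = delta;
		if (delta > max)
			max = delta;
	}
	pr_info("synchronize_rcu: mean %llu us, min %llu us, max %llu us\n",
		total / 100, min, max);
	return 0;
}

static void __exit sync_rcu_bench_exit(void)
{
}

module_init(sync_rcu_bench_init);
module_exit(sync_rcu_bench_exit);
MODULE_LICENSE("GPL");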