From 255019537cfd63d6adc16a55bcbfd79530d5937e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 29 Dec 2025 11:16:15 -0800
Subject: rcu: Make expedited RCU CPU stall warnings detect stall-end races

If an expedited RCU CPU stall ends just at the stall-warning timeout,
the current code will print an expedited stall-warning message, but one
that doesn't identify any CPUs or tasks causing the stall. This is most
likely to happen for short-timeout stalls, for example, the
20-millisecond timeouts that are sometimes used for small embedded
devices. Needless to say, these semi-empty stall-warning messages can
be rather confusing.

One option would be to suppress the stall-warning message entirely in
this case, but the near-miss information can be quite valuable. Instead,
detect this race condition and emit an "INFO: Expedited stall ended
before state dump start" message to clarify matters.

[boqun: Apply feedback from Borislav]

Reported-by: Borislav Petkov
Signed-off-by: Paul E. McKenney
Acked-by: Borislav Petkov (AMD)
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree_exp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 96c49c56fc14..82cada459e5d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -589,7 +589,12 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne
 	pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
 		j - jiffies_start, rcu_state.expedited_sequence,
 		data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]);
-	if (ndetected) {
+	if (!ndetected) {
+		// This is invoked from the grace-period worker, so
+		// a new grace period cannot have started. And if this
+		// worker were stalled, we would not get here. ;-)
+		pr_err("INFO: Expedited stall ended before state dump start\n");
+	} else {
 		pr_err("blocking rcu_node structures (internal RCU debug):");
 		rcu_for_each_node_breadth_first(rnp) {
 			if (rnp == rnp_root)
-- cgit v1.2.3

From 37d9b475077b5096d41ebdc416a9019bd4fcfdb9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 29 Dec 2025 11:16:16 -0800
Subject: rcutorture: Correctly compute probability to invoke ->exp_current()

Lack of parentheses causes the ->exp_current() function, for example,
srcu_expedite_current(), to be called only once in four billion times
instead of the intended once in 256 times. This commit therefore adds
the needed parentheses.

Reported-by: Chris Mason
Reported-by: Joel Fernandes
Fixes: 950063c6e897 ("rcutorture: Test srcu_expedite_current()")
Signed-off-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 07e51974b06b..83934402a287 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1750,7 +1750,7 @@ rcu_torture_writer(void *arg)
 				ulo[i] = cur_ops->get_comp_state();
 			gp_snap = cur_ops->start_gp_poll();
 			rcu_torture_writer_state = RTWS_POLL_WAIT;
-			if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+			if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
 				cur_ops->exp_current();
 			while (!cur_ops->poll_gp_state(gp_snap)) {
 				gp_snap1 = cur_ops->get_gp_state();
@@ -1772,7 +1772,7 @@ rcu_torture_writer(void *arg)
 				cur_ops->get_comp_state_full(&rgo[i]);
 			cur_ops->start_gp_poll_full(&gp_snap_full);
 			rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
-			if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+			if (cur_ops->exp_current && !(torture_random(&rand) & 0xff))
 				cur_ops->exp_current();
 			while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
 				cur_ops->get_gp_state_full(&gp_snap1_full);
-- cgit v1.2.3
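As a quick illustration of the precedence problem described above, here is
a standalone userspace sketch (not part of the patch): because logical NOT
binds more tightly than '%', "!x % 0xff" parses as "(!x) % 0xff" and is
nonzero only when x == 0, roughly once per 2^32 draws, whereas the intended
"!(x & 0xff)" is nonzero about once per 256 draws.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long buggy_hits = 0, fixed_hits = 0;
	const unsigned long n = 10UL * 1000 * 1000;

	srandom(42);
	for (unsigned long i = 0; i < n; i++) {
		unsigned int x = (unsigned int)random();

		if (!x % 0xff)		/* parses as (!x) % 0xff: fires only when x == 0 */
			buggy_hits++;
		if (!(x & 0xff))	/* intended test: low 8 bits all zero, ~1/256 */
			fixed_hits++;
	}
	/* Expect buggy_hits to be (almost always) 0 and fixed_hits to be ~n/256. */
	printf("buggy: %lu hits, fixed: %lu hits out of %lu draws\n",
	       buggy_hits, fixed_hits, n);
	return 0;
}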
From d41e37f26b3157b3f1d10223863519a943aa239b Mon Sep 17 00:00:00 2001
From: Yao Kai
Date: Thu, 1 Jan 2026 11:34:10 -0500
Subject: rcu: Fix rcu_read_unlock() deadloop due to softirq

Commit 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in
__rcu_read_unlock()") removed the recursion-protection code from
__rcu_read_unlock(). As a result, with ftrace enabled, the call to
raise_softirq_irqoff() in rcu_read_unlock_special() can recurse
without bound, as in the following splat:

WARNING: CPU: 0 PID: 0 at kernel/trace/trace.c:3021 __ftrace_trace_stack.constprop.0+0x172/0x180
Modules linked in: my_irq_work(O)
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G O 6.18.0-rc7-dirty #23 PREEMPT(full)
Tainted: [O]=OOT_MODULE
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:__ftrace_trace_stack.constprop.0+0x172/0x180
RSP: 0018:ffffc900000034a8 EFLAGS: 00010002
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000000000
RDX: 0000000000000003 RSI: ffffffff826d7b87 RDI: ffffffff826e9329
RBP: 0000000000090009 R08: 0000000000000005 R09: ffffffff82afbc4c
R10: 0000000000000008 R11: 0000000000011d7a R12: 0000000000000000
R13: ffff888003874100 R14: 0000000000000003 R15: ffff8880038c1054
FS:  0000000000000000(0000) GS:ffff8880fa8ea000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b31fa7f540 CR3: 00000000078f4005 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 unwind_next_frame+0x203/0x9b0
 __unwind_start+0x15d/0x1c0
 arch_stack_walk+0x62/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 raise_softirq_irqoff+0x6e/0xa0
 rcu_read_unlock_special+0xb1/0x160
 __is_insn_slot_addr+0x54/0x70
 kernel_text_address+0x48/0xc0
 __kernel_text_address+0xd/0x40
 unwind_get_return_address+0x1e/0x40
 arch_stack_walk+0x9c/0xf0
 stack_trace_save+0x48/0x70
 __ftrace_trace_stack.constprop.0+0x144/0x180
 trace_buffer_unlock_commit_regs+0x6d/0x220
 trace_event_buffer_commit+0x5c/0x260
 trace_event_raw_event_softirq+0x47/0x80
 __raise_softirq_irqoff+0x61/0x80
 __flush_smp_call_function_queue+0x115/0x420
 __sysvec_call_function_single+0x17/0xb0
 sysvec_call_function_single+0x8c/0xc0

Commit b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ
work") fixed the infinite loop in rcu_read_unlock_special() for IRQ work
by setting a flag before calling irq_work_queue_on(). Fix this issue by
setting the same flag before calling raise_softirq_irqoff(), and rename
the flag to defer_qs_pending to reflect that it now covers both the
IRQ-work and softirq cases.

Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()")
Reported-by: Tengda Wu
Signed-off-by: Yao Kai
Reviewed-by: Joel Fernandes
Tested-by: Paul E. McKenney
Signed-off-by: Joel Fernandes
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b8bbe7960cda..2265b9c2906e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -203,7 +203,7 @@ struct rcu_data {
 					/* during and after the last grace */
 					/* period it is aware of. */
 	struct irq_work defer_qs_iw;	/* Obtain later scheduler attention. */
-	int defer_qs_iw_pending;	/* Scheduler attention pending? */
+	int defer_qs_pending;		/* irqwork or softirq pending? */
 	struct work_struct strict_work;	/* Schedule readers for strict GPs. */
 
 	/* 2) batch handling */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dbe2d02be824..95ad967adcf3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	union rcu_special special;
 
 	rdp = this_cpu_ptr(&rcu_data);
-	if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
-		rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+	if (rdp->defer_qs_pending == DEFER_QS_PENDING)
+		rdp->defer_qs_pending = DEFER_QS_IDLE;
 
 	/*
 	 * If RCU core is waiting for this CPU to exit its critical section,
@@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 	 * 5. Deferred QS reporting does not happen.
 	 */
 	if (rcu_preempt_depth() > 0)
-		WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+		WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE);
 }
 
 /*
@@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// Using softirq, safe to awaken, and either the
 			// wakeup is free or there is either an expedited
 			// GP in flight or a potential need to deboost.
-			raise_softirq_irqoff(RCU_SOFTIRQ);
+			if (rdp->defer_qs_pending != DEFER_QS_PENDING) {
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
+				raise_softirq_irqoff(RCU_SOFTIRQ);
+			}
 		} else {
 			// Enabling BH or preempt does reschedule, so...
 			// Also if no expediting and no possible deboosting,
@@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			// tick enabled.
 			set_need_resched_current();
 			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
-			    needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+			    needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING &&
 			    cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+				rdp->defer_qs_pending = DEFER_QS_PENDING;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
 		}
-- cgit v1.2.3
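The fix applies the same guard pattern that commit b41642c87716 used for
IRQ work: mark the deferred-QS request as pending before calling the
function that tracing can turn into a re-entrant call. A minimal
standalone userspace sketch of that pattern, using hypothetical names
rather than the kernel's, is shown below; the re-entrant call returns
immediately because it sees the flag already set.

#include <stdio.h>

enum { DEFER_IDLE, DEFER_PENDING };

static int defer_pending = DEFER_IDLE;

static void request_deferred_qs(void);

/* Stand-in for raise_softirq_irqoff(): with ftrace enabled, its tracepoint
 * can re-enter request_deferred_qs(); simulate that directly. */
static void raise_deferred_work(void)
{
	request_deferred_qs();
}

static void request_deferred_qs(void)
{
	if (defer_pending == DEFER_PENDING)
		return;				/* already requested: recursion stops here */
	defer_pending = DEFER_PENDING;		/* set the flag first ... */
	raise_deferred_work();			/* ... then make the possibly re-entrant call */
}

int main(void)
{
	request_deferred_qs();
	printf("returned without looping (pending=%d)\n", defer_pending);
	return 0;
}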
From cee2557ae3b19e0cdfa09695a4d6ba420cc1fd41 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Thu, 1 Jan 2026 11:34:11 -0500
Subject: srcu: Use suitable gfp_flags for the init_srcu_struct_nodes()

When init_srcu_struct*() is used to initialize an srcu_struct, the
structure's ->srcu_sup and ->sda are allocated with GFP_KERNEL.
Similarly, when SRCU_SIZING_INIT is in effect, the srcu_sup's ->node
array can also be allocated with GFP_KERNEL in that case; there is no
need to use GFP_ATOMIC unconditionally. Only the static-initialization
path still requires GFP_ATOMIC.

Signed-off-by: Zqiang
Reviewed-by: Joel Fernandes
Tested-by: Paul E. McKenney
Signed-off-by: Joel Fernandes
Signed-off-by: Boqun Feng
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ea3f128de06f..c469c708fdd6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -262,7 +262,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
 	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
-		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+		if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL))
 			goto err_free_sda;
 		WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 	}
-- cgit v1.2.3
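The change follows the usual convention of letting the caller's context
choose the allocation flags instead of hard-coding GFP_ATOMIC. A
hypothetical helper showing that convention (illustrative only; the SRCU
code keys the decision off its is_static argument rather than a
may_sleep flag) might look like:

#include <linux/slab.h>
#include <linux/gfp.h>

struct demo_node {
	int value;
};

/* Callers that may sleep pass may_sleep = true and get GFP_KERNEL;
 * only callers that cannot sleep fall back to GFP_ATOMIC. */
static struct demo_node *demo_alloc_node(bool may_sleep)
{
	gfp_t gfp = may_sleep ? GFP_KERNEL : GFP_ATOMIC;

	return kzalloc(sizeof(struct demo_node), gfp);
}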
From bc3705e20988778791a4a5e9e2700fbc22cc942d Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 1 Jan 2026 11:34:15 -0500
Subject: rcu: Reduce synchronize_rcu() latency by reporting GP kthread's CPU QS early

The RCU grace period mechanism uses a two-phase FQS (Force Quiescent
State) design where the first FQS saves dyntick-idle snapshots and the
second FQS compares them. This results in long and unnecessary latency
for synchronize_rcu() on idle systems (two FQS waits of ~3ms each with
1000HZ) in cases where one FQS wait would have sufficed.

Investigation showed that the GP kthread's CPU is often the holdout CPU
after the first FQS, because it cannot be detected as "idle" while it is
actively running the FQS scan in the GP kthread.

Therefore, at the end of rcu_gp_init(), immediately report a quiescent
state for the GP kthread's CPU using rcu_qs() + rcu_report_qs_rdp().
The GP kthread cannot be in an RCU read-side critical section while
running GP initialization, so this is safe and results in significant
latency improvements.
The following tests were performed:

(1) synchronize_rcu() benchmarking

100 synchronize_rcu() calls with 32 CPUs, 10 runs each (default fqs
jiffies settings):

Baseline (without fix):

| Run | Mean      | Min      | Max       |
|-----|-----------|----------|-----------|
| 1   | 10.088 ms | 9.989 ms | 18.848 ms |
| 2   | 10.064 ms | 9.982 ms | 16.470 ms |
| 3   | 10.051 ms | 9.988 ms | 15.113 ms |
| 4   | 10.125 ms | 9.929 ms | 22.411 ms |
| 5   |  8.695 ms | 5.996 ms | 15.471 ms |
| 6   | 10.157 ms | 9.977 ms | 25.723 ms |
| 7   | 10.102 ms | 9.990 ms | 20.224 ms |
| 8   |  8.050 ms | 5.985 ms | 10.007 ms |
| 9   | 10.059 ms | 9.978 ms | 15.934 ms |
| 10  | 10.077 ms | 9.984 ms | 17.703 ms |

With fix:

| Run | Mean     | Min      | Max       |
|-----|----------|----------|-----------|
| 1   | 6.027 ms | 5.915 ms |  8.589 ms |
| 2   | 6.032 ms | 5.984 ms |  9.241 ms |
| 3   | 6.010 ms | 5.986 ms |  7.004 ms |
| 4   | 6.076 ms | 5.993 ms | 10.001 ms |
| 5   | 6.084 ms | 5.893 ms | 10.250 ms |
| 6   | 6.034 ms | 5.908 ms |  9.456 ms |
| 7   | 6.051 ms | 5.993 ms | 10.000 ms |
| 8   | 6.057 ms | 5.941 ms | 10.001 ms |
| 9   | 6.016 ms | 5.927 ms |  7.540 ms |
| 10  | 6.036 ms | 5.993 ms |  9.579 ms |

Summary:
- Mean latency: 9.75 ms -> 6.04 ms (38% improvement)
- Max latency: 25.72 ms -> 10.25 ms (60% improvement)

(2) Bridge setup/teardown latency (Uladzislau Rezki)

x86_64 with 64 CPUs, 100 iterations of bridge add/configure/delete:

                                  real time
  1 - default:                    24.221s
  2 - this patch:                 20.754s (14% faster)
  3 - this patch + wake_from_gp:  15.895s (34% faster)
  4 - wake_from_gp only:          18.947s (22% faster)

Per-synchronize_rcu() latency (in usec):

               1         2        3       4
  median:   37249.5   31540.5   15765   22480
  min:       7881      7918      9803    7857
  max:      63651     55639     31861   32040

This patch combined with rcu_normal_wake_from_gp reduces bridge
setup/teardown time from 24 seconds to 16 seconds.

(3) CPU overhead verification (Uladzislau Rezki)

System CPU time across 5 runs showed no measurable increase:

  default:    1.698s - 1.937s
  this patch: 1.667s - 1.930s

Conclusion: variations are within noise, no CPU overhead regression.

(4) rcutorture

Tested TREE and SRCU configurations - no regressions.

Reviewed-by: "Paul E. McKenney"
Tested-by: Uladzislau Rezki (Sony)
Tested-by: Paul E. McKenney
Tested-by: Samir M
Signed-off-by: Joel Fernandes
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 293bbd9ac3f4..2c1c9759e278 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -160,6 +160,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 			      unsigned long gps, unsigned long flags);
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
+static void rcu_report_qs_rdp(struct rcu_data *rdp);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
 static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -1983,6 +1984,17 @@ static noinline_for_stack bool rcu_gp_init(void)
 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
 		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
 
+	/*
+	 * Immediately report QS for the GP kthread's CPU. The GP kthread
+	 * cannot be in an RCU read-side critical section while running
+	 * the FQS scan. This eliminates the need for a second FQS wait
+	 * when all CPUs are idle.
+	 */
+	preempt_disable();
+	rcu_qs();
+	rcu_report_qs_rdp(this_cpu_ptr(&rcu_data));
+	preempt_enable();
+
 	return true;
 }
 
-- cgit v1.2.3
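For reference, the synchronize_rcu() numbers above come from timing
repeated back-to-back calls. A minimal sketch of that kind of measurement
as a throwaway kernel module (hypothetical; not the harness that produced
the tables above) could be:

#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/rcupdate.h>

static int __init sync_rcu_bench_init(void)
{
	u64 min = U64_MAX, max = 0, total = 0;
	int i;

	/* Time 100 back-to-back synchronize_rcu() calls. */
	for (i = 0; i < 100; i++) {
		ktime_t start = ktime_get();
		u64 delta;

		synchronize_rcu();
		delta = ktime_to_us(ktime_sub(ktime_get(), start));
		total += delta;
		if (delta < min)
			min = delta;
		if (delta > max)
			max = delta;
	}
	pr_info("synchronize_rcu: mean %llu us, min %llu us, max %llu us\n",
		total / 100, min, max);
	return 0;
}

static void __exit sync_rcu_bench_exit(void)
{
}

module_init(sync_rcu_bench_init);
module_exit(sync_rcu_bench_exit);
MODULE_LICENSE("GPL");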