From 1806b1f97f7ab0bb8bda7c117b2573a335cea940 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 11:02:49 -0700 Subject: refscale: Add test for sched_clock() This commit adds a "sched-clock" test for the sched_clock() function. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/refscale.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index aacfcc9838b3..1b47376acdc4 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "rcu.h" @@ -531,6 +532,39 @@ static const struct ref_scale_ops acqrel_ops = { static volatile u64 stopopts; +static void ref_sched_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += sched_clock(); + preempt_enable(); + stopopts = x; +} + +static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += sched_clock(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops sched_clock_ops = { + .readsection = ref_sched_clock_section, + .delaysection = ref_sched_clock_delay_section, + .name = "sched-clock" +}; + + static void ref_clock_section(const int nloops) { u64 x = 0; @@ -1130,9 +1164,9 @@ ref_scale_init(void) int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS - &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, - &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, - &typesafe_seqlock_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, + &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, + &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) -- cgit v1.2.3 From 0203b485d26d5b403ff4ed21e4cc85ba9ec0fe67 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 16:42:37 -0700 Subject: torture: Add dowarn argument to torture_sched_setaffinity() Current use cases of torture_sched_setaffinity() are well served by its unconditional warning on error. However, an upcoming use case for a preemption kthread needs to avoid warnings that might otherwise arise when that kthread attempted to bind itself to a CPU on its way offline. This commit therefore adds a dowarn argument that, when false, suppresses the warning. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 6 +++--- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/update.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/rcu') diff --git a/include/linux/torture.h b/include/linux/torture.h index c2e979f82f8d..0134e7221cae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -130,7 +130,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); #endif #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index de95ec07e477..cc33470f4de9 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -106,7 +106,7 @@ static const struct kernel_param_ops lt_bind_ops = { module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); static struct task_struct *stats_task; static struct task_struct **writer_tasks; @@ -1358,7 +1358,7 @@ static int __init lock_torture_init(void) if (torture_init_error(firsterr)) goto unwind; if (cpumask_nonempty(bind_writers)) - torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1369,7 +1369,7 @@ static int __init lock_torture_init(void) if (torture_init_error(firsterr)) goto unwind; if (cpumask_nonempty(bind_readers)) - torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 612d27690335..908506b68c41 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -857,7 +857,7 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - torture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu), true); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f8436969e0c8..c912b594ba98 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -527,12 +527,12 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). */ -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); + WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } EXPORT_SYMBOL_GPL(torture_sched_setaffinity); -- cgit v1.2.3 From 584975ccb7bd8088e681b0b75335295d0a2c6da1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Oct 2024 11:42:53 -0700 Subject: rcutorture: Add random real-time preemption This commit adds the rcutorture.preempt_duration kernel module parameter, which gives the real-time preemption duration in milliseconds (zero to disable, which is the default) and also the rcutorture.preempt_interval module parameter, which gives the interval between successive preemptions, also in milliseconds, defaulting to one second. The CPU to preempt is chosen at random from those online at that time. Races between preempting a given CPU and that CPU going offline are ignored, and preemption is forgone when this occurs. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- Documentation/admin-guide/kernel-parameters.txt | 16 +++++++++ kernel/rcu/rcutorture.c | 44 +++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel/rcu') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dc663c0ca670..65e5343b46cf 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5424,6 +5424,22 @@ Set time (jiffies) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. + rcutorture.preempt_duration= [KNL] + Set duration (in milliseconds) of preemptions + by a high-priority FIFO real-time task. Set to + zero (the default) to disable. The CPUs to + preempt are selected randomly from the set that + are online at a given point in time. Races with + CPUs going offline are ignored, with that attempt + at preemption skipped. + + rcutorture.preempt_interval= [KNL] + Set interval (in milliseconds, defaulting to one + second) between preemptions by a high-priority + FIFO real-time task. This delay is mediated + by an hrtimer and is further fuzzed to avoid + inadvertent synchronizations. + rcutorture.read_exit_burst= [KNL] The number of times in a given read-then-exit episode that a set of read-then-exit kthreads diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 908506b68c41..99780a74da44 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -109,6 +109,8 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); +torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); +torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); @@ -149,6 +151,7 @@ static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; +static struct task_struct *preempt_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -2425,7 +2428,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "read_exit_delay=%d read_exit_burst=%d " "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " - "test_nmis=%d\n", + "test_nmis=%d " + "preempt_duration=%d preempt_interval=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2438,7 +2442,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) read_exit_delay, read_exit_burst, reader_flavor, nocbs_nthreads, nocbs_toggle, - test_nmis); + test_nmis, + preempt_duration, preempt_interval); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3418,6 +3423,35 @@ static void rcutorture_test_nmis(int n) #endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) } +// Randomly preempt online CPUs. +static int rcu_torture_preempt(void *unused) +{ + int cpu = -1; + DEFINE_TORTURE_RANDOM(rand); + + schedule_timeout_idle(stall_cpu_holdoff); + do { + // Wait for preempt_interval ms with up to 100us fuzz. + torture_hrtimeout_ms(preempt_interval, 100, &rand); + // Select online CPU. + cpu = cpumask_next(cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(-1, cpu_online_mask); + WARN_ON_ONCE(cpu >= nr_cpu_ids); + // Move to that CPU, if can't do so, retry later. + if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false)) + continue; + // Preempt at high-ish priority, then reset to normal. + sched_set_fifo(current); + torture_sched_setaffinity(current->pid, cpu_present_mask, true); + mdelay(preempt_duration); + sched_set_normal(current, 0); + stutter_wait("rcu_torture_preempt"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_preempt"); + return 0; +} + static enum cpuhp_state rcutor_hp; static void @@ -3446,6 +3480,7 @@ rcu_torture_cleanup(void) if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); + torture_stop_kthread(rcu_torture_preempt, preempt_task); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); rcu_torture_fwd_prog_cleanup(); @@ -4019,6 +4054,11 @@ rcu_torture_init(void) firsterr = rcu_torture_read_exit_init(); if (torture_init_error(firsterr)) goto unwind; + if (preempt_duration > 0) { + firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task); + if (torture_init_error(firsterr)) + goto unwind; + } if (object_debug) rcu_test_debug_objects(); torture_init_end(); -- cgit v1.2.3 From 579a05da40a4980870a13d30cd0532f77aa15b8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Oct 2024 09:50:40 -0700 Subject: rcutorture: Decorate failing reader segments with CPU ID This commit adds CPU number to the "Failure/close-call rcutorture reader segments" list printed at the end of an rcutorture run that had too-short grace periods. This information can help debugging interactions with migration and CPU hotplug. However, experience indicates that sampling the CPU number in rcutorture's read-side code can reduce the probability of too-short bugs by a small integer factor. And small integer factors are crucial to RCU bug hunting, so this commit also introduces a default-off RCU_TORTURE_TEST_LOG_CPU Kconfig option to enable this CPU-number-logging functionality at build time. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/Kconfig.debug | 15 +++++++++++++++ kernel/rcu/rcutorture.c | 9 +++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b0b52e1836f..b3ac000004bf 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -53,6 +53,21 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_CPU + tristate "Log CPU for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + number of the CPU that the reader was running on at the time. + This information can be useful, but it does incur additional + overhead, overhead that can make both failures and close calls + less probable. + + Say Y here if you want CPU IDs logged. + Say N if you are unsure. + config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 99780a74da44..0bc6fc582215 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -262,6 +262,7 @@ struct rt_read_seg { unsigned long rt_delay_ms; unsigned long rt_delay_us; bool rt_preempted; + int rt_cpu; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -1862,6 +1863,8 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rtrsp->rt_readstate = newstate; + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) + rtrsp->rt_cpu = raw_smp_processor_id(); /* First, put new protection in place to avoid critical-section gap. */ if (statesnew & RCUTORTURE_RDR_BH) @@ -3559,8 +3562,10 @@ rcu_torture_cleanup(void) err_segs[i].rt_delay_us); firsttime = 0; } - pr_cont("%s\n", - err_segs[i].rt_preempted ? "preempted" : ""); + pr_cont("%s", err_segs[i].rt_preempted ? "preempted" : ""); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) + pr_cont(" CPU %d", err_segs[i].rt_cpu); + pr_cont("\n"); } } -- cgit v1.2.3 From 7b6c1648bb6e041b3f2284b7f602283adc852bb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Oct 2024 07:07:47 -0700 Subject: rcutorture: Use finer-grained timeouts for rcu_torture_writer() polling The rcu_torture_writer() polling currently uses timeouts ranging from zero to 16 milliseconds to wait for the polled grace period to end. This works, but it would be better to have a higher probability of exercising races with the code that cleans up after a grace period. This commit therefore switches from these millisecond-scale timeouts to timeouts ranging from zero to 128 microseconds, and with a full microsecond's worth of timeout fuzz. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0bc6fc582215..6067f2740247 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1561,8 +1561,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % 128, 1000, + &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1582,8 +1582,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % 128, 1000, + &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1592,8 +1592,8 @@ rcu_torture_writer(void *arg) gp_snap = cur_ops->start_gp_poll_exp(); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; while (!cur_ops->poll_gp_state_exp(gp_snap)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % 128, 1000, + &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_EXP_FULL: @@ -1601,8 +1601,8 @@ rcu_torture_writer(void *arg) cur_ops->start_gp_poll_exp_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % 128, 1000, + &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: -- cgit v1.2.3 From 4569cf60b6caf26995f314a0c1e14f73ab8c924b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Oct 2024 14:29:11 -0700 Subject: rcutorture: Add ->cond_sync_exp_full function to rcu_ops structure The rcu_ops structure currently lacks a ->cond_sync_exp_full function, which prevents testign of conditional full-state polled grace periods. This commit therefore adds them, enabling testing this option. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6067f2740247..658ac46581d8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -577,6 +577,7 @@ static struct rcu_torture_ops rcu_ops = { .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, .poll_gp_state_exp = poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, + .cond_sync_exp_full = cond_synchronize_rcu_expedited_full, .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, -- cgit v1.2.3 From 0f38c06cab7712fc82c314fe4264a8897f3e6365 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 13:07:11 -0700 Subject: rcutorture: Check preemption for failing reader This commit checks to see if the RCU reader has been preempted within its read-side critical section for RCU flavors supporting this notion (currently only preemptible RCU). If such a preemption occurred, then this is printed at the end of the "Failure/close-call rcutorture reader segments" list at the end of the rcutorture run. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Tested-by: kernel test robot Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/rcupdate_wait.h | 11 +++++++++++ kernel/rcu/rcutorture.c | 11 +++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel/rcu') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 303ab9bee155..f9bed3d3f78d 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -65,4 +65,15 @@ static inline void cond_resched_rcu(void) #endif } +// Has the current task blocked within its current RCU read-side +// critical section? +static inline bool has_rcu_reader_blocked(void) +{ +#ifdef CONFIG_PREEMPT_RCU + return !list_empty(¤t->rcu_node_entry); +#else + return false; +#endif +} + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 658ac46581d8..9b81e21c75d1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -267,6 +267,7 @@ struct rt_read_seg { static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; +static int rt_read_preempted; static const char *rcu_torture_writer_state_getname(void) { @@ -394,6 +395,7 @@ struct rcu_torture_ops { void (*get_gp_data)(int *flags, unsigned long *gp_seq); void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); + bool (*reader_blocked)(void); long cbflood_max; int irq_capable; int can_boost; @@ -587,6 +589,9 @@ static struct rcu_torture_ops rcu_ops = { .get_gp_data = rcutorture_get_gp_data, .gp_slow_register = rcu_gp_slow_register, .gp_slow_unregister = rcu_gp_slow_unregister, + .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) + ? has_rcu_reader_blocked + : NULL, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -2035,6 +2040,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) int newstate; struct rcu_torture *p; int pipe_count; + bool preempted = false; int readstate = 0; struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; struct rt_read_seg *rtrsp = &rtseg[0]; @@ -2100,6 +2106,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); } + if (cur_ops->reader_blocked) + preempted = cur_ops->reader_blocked(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially @@ -2112,6 +2120,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; + rt_read_preempted = preempted; } return true; @@ -3569,6 +3578,8 @@ rcu_torture_cleanup(void) pr_cont("\n"); } + if (rt_read_preempted) + pr_alert("\tReader was preempted.\n"); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); -- cgit v1.2.3 From 3b476823b98685ec4d228af32323854f8e45aac7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 16:58:51 -0700 Subject: rcutorture: Decorate failing reader segments with last CPU ID In kernels built with CONFIG_RCU_TORTURE_TEST_LOG_CPU=y, the CPU is logged at the beginning of each reader segment. This commit further logs it at the end of the full set of reader segments in order to show any migration that might have occurred during the last reader segment. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9b81e21c75d1..61b092a3dc3f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -268,6 +268,7 @@ static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; static int rt_read_preempted; +static int rt_last_cpu; static const char *rcu_torture_writer_state_getname(void) { @@ -2108,6 +2109,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) + rt_last_cpu = raw_smp_processor_id(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially @@ -3580,6 +3583,8 @@ rcu_torture_cleanup(void) } if (rt_read_preempted) pr_alert("\tReader was preempted.\n"); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) + pr_alert("\tReader last ran on CPU %d.\n", rt_last_cpu); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); -- cgit v1.2.3 From b27a34f908c7c2424483e50c13894414bb169aad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Oct 2024 09:47:11 -0700 Subject: rcutorture: Add full read-side contexts to "busted" torture type The purpose of the "busted" torture type is to test rcutorture code paths used only when a too-short grace period is detected. Currently, "busted" only uses normal rcu_read_lock()-style readers, which fails to exercise much of the "Failure/close-call rcutorture reader segments" functionality. This commit therefore sets the .extendables field of rcu_busted_ops to RCUTORTURE_MAX_EXTEND in order to more fully exercise the reporting. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 61b092a3dc3f..81b3743f81dc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -639,6 +639,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" }; -- cgit v1.2.3 From 16338e7cb7450574ae3a210db6f35280fc44e50e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Oct 2024 12:21:28 -0700 Subject: rcutorture: Pretty-print rcutorture reader segments The current "Failure/close-call rcutorture reader segments" output is good and sufficient, but annoying when you have to interpret several tens of them after an all-night rcutorture run. This commit therefore makes them a bit more human-readable. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 81b3743f81dc..c875e7239ae7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3560,25 +3560,39 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); firsttime = 0; } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) + pr_cont(" CPU %-2d", err_segs[i].rt_cpu); if (err_segs[i].rt_delay_ms != 0) { - pr_cont("%s%ldms", firsttime ? "" : "+", + pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); firsttime = 0; } if (err_segs[i].rt_delay_us != 0) { - pr_cont("%s%ldus", firsttime ? "" : "+", + pr_cont(" %s%ldus", firsttime ? "" : "+", err_segs[i].rt_delay_us); firsttime = 0; } - pr_cont("%s", err_segs[i].rt_preempted ? "preempted" : ""); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) - pr_cont(" CPU %d", err_segs[i].rt_cpu); + pr_cont("%s", err_segs[i].rt_preempted ? " preempted" : ""); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH) + pr_cont(" BH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ) + pr_cont(" IRQ"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT) + pr_cont(" PREEMPT"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH) + pr_cont(" RBH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED) + pr_cont(" SCHED"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1) + pr_cont(" RCU_1"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2) + pr_cont(" RCU_2"); pr_cont("\n"); } -- cgit v1.2.3 From ec9d6356bfda69abe5f4767dd56c964127913233 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2024 17:10:22 -0700 Subject: rcutorture: Make rcutorture_one_extend() check reader state This commit adds reader-state debugging checks to a new function named rcutorture_one_extend_check(), which is invoked before and after setting new reader states by the existing rcutorture_one_extend() function. These checks have proven to be rather heavyweight, reducing reproduction rate of some failures by a factor of two. They are therefore hidden behind a new RCU_TORTURE_TEST_CHK_RDR_STATE Kconfig option. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Tested-by: kernel test robot Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/Kconfig.debug | 16 +++++++++++ kernel/rcu/rcutorture.c | 71 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 8 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index b3ac000004bf..6af90510a1ca 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -53,6 +53,22 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_TORTURE_TEST_CHK_RDR_STATE + tristate "Check rcutorture reader state" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to check the desired rcutorture + reader state for each segment against the actual context. + Note that PREEMPT_COUNT must be enabled if the preempt-disabled + and bh-disabled checks are to take effect, and that PREEMPT_RCU + must be enabled for the RCU-nesting checks to take effect. + These checks add overhead, and this Kconfig options is therefore + disabled by default. + + Say Y here if you want rcutorture reader contexts checked. + Say N if you are unsure. + config RCU_TORTURE_TEST_LOG_CPU tristate "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c875e7239ae7..8c7820a00f3c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -359,7 +359,8 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp); void (*readunlock)(int idx); - int (*readlock_held)(void); + int (*readlock_held)(void); // lockdep. + int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -466,6 +467,15 @@ static void rcu_torture_read_unlock(int idx) rcu_read_unlock(); } +static int rcu_torture_readlock_nesting(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return rcu_preempt_depth(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return (preempt_count() & PREEMPT_MASK); + return -1; +} + /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -555,6 +565,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .readlock_held = torture_readlock_not_held, + .readlock_nesting = rcu_torture_readlock_nesting, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, @@ -1847,6 +1858,44 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. } +// Verify the specified RCUTORTURE_RDR* state. +#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x\n", __func__, s, curstate, new, old +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +{ + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + return; + + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); + + // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() == 0, ROEC_ARGS); + + // Timer handlers have all sorts of stuff disabled, so ignore + // unintended disabling. + if (insoftirq) + return; + + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + (preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() > 0, ROEC_ARGS); +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1856,7 +1905,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, +static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { @@ -1870,6 +1919,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); rtrsp->rt_readstate = newstate; if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) rtrsp->rt_cpu = raw_smp_processor_id(); @@ -1890,6 +1940,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_RCU_2) idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; + // Complain unless both the old and the new protection is in place. + rcutorture_one_extend_check("during change", + idxold1 | statesnew, statesnew, statesold, insoftirq); + /* * Next, remove old protection, in decreasing order of strength * to avoid unlock paths that aren't safe in the stronger @@ -1940,6 +1994,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -2006,7 +2061,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. */ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, +rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; @@ -2021,7 +2076,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); + rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); } return &rtrsp[j]; } @@ -2051,7 +2106,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); + rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++); if (checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) cookie = cur_ops->get_gp_state(); @@ -2064,13 +2119,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) !cur_ops->readlock_held || cur_ops->readlock_held()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); + rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp); preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -2112,7 +2167,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) preempted = cur_ops->reader_blocked(); if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) rt_last_cpu = raw_smp_processor_id(); - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. -- cgit v1.2.3 From a2ab1e457897f4fc8ff23f7ead92ee2b29655c04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2024 12:19:35 -0800 Subject: rcutorture: Ignore attempts to test preemption and forward progress Use of the rcutorture preempt_duration and the default-on fwd_progress kernel parameters can result in preemption of callback processing during forward-progress testing, which is an excellent way to OOM your test if your kernel offloads RCU callbacks. This commit therefore treats preempt_duration in the same way as stall_cpu in CONFIG_RCU_NOCB_CPU=y kernels, prohibiting fwd_progress testing and splatting when rcutorture is built in (as opposed to being a loadable module). Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8c7820a00f3c..3595ce889b44 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3145,12 +3145,12 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress = 0; return 0; } - if (stall_cpu > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing"); fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ - WARN_ON(1); /* Make sure rcutorture notices conflict. */ + WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */ return 0; } if (fwd_progress_holdoff <= 0) -- cgit v1.2.3 From 282e06cc8f595e999f778bd284d6a3608f7d1d62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2024 15:42:06 -0800 Subject: rcutorture: Add parameters to control polled/conditional wait interval This commit adds rcutorture module parameters gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, and gp_poll_wi_exp to control the wait interval for conditional, conditional expedited, polled, and polled expedited grace periods, respectively. When rcu_torture_writer() is testing these types of grace periods, hrtimers are used to randomly wait up to the specified number of microseconds, but with nanosecond granularity. In the case of conditional grace periods (get_state_synchronize_rcu() and cond_synchronize_rcu(), for example) there is just one wait. For polled grace periods (start_poll_synchronize_rcu() and poll_state_synchronize_rcu(), for example), there is a repeated series of waits until the grace period ends. For normal grace periods, the default is 16 jiffies (for example, 16,000 microseconds on a HZ=1000 system) and for expedited grace periods the default is 128 microseconds. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- Documentation/admin-guide/kernel-parameters.txt | 38 +++++++++++++++++++++++++ kernel/rcu/rcutorture.c | 37 ++++++++++++++++-------- 2 files changed, 63 insertions(+), 12 deletions(-) (limited to 'kernel/rcu') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 03d13ca0604f..3152f2c1da29 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5380,6 +5380,25 @@ concurrent normal grace periods into account, if available. + rcutorture.gp_cond_wi= [KNL] + Nominal wait interval for normal conditional + grace periods (specified by rcutorture's + gp_cond and gp_cond_full module parameters), + in microseconds. The actual wait interval will + be randomly selected to nanosecond granularity up + to this wait interval. Defaults to 16 jiffies, + for example, 16,000 microseconds on a system + with HZ=1000. + + rcutorture.gp_cond_wi_exp= [KNL] + Nominal wait interval for expedited conditional + grace periods (specified by rcutorture's + gp_cond_exp and gp_cond_exp_full module + parameters), in microseconds. The actual wait + interval will be randomly selected to nanosecond + granularity up to this wait interval. Defaults to + 128 microseconds. + rcutorture.gp_exp= [KNL] Use expedited update-side primitives, if available. @@ -5405,6 +5424,25 @@ primitives that also take concurrent normal grace periods into account, if available. + rcutorture.gp_poll_wi= [KNL] + Nominal wait interval for normal conditional + grace periods (specified by rcutorture's + gp_poll and gp_poll_full module parameters), + in microseconds. The actual wait interval will + be randomly selected to nanosecond granularity up + to this wait interval. Defaults to 16 jiffies, + for example, 16,000 microseconds on a system + with HZ=1000. + + rcutorture.gp_poll_wi_exp= [KNL] + Nominal wait interval for expedited conditional + grace periods (specified by rcutorture's + gp_poll_exp and gp_poll_exp_full module + parameters), in microseconds. The actual wait + interval will be randomly selected to nanosecond + granularity up to this wait interval. Defaults to + 128 microseconds. + rcutorture.gp_sync= [KNL] Use normal (non-expedited) synchronous update-side primitives, if available. If all diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3595ce889b44..235a73dad280 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -92,12 +92,20 @@ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); torture_param(bool, gp_cond_exp_full, false, "Use conditional/async full-stateexpedited GP wait primitives"); +torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal conditional grace periods, us (default 16 jiffies)"); +torture_param(int, gp_cond_wi_exp, 128, + "Wait interval for expedited conditional grace periods, us (default 128 us)"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); +torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal polled grace periods, us (default 16 jiffies)"); +torture_param(int, gp_poll_wi_exp, 128, + "Wait interval for expedited polled grace periods, us (default 128 us)"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -1370,6 +1378,7 @@ static void rcu_torture_write_types(void) pr_alert("%s: gp_sync without primitives.\n", __func__); } pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); + pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp); } /* @@ -1536,7 +1545,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; gp_snap = cur_ops->get_gp_state(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1544,7 +1554,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP: rcu_torture_writer_state = RTWS_COND_GET_EXP; gp_snap = cur_ops->get_gp_state_exp(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP; cur_ops->cond_sync_exp(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1552,7 +1563,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_FULL: rcu_torture_writer_state = RTWS_COND_GET_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_FULL; cur_ops->cond_sync_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1560,7 +1572,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP_FULL: rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; cur_ops->cond_sync_exp_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1580,8 +1593,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); - torture_hrtimeout_us(torture_random(&rand) % 128, 1000, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1601,8 +1614,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); - torture_hrtimeout_us(torture_random(&rand) % 128, 1000, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1611,8 +1624,8 @@ rcu_torture_writer(void *arg) gp_snap = cur_ops->start_gp_poll_exp(); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; while (!cur_ops->poll_gp_state_exp(gp_snap)) - torture_hrtimeout_us(torture_random(&rand) % 128, 1000, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_EXP_FULL: @@ -1620,8 +1633,8 @@ rcu_torture_writer(void *arg) cur_ops->start_gp_poll_exp_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) - torture_hrtimeout_us(torture_random(&rand) % 128, 1000, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: -- cgit v1.2.3 From c31569eec4815c6fa64948c31f00ebe50b1c75dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Nov 2024 14:58:54 -0800 Subject: rcutorture: Add preempt_count() to rcutorture_one_extend_check() diagnostics This commit adds the value of preempt_count() to the diagnostics produced by rcutorture_one_extend_check() to improve debugging. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 235a73dad280..70c27bd67be1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1872,7 +1872,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, } // Verify the specified RCUTORTURE_RDR* state. -#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x\n", __func__, s, curstate, new, old +#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) { if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) -- cgit v1.2.3 From 885a6f4729c688c50bb2f470dfcc0ad0dea43c1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Nov 2024 15:36:55 -0800 Subject: rcutorture: Read CPU ID for decoration protected by both reader types Currently, rcutorture_one_extend() reads the CPU ID before making any change to the type of RCU reader. This can be confusing because the properties of the code from which the CPU ID is read are not that of the reader segment that this same CPU ID is listed with. This commit therefore causes rcutorture_one_extend() to read the CPU ID just after the new protections have been added, but before the old protections have been removed. With this change in place, all of the protections of a given reader segment apply from the reading of one CPU ID to the reading of the next. This change therefore also allows a single read of the CPU ID to work for both the old and the new reader segment. And this dual use of a single read of the CPU ID avoids inflicting any additional to heisenbugs. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 70c27bd67be1..ab354bb7f1b6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -271,12 +271,12 @@ struct rt_read_seg { unsigned long rt_delay_us; bool rt_preempted; int rt_cpu; + int rt_end_cpu; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; static int rt_read_preempted; -static int rt_last_cpu; static const char *rcu_torture_writer_state_getname(void) { @@ -1922,6 +1922,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + bool first; unsigned long flags; int idxnew1 = -1; int idxnew2 = -1; @@ -1930,12 +1931,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; + first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); rtrsp->rt_readstate = newstate; - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) - rtrsp->rt_cpu = raw_smp_processor_id(); /* First, put new protection in place to avoid critical-section gap. */ if (statesnew & RCUTORTURE_RDR_BH) @@ -1957,6 +1957,14 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold, insoftirq); + // Sample CPU under both sets of protections to reduce confusion. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + int cpu = raw_smp_processor_id(); + rtrsp->rt_cpu = cpu; + if (!first) + rtrsp[-1].rt_end_cpu = cpu; + } + /* * Next, remove old protection, in decreasing order of strength * to avoid unlock paths that aren't safe in the stronger @@ -2178,8 +2186,6 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) - rt_last_cpu = raw_smp_processor_id(); rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially @@ -3634,8 +3640,13 @@ rcu_torture_cleanup(void) err_segs[i].rt_delay_jiffies); firsttime = 0; } - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) - pr_cont(" CPU %-2d", err_segs[i].rt_cpu); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + pr_cont(" CPU %2d", err_segs[i].rt_cpu); + if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu) + pr_cont("->%-2d", err_segs[i].rt_end_cpu); + else + pr_cont(" ..."); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); @@ -3666,8 +3677,6 @@ rcu_torture_cleanup(void) } if (rt_read_preempted) pr_alert("\tReader was preempted.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) - pr_alert("\tReader last ran on CPU %d.\n", rt_last_cpu); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); -- cgit v1.2.3 From 223f16b87d70a62058d3b3f4aae0da570b4380a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2024 19:10:19 -0800 Subject: rcutorture: Add per-reader-segment preemption diagnostics For preemptible RCU, this commit adds an indication for each reader segments to whether the rcu_torture_reader() task was on the ->blkd_tasks lists, though only in kernels built with CONFIG_RCU_TORTURE_TEST_LOG_CPU=y. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ab354bb7f1b6..41b661bf000a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -464,10 +464,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) rtrsp->rt_delay_us = shortdelay_us; } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) { + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ - rtrsp->rt_preempted = true; - } } static void rcu_torture_read_unlock(int idx) @@ -1961,8 +1959,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { int cpu = raw_smp_processor_id(); rtrsp->rt_cpu = cpu; - if (!first) + if (!first) { rtrsp[-1].rt_end_cpu = cpu; + if (cur_ops->reader_blocked) + rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); + } } /* -- cgit v1.2.3 From 0fef924e3918e72768357a220c84e6b4dd2b6180 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Nov 2024 11:11:18 -0800 Subject: rcutorture: Use symbols for SRCU reader flavors This commit converts rcutorture.c values for the reader_flavor module parameter from hexadecimal to the SRCU_READ_FLAVOR_* C-preprocessor macros. The actual modprobe or kernel-boot-parameter values for read_flavor must still be entered in hexadecimal. Link: https://lore.kernel.org/all/c48c9dca-fe07-4833-acaa-28c827e5a79e@amd.com/ Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcu.h | 6 ++++++ include/linux/srcutree.h | 6 +----- kernel/rcu/rcutorture.c | 14 +++++++------- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel/rcu') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 08339eb8a01c..da8224d0f71c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,6 +43,12 @@ int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). +#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. + #ifdef CONFIG_TINY_SRCU #include #elif defined(CONFIG_TREE_SRCU) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 490aeecc6bb4..80016bbed672 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -26,6 +26,7 @@ struct srcu_data { atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ + /* Values: SRCU_READ_FLAVOR_.* */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; @@ -43,11 +44,6 @@ struct srcu_data { struct srcu_struct *ssp; }; -/* Values for ->srcu_reader_flavor. */ -#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). -#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). - /* * Node in SRCU combining tree, similar in function to rcu_data. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 41b661bf000a..d26fb1d33ed9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -121,7 +121,7 @@ torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disab torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); -torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); +torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -679,17 +679,17 @@ static int srcu_torture_read_lock(void) int idx; int ret = 0; - if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx; } - if (reader_flavor & 0x2) { + if (reader_flavor & SRCU_READ_FLAVOR_NMI) { idx = srcu_read_lock_nmisafe(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & 0x4) { + if (reader_flavor & SRCU_READ_FLAVOR_LITE) { idx = srcu_read_lock_lite(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; @@ -719,11 +719,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & 0x4) + if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); - if (reader_flavor & 0x2) + if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); - if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) srcu_read_unlock(srcu_ctlp, idx & 0x1); } -- cgit v1.2.3 From 049dfe96baf97228a9e98eaf50a8a7386ec7a483 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Oct 2024 16:57:38 +0200 Subject: rcu: Report callbacks enqueued on offline CPU blind spot Callbacks enqueued after rcutree_report_cpu_dead() fall into RCU barrier blind spot. Report any potential misuse. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..24f1cb292a92 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3084,8 +3084,11 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) head->func = func; head->next = NULL; kasan_record_aux_stack_noalloc(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); + RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!"); + lazy = lazy_in && !rcu_async_should_hurry(); /* Add the callback to our list. */ -- cgit v1.2.3 From d16e32f75f30d5228d9e0a3f6ca77b62c3a4383d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Oct 2024 11:43:28 -0700 Subject: rcu: Make rcu_report_exp_cpu_mult() caller acquire lock There is a hard-to-trigger bug in the expedited grace-period computation whose fix requires that the __sync_rcu_exp_select_node_cpus() function to check that the grace-period sequence number has not changed before invoking rcu_report_exp_cpu_mult(). However, this check must be done while holding the leaf rcu_node structure's ->lock. This commit therefore prepares for that fix by moving this lock's acquisition from rcu_report_exp_cpu_mult() to its callers (all two of them). Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_exp.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fb664d3a01c9..581e88d39542 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -227,16 +227,16 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. + * specified leaf rcu_node structure, which is acquired by the caller. */ -static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, unsigned long mask, bool wake) + __releases(rnp->lock) { int cpu; - unsigned long flags; struct rcu_data *rdp; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_lockdep_assert_held_rcu_node(rnp); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -257,8 +257,12 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); - rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } /* Common code for work-done checking. */ @@ -432,8 +436,10 @@ retry_ipi: raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); + if (mask_ofl_test) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false); + } } static void rcu_exp_sel_wait_wake(unsigned long s); -- cgit v1.2.3 From e2bd168295e4eb719a343086baddfe0abca512a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Oct 2024 15:56:17 -0700 Subject: rcu: Move rcu_report_exp_rdp() setting of ->cpu_no_qs.b.exp under lock This commit reduces the state space of rcu_report_exp_rdp() by moving the setting of ->cpu_no_qs.b.exp under the rcu_node structure's ->lock. The lock isn't really all that important here, given that this per-CPU field is supposed to be written only by its CPU, but the disabling of interrupts excludes things like rcu_exp_handler(), which also can write to this same field. Avoiding this sort of interleaved access reduces the state space. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 581e88d39542..5c4ea66cc00d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -260,8 +260,8 @@ static void rcu_report_exp_rdp(struct rcu_data *rdp) unsigned long flags; struct rcu_node *rnp = rdp->mynode; - WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); raw_spin_lock_irqsave_rcu_node(rnp, flags); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } -- cgit v1.2.3 From 6ae4c30fee05d97c4f53237e3cd13795a6f44422 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Oct 2024 16:33:36 -0700 Subject: rcu: Replace open-coded rcu_exp_need_qs() from rcu_exp_handler() with call Currently, the preemptible implementation of rcu_exp_handler() almost open-codes rcu_exp_need_qs(). A call to that function would be shorter and would improve expediting in cases where rcu_exp_handler() interrupted a preemption-disabled or bh-disabled region of code. This commit therefore moves rcu_exp_need_qs() out of the non-preemptible leg of the enclosing #ifdef and replaces the open coding in preemptible rcu_exp_handler() with a call to rcu_exp_need_qs(). Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_exp.h | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 5c4ea66cc00d..f3884393b947 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -718,6 +718,16 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + #ifdef CONFIG_PREEMPT_RCU /* @@ -742,13 +752,10 @@ static void rcu_exp_handler(void *unused) */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_is_cpu_rrupt_from_idle()) { + rcu_is_cpu_rrupt_from_idle()) rcu_report_exp_rdp(rdp); - } else { - WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + else + rcu_exp_need_qs(); return; } @@ -841,16 +848,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ -/* Request an expedited quiescent state. */ -static void rcu_exp_need_qs(void) -{ - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { -- cgit v1.2.3 From 7a323371197b184c8f0cff0d67d74f48f0098164 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Oct 2024 17:17:16 -0700 Subject: rcu: Make preemptible rcu_exp_handler() check idempotency Although the non-preemptible implementation of rcu_exp_handler() contains checks to enforce idempotency, the preemptible version does not. The reason for this omission is that in preemptible kernels, there is no reporting of quiescent states from CPU hotplug notifiers, and thus no need for idempotency. In theory, anyway. In practice, accidents happen. This commit therefore adds checks under WARN_ON_ONCE() to catch any such accidents. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_exp.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f3884393b947..6985c998fe6b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -746,7 +746,19 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, the common case of not being in an RCU read-side + * First, is there no need for a quiescent state from this CPU, + * or is this CPU already looking for a quiescent state for the + * current grace period? If either is the case, just leave. + * However, this should not happen due to the preemptible + * sync_sched_exp_online_cleanup() implementation being a no-op, + * so warn if this does happen. + */ + if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + READ_ONCE(rdp->cpu_no_qs.b.exp))) + return; + + /* + * Second, the common case of not being in an RCU read-side * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ @@ -760,7 +772,7 @@ static void rcu_exp_handler(void *unused) } /* - * Second, the less-common case of being in an RCU read-side + * Third, the less-common case of being in an RCU read-side * critical section. In this case we can count on a future * rcu_read_unlock(). However, this rcu_read_unlock() might * execute on some other CPU, but in that case there will be @@ -781,7 +793,7 @@ static void rcu_exp_handler(void *unused) return; } - // Finally, negative nesting depth should not happen. + // Fourth and finally, negative nesting depth should not happen. WARN_ON_ONCE(1); } -- cgit v1.2.3 From ecc5e6b0d3c982091b82615af21817ddf9cfdd60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Oct 2024 17:44:28 -0700 Subject: rcu: Add KCSAN exclusive-writer assertions for rdp->cpu_no_qs.b.exp The value of rdp->cpu_no_qs.b.exp may be changed only by the corresponding CPU, and that CPU is not even allowed to race with itself, for example, via interrupt handlers. This commit therefore adds KCSAN exclusive-writer assertions to check this constraint. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_exp.h | 4 ++++ kernel/rcu/tree_plugin.h | 1 + 2 files changed, 5 insertions(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6985c998fe6b..ce5b09921d04 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -262,6 +262,7 @@ static void rcu_report_exp_rdp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); + ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp); rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } @@ -721,6 +722,7 @@ static void rcu_exp_sel_wait_wake(unsigned long s) /* Request an expedited quiescent state. */ static void rcu_exp_need_qs(void) { + ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp)); __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); @@ -753,6 +755,7 @@ static void rcu_exp_handler(void *unused) * sync_sched_exp_online_cleanup() implementation being a no-op, * so warn if this does happen. */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || READ_ONCE(rdp->cpu_no_qs.b.exp))) return; @@ -867,6 +870,7 @@ static void rcu_exp_handler(void *unused) struct rcu_node *rnp = rdp->mynode; bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3927ea5f7955..bb7ca6eb9ef0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -275,6 +275,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rcu_report_exp_rdp(rdp); else WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); } /* -- cgit v1.2.3 From 1bb03ad383a7311f609ecfd13c3aaab248c0627f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Oct 2024 19:56:20 -0700 Subject: rcu: Add lockdep_assert_irqs_disabled() to rcu_exp_need_qs() Callers to rcu_exp_need_qs() are supposed to disable interrupts, so this commit enlists lockdep's aid in checking this. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree_exp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index ce5b09921d04..77efed89c79e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -722,6 +722,7 @@ static void rcu_exp_sel_wait_wake(unsigned long s) /* Request an expedited quiescent state. */ static void rcu_exp_need_qs(void) { + lockdep_assert_irqs_disabled(); ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp)); __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ -- cgit v1.2.3 From d465492a224b2409508224cf6970d7b97e2285cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Oct 2024 15:09:39 -0700 Subject: srcu: Guarantee non-negative return value from srcu_read_lock() For almost 20 years, the int return value from srcu_read_lock() has been always either zero or one. This commit therefore documents the fact that it will be non-negative, and does the same for the underlying __srcu_read_lock(). [ paulmck: Apply Andrii Nakryiko feedback. ] Signed-off-by: Paul E. McKenney Acked-by: Andrii Nakryiko Acked-by: Peter Zijlstra (Intel) Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcu.h | 15 ++++++++------- kernel/rcu/srcutree.c | 3 ++- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel/rcu') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 08339eb8a01c..abaddd7e6ddf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -232,13 +232,14 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * The return value from srcu_read_lock() must be passed unaltered - * to the matching srcu_read_unlock(). Note that srcu_read_lock() and - * the matching srcu_read_unlock() must occur in the same context, for - * example, it is illegal to invoke srcu_read_unlock() in an irq handler - * if the matching srcu_read_lock() was invoked in process context. Or, - * for that matter to invoke srcu_read_unlock() from one task and the - * matching srcu_read_lock() from another. + * The return value from srcu_read_lock() is guaranteed to be + * non-negative. This value must be passed unaltered to the matching + * srcu_read_unlock(). Note that srcu_read_lock() and the matching + * srcu_read_unlock() must occur in the same context, for example, it is + * illegal to invoke srcu_read_unlock() in an irq handler if the matching + * srcu_read_lock() was invoked in process context. Or, for that matter to + * invoke srcu_read_unlock() from one task and the matching srcu_read_lock() + * from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5e2e53464794..26ef58b481aa 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -738,7 +738,8 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * Returns a guaranteed non-negative index that must be passed to the + * matching __srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *ssp) { -- cgit v1.2.3 From 45c7c67643ae6391a354e42cf729d807fe341491 Mon Sep 17 00:00:00 2001 From: Feng Lee <379943137@qq.com> Date: Tue, 19 Nov 2024 15:29:49 +0800 Subject: srcu: Remove redundant GP sequence checks in srcu_funnel_gp_start We will perform GP sequence checking at the beginning of srcu_gp_start, thus making it safe to remove duplicate GP sequence checks prior to calling srcu_gp_start. Signed-off-by: Feng Lee <379943137@qq.com> Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/srcutree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 26ef58b481aa..b83c74c4dcc0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1077,7 +1077,6 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* If grace period not already in progress, start it. */ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause -- cgit v1.2.3 From db7ee3cb620b2cec5a5f44767ab93cb4eb80d961 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:05 +0200 Subject: rcu: Use kthread preferred affinity for RCU boost Now that kthreads have an infrastructure to handle preferred affinity against CPU hotplug and housekeeping cpumask, convert RCU boost to use it instead of handling all the constraints by itself. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 27 +++++++++++++++++++-------- kernel/rcu/tree_plugin.h | 11 ++--------- 2 files changed, 21 insertions(+), 17 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..4a4c49821058 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,6 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -5011,6 +5010,22 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + /* * Update kthreads affinity during CPU-hotplug changes. * @@ -5030,19 +5045,18 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; - struct task_struct *task_boost, *task_exp; + struct task_struct *task_exp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - task_boost = rcu_boost_task(rnp); task_exp = rcu_exp_par_gp_task(rnp); /* - * If CPU is the boot one, those tasks are created later from early + * If CPU is the boot one, this task is created later from early * initcall since kthreadd must be created first. */ - if (!task_boost && !task_exp) + if (!task_exp) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) @@ -5064,9 +5078,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) if (task_exp) set_cpus_allowed_ptr(task_exp, cm); - if (task_boost) - set_cpus_allowed_ptr(task_boost, cm); - mutex_unlock(&rnp->kthread_mutex); free_cpumask_var(cm); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3927ea5f7955..b141b8cc4f41 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1217,16 +1217,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + rcu_thread_affine_rnp(t, rnp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->boost_kthread_task); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1243,10 +1240,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return NULL; -} #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* -- cgit v1.2.3 From b04e317b522630b46f78ee62ecbdc5734e8d43de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:07 +0200 Subject: treewide: Introduce kthread_run_worker[_on_cpu]() kthread_create() creates a kthread without running it yet. kthread_run() creates a kthread and runs it. On the other hand, kthread_create_worker() creates a kthread worker and runs it. This difference in behaviours is confusing. Also there is no way to create a kthread worker and affine it using kthread_bind_mask() or kthread_affine_preferred() before starting it. Consolidate the behaviours and introduce kthread_run_worker[_on_cpu]() that behaves just like kthread_run(). kthread_create_worker[_on_cpu]() will now only create a kthread worker without starting it. Signed-off-by: Frederic Weisbecker Signed-off-by: Dan Carpenter --- arch/x86/kvm/i8254.c | 2 +- crypto/crypto_engine.c | 2 +- drivers/cpufreq/cppc_cpufreq.c | 2 +- drivers/gpu/drm/drm_vblank_work.c | 2 +- .../gpu/drm/i915/gem/selftests/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/gt/selftest_execlists.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_slpc.c | 2 +- drivers/gpu/drm/i915/selftests/i915_request.c | 8 ++-- drivers/gpu/drm/msm/disp/msm_disp_snapshot.c | 2 +- drivers/gpu/drm/msm/msm_atomic.c | 2 +- drivers/gpu/drm/msm/msm_gpu.c | 2 +- drivers/gpu/drm/msm/msm_kms.c | 2 +- .../media/platform/chips-media/wave5/wave5-vpu.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- drivers/net/ethernet/intel/ice/ice_dpll.c | 2 +- drivers/net/ethernet/intel/ice/ice_gnss.c | 2 +- drivers/net/ethernet/intel/ice/ice_ptp.c | 2 +- drivers/platform/chrome/cros_ec_spi.c | 2 +- drivers/ptp/ptp_clock.c | 2 +- drivers/spi/spi.c | 2 +- drivers/usb/typec/tcpm/tcpm.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/watchdog/watchdog_dev.c | 2 +- fs/erofs/zdata.c | 2 +- include/linux/kthread.h | 48 ++++++++++++++++++---- kernel/kthread.c | 31 ++++---------- kernel/rcu/tree.c | 4 +- kernel/sched/ext.c | 2 +- kernel/workqueue.c | 2 +- net/dsa/tag_ksz.c | 2 +- net/dsa/tag_ocelot_8021q.c | 2 +- net/dsa/tag_sja1105.c | 2 +- 33 files changed, 83 insertions(+), 66 deletions(-) (limited to 'kernel/rcu') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index cd57a517d04a..d7ab8780ab9e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker)) goto fail_kthread; diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index e60a0eb628e8..c7c16da5e649 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -517,7 +517,7 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev, crypto_init_queue(&engine->queue, qlen); spin_lock_init(&engine->queue_lock); - engine->kworker = kthread_create_worker(0, "%s", engine->name); + engine->kworker = kthread_run_worker(0, "%s", engine->name); if (IS_ERR(engine->kworker)) { dev_err(dev, "failed to create crypto request pump task\n"); return NULL; diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index bd8f75accfa0..2486a6c5256a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -225,7 +225,7 @@ static void __init cppc_freq_invariance_init(void) if (fie_disabled) return; - kworker_fie = kthread_create_worker(0, "cppc_fie"); + kworker_fie = kthread_run_worker(0, "cppc_fie"); if (IS_ERR(kworker_fie)) { pr_warn("%s: failed to create kworker_fie: %ld\n", __func__, PTR_ERR(kworker_fie)); diff --git a/drivers/gpu/drm/drm_vblank_work.c b/drivers/gpu/drm/drm_vblank_work.c index 1752ffb44e1d..9cc71120246f 100644 --- a/drivers/gpu/drm/drm_vblank_work.c +++ b/drivers/gpu/drm/drm_vblank_work.c @@ -277,7 +277,7 @@ int drm_vblank_worker_init(struct drm_vblank_crtc *vblank) INIT_LIST_HEAD(&vblank->pending_work); init_waitqueue_head(&vblank->work_wait_queue); - worker = kthread_create_worker(0, "card%d-crtc%d", + worker = kthread_run_worker(0, "card%d-crtc%d", vblank->dev->primary->index, vblank->pipe); if (IS_ERR(worker)) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c index 89d4dc8b60c6..eb0158e43417 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c @@ -369,7 +369,7 @@ static int live_parallel_switch(void *arg) if (!data[n].ce[0]) continue; - worker = kthread_create_worker(0, "igt/parallel:%s", + worker = kthread_run_worker(0, "igt/parallel:%s", data[n].ce[0]->engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c index 222ca7c44951..81c31396eceb 100644 --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c @@ -3574,7 +3574,7 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) arg[id].batch = NULL; arg[id].count = 0; - worker[id] = kthread_create_worker(0, "igt/smoke:%d", id); + worker[id] = kthread_run_worker(0, "igt/smoke:%d", id); if (IS_ERR(worker[id])) { err = PTR_ERR(worker[id]); break; diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 9ce8ff1c04fe..9d3aeb237295 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -1025,7 +1025,7 @@ static int __igt_reset_engines(struct intel_gt *gt, threads[tmp].engine = other; threads[tmp].flags = flags; - worker = kthread_create_worker(0, "igt/%s", + worker = kthread_run_worker(0, "igt/%s", other->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/i915/gt/selftest_slpc.c b/drivers/gpu/drm/i915/gt/selftest_slpc.c index 4ecc4ae74a54..e218b229681f 100644 --- a/drivers/gpu/drm/i915/gt/selftest_slpc.c +++ b/drivers/gpu/drm/i915/gt/selftest_slpc.c @@ -489,7 +489,7 @@ static int live_slpc_tile_interaction(void *arg) return -ENOMEM; for_each_gt(gt, i915, i) { - threads[i].worker = kthread_create_worker(0, "igt/slpc_parallel:%d", gt->info.id); + threads[i].worker = kthread_run_worker(0, "igt/slpc_parallel:%d", gt->info.id); if (IS_ERR(threads[i].worker)) { ret = PTR_ERR(threads[i].worker); diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c index acae30a04a94..88870844b5bd 100644 --- a/drivers/gpu/drm/i915/selftests/i915_request.c +++ b/drivers/gpu/drm/i915/selftests/i915_request.c @@ -492,7 +492,7 @@ static int mock_breadcrumbs_smoketest(void *arg) for (n = 0; n < ncpus; n++) { struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/%d", n); + worker = kthread_run_worker(0, "igt/%d", n); if (IS_ERR(worker)) { ret = PTR_ERR(worker); ncpus = n; @@ -1645,7 +1645,7 @@ static int live_parallel_engines(void *arg) for_each_uabi_engine(engine, i915) { struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/parallel:%s", + worker = kthread_run_worker(0, "igt/parallel:%s", engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); @@ -1806,7 +1806,7 @@ static int live_breadcrumbs_smoketest(void *arg) unsigned int i = idx * ncpus + n; struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/%d.%d", idx, n); + worker = kthread_run_worker(0, "igt/%d.%d", idx, n); if (IS_ERR(worker)) { ret = PTR_ERR(worker); goto out_flush; @@ -3219,7 +3219,7 @@ static int perf_parallel_engines(void *arg) memset(&engines[idx].p, 0, sizeof(engines[idx].p)); - worker = kthread_create_worker(0, "igt:%s", + worker = kthread_run_worker(0, "igt:%s", engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c index e75b97127c0d..2be00b11e557 100644 --- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c +++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c @@ -109,7 +109,7 @@ int msm_disp_snapshot_init(struct drm_device *drm_dev) mutex_init(&kms->dump_mutex); - kms->dump_worker = kthread_create_worker(0, "%s", "disp_snapshot"); + kms->dump_worker = kthread_run_worker(0, "%s", "disp_snapshot"); if (IS_ERR(kms->dump_worker)) DRM_ERROR("failed to create disp state task\n"); diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c index 9c45d641b521..a7a2384044ff 100644 --- a/drivers/gpu/drm/msm/msm_atomic.c +++ b/drivers/gpu/drm/msm/msm_atomic.c @@ -115,7 +115,7 @@ int msm_atomic_init_pending_timer(struct msm_pending_timer *timer, timer->kms = kms; timer->crtc_idx = crtc_idx; - timer->worker = kthread_create_worker(0, "atomic-worker-%d", crtc_idx); + timer->worker = kthread_run_worker(0, "atomic-worker-%d", crtc_idx); if (IS_ERR(timer->worker)) { int ret = PTR_ERR(timer->worker); timer->worker = NULL; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0d4a3744cfcb..8557998e0c92 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -859,7 +859,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, gpu->funcs = funcs; gpu->name = name; - gpu->worker = kthread_create_worker(0, "gpu-worker"); + gpu->worker = kthread_run_worker(0, "gpu-worker"); if (IS_ERR(gpu->worker)) { ret = PTR_ERR(gpu->worker); gpu->worker = NULL; diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c index f3326d09bdbc..dac831ba6219 100644 --- a/drivers/gpu/drm/msm/msm_kms.c +++ b/drivers/gpu/drm/msm/msm_kms.c @@ -269,7 +269,7 @@ int msm_drm_kms_init(struct device *dev, const struct drm_driver *drv) /* initialize event thread */ ev_thread = &priv->event_thread[drm_crtc_index(crtc)]; ev_thread->dev = ddev; - ev_thread->worker = kthread_create_worker(0, "crtc_event:%d", crtc->base.id); + ev_thread->worker = kthread_run_worker(0, "crtc_event:%d", crtc->base.id); if (IS_ERR(ev_thread->worker)) { ret = PTR_ERR(ev_thread->worker); DRM_DEV_ERROR(dev, "failed to create crtc_event kthread\n"); diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c b/drivers/media/platform/chips-media/wave5/wave5-vpu.c index 6b294a2d6717..d1320298a0f7 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c @@ -271,7 +271,7 @@ static int wave5_vpu_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to get irq resource, falling back to polling\n"); hrtimer_init(&dev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); dev->hrtimer.function = &wave5_vpu_timer_callback; - dev->worker = kthread_create_worker(0, "vpu_irq_thread"); + dev->worker = kthread_run_worker(0, "vpu_irq_thread"); if (IS_ERR(dev->worker)) { dev_err(&pdev->dev, "failed to create vpu irq worker\n"); ret = PTR_ERR(dev->worker); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3a792f79270d..377e66cf7a48 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -394,7 +394,7 @@ static int mv88e6xxx_irq_poll_setup(struct mv88e6xxx_chip *chip) kthread_init_delayed_work(&chip->irq_poll_work, mv88e6xxx_irq_poll); - chip->kworker = kthread_create_worker(0, "%s", dev_name(chip->dev)); + chip->kworker = kthread_run_worker(0, "%s", dev_name(chip->dev)); if (IS_ERR(chip->kworker)) return PTR_ERR(chip->kworker); diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index d5ad6d84007c..75570be61fef 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2053,7 +2053,7 @@ static int ice_dpll_init_worker(struct ice_pf *pf) struct kthread_worker *kworker; kthread_init_delayed_work(&d->work, ice_dpll_periodic_work); - kworker = kthread_create_worker(0, "ice-dplls-%s", + kworker = kthread_run_worker(0, "ice-dplls-%s", dev_name(ice_pf_to_dev(pf))); if (IS_ERR(kworker)) return PTR_ERR(kworker); diff --git a/drivers/net/ethernet/intel/ice/ice_gnss.c b/drivers/net/ethernet/intel/ice/ice_gnss.c index f02e8ca55375..b2148dbe49b2 100644 --- a/drivers/net/ethernet/intel/ice/ice_gnss.c +++ b/drivers/net/ethernet/intel/ice/ice_gnss.c @@ -182,7 +182,7 @@ static struct gnss_serial *ice_gnss_struct_init(struct ice_pf *pf) pf->gnss_serial = gnss; kthread_init_delayed_work(&gnss->read_work, ice_gnss_read); - kworker = kthread_create_worker(0, "ice-gnss-%s", dev_name(dev)); + kworker = kthread_run_worker(0, "ice-gnss-%s", dev_name(dev)); if (IS_ERR(kworker)) { kfree(gnss); return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index a999fface272..3154bb674dd3 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -3080,7 +3080,7 @@ static int ice_ptp_init_work(struct ice_pf *pf, struct ice_ptp *ptp) /* Allocate a kworker for handling work required for the ports * connected to the PTP hardware clock. */ - kworker = kthread_create_worker(0, "ice-ptp-%s", + kworker = kthread_run_worker(0, "ice-ptp-%s", dev_name(ice_pf_to_dev(pf))); if (IS_ERR(kworker)) return PTR_ERR(kworker); diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index 86a3d32a7763..08f566cc1480 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -715,7 +715,7 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev, int err; ec_spi->high_pri_worker = - kthread_create_worker(0, "cros_ec_spi_high_pri"); + kthread_run_worker(0, "cros_ec_spi_high_pri"); if (IS_ERR(ec_spi->high_pri_worker)) { err = PTR_ERR(ec_spi->high_pri_worker); diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 77a36e7bddd5..b932425ddc6a 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -296,7 +296,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (ptp->info->do_aux_work) { kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); - ptp->kworker = kthread_create_worker(0, "ptp%d", ptp->index); + ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index); if (IS_ERR(ptp->kworker)) { err = PTR_ERR(ptp->kworker); pr_err("failed to create ptp aux_worker %d\n", err); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index ff1add2ecb91..e4aa8f838934 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2060,7 +2060,7 @@ static int spi_init_queue(struct spi_controller *ctlr) ctlr->busy = false; ctlr->queue_empty = true; - ctlr->kworker = kthread_create_worker(0, dev_name(&ctlr->dev)); + ctlr->kworker = kthread_run_worker(0, dev_name(&ctlr->dev)); if (IS_ERR(ctlr->kworker)) { dev_err(&ctlr->dev, "failed to create message pump kworker\n"); return PTR_ERR(ctlr->kworker); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 6021eeb903fe..95c0c63119ac 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -7635,7 +7635,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) mutex_init(&port->lock); mutex_init(&port->swap_lock); - port->wq = kthread_create_worker(0, dev_name(dev)); + port->wq = kthread_run_worker(0, dev_name(dev)); if (IS_ERR(port->wq)) return ERR_CAST(port->wq); sched_set_fifo(port->wq->task); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 8ffea8430f95..c204fc8e471a 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -229,7 +229,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, dev = &vdpasim->vdpa.dev; kthread_init_work(&vdpasim->work, vdpasim_work_fn); - vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s", + vdpasim->worker = kthread_run_worker(0, "vDPA sim worker: %s", dev_attr->name); if (IS_ERR(vdpasim->worker)) goto err_iommu; diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 4190cb800cc4..19698d87dc57 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1229,7 +1229,7 @@ int __init watchdog_dev_init(void) { int err; - watchdog_kworker = kthread_create_worker(0, "watchdogd"); + watchdog_kworker = kthread_run_worker(0, "watchdogd"); if (IS_ERR(watchdog_kworker)) { pr_err("Failed to create watchdog kworker\n"); return PTR_ERR(watchdog_kworker); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a23392327ce2..35381c00ee09 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -320,7 +320,7 @@ static void erofs_destroy_percpu_workers(void) static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = - kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u"); + kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u"); if (IS_ERR(worker)) return worker; diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0c66e7c1092a..8d27403888ce 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -193,19 +193,53 @@ struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, const char namefmt[], ...); #define kthread_create_worker(flags, namefmt, ...) \ -({ \ - struct kthread_worker *__kw \ - = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ - namefmt, ## __VA_ARGS__); \ - if (!IS_ERR(__kw)) \ - wake_up_process(__kw->task); \ - __kw; \ + kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); + +/** + * kthread_run_worker - create and wake a kthread worker. + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. + * + * Description: Convenient wrapper for kthread_create_worker() followed by + * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). + */ +#define kthread_run_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); +/** + * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. + * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_worker_on_cpu() + * followed by wake_up_process(). Returns the kthread_worker or + * ERR_PTR(-ENOMEM). + */ +static inline struct kthread_worker * +kthread_run_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[]) +{ + struct kthread_worker *kw; + + kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); + if (!IS_ERR(kw)) + wake_up_process(kw->task); + + return kw; +} + bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2d52126d5da0..922f15762ec3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1077,33 +1077,10 @@ kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[] worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); - if (worker) - wake_up_process(worker->task); - return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); -static __printf(3, 4) struct kthread_worker * -__kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) -{ - struct kthread_worker *worker; - va_list args; - - va_start(args, namefmt); - worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu), - namefmt, args); - va_end(args); - - if (worker) { - kthread_bind(worker->task, cpu); - wake_up_process(worker->task); - } - - return worker; -} - /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. @@ -1144,7 +1121,13 @@ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { - return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); + if (!IS_ERR(worker)) + kthread_bind(worker->task, cpu); + + return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a4c49821058..d4b8e87a473b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4906,7 +4906,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (rnp->exp_kworker) return; - kworker = kthread_create_worker(0, name, rnp_index); + kworker = kthread_run_worker(0, name, rnp_index); if (IS_ERR_OR_NULL(kworker)) { pr_err("Failed to create par gp kworker on %d/%d\n", rnp->grplo, rnp->grphi); @@ -4933,7 +4933,7 @@ static void __init rcu_start_exp_gp_kworker(void) const char *name = "rcu_exp_gp_kthread_worker"; struct sched_param param = { .sched_priority = kthread_prio }; - rcu_exp_gp_kworker = kthread_create_worker(0, name); + rcu_exp_gp_kworker = kthread_run_worker(0, name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d045477..ab8962d2e9d3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5352,7 +5352,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) { struct kthread_worker *helper; - helper = kthread_create_worker(0, name); + helper = kthread_run_worker(0, name); if (helper) sched_set_fifo(helper->task); return helper; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a5..fe01c1f8095c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7828,7 +7828,7 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 281bbac5539d..c33d4bf17929 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { ret = PTR_ERR(xmit_worker); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 8e8b1bef6af6..6ce0bc166792 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - priv->xmit_worker = kthread_create_worker(0, "felix_xmit"); + priv->xmit_worker = kthread_run_worker(0, "felix_xmit"); if (IS_ERR(priv->xmit_worker)) { err = PTR_ERR(priv->xmit_worker); kfree(priv); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 3e902af7eea6..02adec693811 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds) spin_lock_init(&priv->meta_lock); - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { err = PTR_ERR(xmit_worker); -- cgit v1.2.3 From 8044c589767456af2061ca03468aa6a295da1925 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:08 +0200 Subject: rcu: Use kthread preferred affinity for RCU exp kworkers Now that kthreads have an infrastructure to handle preferred affinity against CPU hotplug and housekeeping cpumask, convert RCU exp workers to use it instead of handling all the constraints by itself. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 105 ++++++++++-------------------------------------------- 1 file changed, 19 insertions(+), 86 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d4b8e87a473b..c160e05dfb7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4894,6 +4894,22 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4906,7 +4922,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (rnp->exp_kworker) return; - kworker = kthread_run_worker(0, name, rnp_index); + kworker = kthread_create_worker(0, name, rnp_index); if (IS_ERR_OR_NULL(kworker)) { pr_err("Failed to create par gp kworker on %d/%d\n", rnp->grplo, rnp->grphi); @@ -4916,16 +4932,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); -} -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); - - if (!kworker) - return NULL; - - return kworker->task; + rcu_thread_affine_rnp(kworker->task, rnp); + wake_up_process(kworker->task); } static void __init rcu_start_exp_gp_kworker(void) @@ -5010,79 +5019,6 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } -static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) -{ - cpumask_var_t affinity; - int cpu; - - if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) - return; - - for_each_leaf_node_possible_cpu(rnp, cpu) - cpumask_set_cpu(cpu, affinity); - - kthread_affine_preferred(t, affinity); - - free_cpumask_var(affinity); -} - -/* - * Update kthreads affinity during CPU-hotplug changes. - * - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->kthread_mutex. - */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) -{ - cpumask_var_t cm; - unsigned long mask; - struct rcu_data *rdp; - struct rcu_node *rnp; - struct task_struct *task_exp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - - task_exp = rcu_exp_par_gp_task(rnp); - - /* - * If CPU is the boot one, this task is created later from early - * initcall since kthreadd must be created first. - */ - if (!task_exp) - return; - - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - - mutex_lock(&rnp->kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - - if (task_exp) - set_cpus_allowed_ptr(task_exp, cm); - - mutex_unlock(&rnp->kthread_mutex); - - free_cpumask_var(cm); -} - /* * Has the specified (known valid) CPU ever been fully online? */ @@ -5111,7 +5047,6 @@ int rcutree_online_cpu(unsigned int cpu) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -5328,8 +5263,6 @@ int rcutree_offline_cpu(unsigned int cpu) rnp->ffmask &= ~rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcutree_affinity_setting(cpu, cpu); - // nohz_full CPUs need the tick for stop-machine to work quickly tick_dep_set(TICK_DEP_BIT_RCU); return 0; -- cgit v1.2.3 From 0f52b4db4f91320569311b97a1a14a18fb8ff256 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:04 +0100 Subject: rcu/kvfree: Initialize kvfree_rcu() separately Introduce a separate initialization of kvfree_rcu() functionality. For such purpose a kfree_rcu_batch_init() is renamed to a kvfree_rcu_init() and it is invoked from the main.c right after rcu_init() is done. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 1 + init/main.c | 1 + kernel/rcu/tree.c | 3 +-- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/rcu') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..acb0095b4dbe 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,6 +118,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) /* Internal to kernel */ void rcu_init(void); +void __init kvfree_rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); diff --git a/init/main.c b/init/main.c index 00fac1170294..893cb77aef22 100644 --- a/init/main.c +++ b/init/main.c @@ -992,6 +992,7 @@ void start_kernel(void) workqueue_init_early(); rcu_init(); + kvfree_rcu_init(); /* Trace events are available after this */ trace_init(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..e69b867de8ef 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -5648,7 +5648,7 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; -static void __init kfree_rcu_batch_init(void) +void __init kvfree_rcu_init(void) { int cpu; int i, j; @@ -5703,7 +5703,6 @@ void __init rcu_init(void) rcu_early_boot_tests(); - kfree_rcu_batch_init(); rcu_bootup_announce(); sanitize_kthread_prio(); rcu_init_geometry(); -- cgit v1.2.3 From d824ed707ba257e508d83126cf1127ce2fd98139 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:05 +0100 Subject: rcu/kvfree: Move some functions under CONFIG_TINY_RCU Currently when a tiny RCU is enabled, the tree.c file is not compiled, thus duplicating function names do not conflict with each other. Because of moving of kvfree_rcu() functionality to the SLAB, we have to reorder some functions and place them together under CONFIG_TINY_RCU macro definition. Therefore, those functions name will not conflict when a kernel is compiled for CONFIG_TINY_RCU flavor. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 43 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e69b867de8ef..b3853ae6e869 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3653,16 +3653,6 @@ static void kfree_rcu_monitor(struct work_struct *work) schedule_delayed_monitor_work(krcp); } -static enum hrtimer_restart -schedule_page_work_fn(struct hrtimer *t) -{ - struct kfree_rcu_cpu *krcp = - container_of(t, struct kfree_rcu_cpu, hrtimer); - - queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); - return HRTIMER_NORESTART; -} - static void fill_page_cache_func(struct work_struct *work) { struct kvfree_rcu_bulk_data *bnode; @@ -3698,27 +3688,6 @@ static void fill_page_cache_func(struct work_struct *work) atomic_set(&krcp->backoff_page_cache_fill, 0); } -static void -run_page_cache_worker(struct kfree_rcu_cpu *krcp) -{ - // If cache disabled, bail out. - if (!rcu_min_cached_objs) - return; - - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !atomic_xchg(&krcp->work_in_progress, 1)) { - if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_unbound_wq, - &krcp->page_cache_work, - msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); - } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); - } - } -} - // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() // state specified by flags. If can_alloc is true, the caller must // be schedulable and not be holding any locks or mutexes that might be @@ -3779,6 +3748,51 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } +#if !defined(CONFIG_TINY_RCU) + +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); + return HRTIMER_NORESTART; +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + if (atomic_read(&krcp->backoff_page_cache_fill)) { + queue_delayed_work(system_unbound_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } + } +} + +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); + } +} + /* * Queue a request for lazy invocation of the appropriate free routine * after a grace period. Please note that three paths are maintained, @@ -3944,6 +3958,8 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); +#endif /* #if !defined(CONFIG_TINY_RCU) */ + static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -3985,18 +4001,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return freed == 0 ? SHRINK_STOP : freed; } -void __init kfree_rcu_scheduler_running(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - } -} - /* * During early boot, any blocking grace-period wait automatically * implies a grace period. -- cgit v1.2.3 From ba5cac52d0446f3d606a0c74a6856bcef88a6331 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:06 +0100 Subject: rcu/kvfree: Adjust names passed into trace functions Currently trace functions are supplied with "rcu_state.name" member which is located in the structure. The problem is that the "rcu_state" structure variable is local and can not be accessed from another place. To address this, this preparation patch passes "slab" string as a first argument. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3853ae6e869..6ab21655c248 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3379,14 +3379,14 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, rcu_lock_acquire(&rcu_callback_map); if (idx == 0) { // kmalloc() / kfree(). trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, + "slab", bnode->nr_records, bnode->records); kfree_bulk(bnode->nr_records, bnode->records); } else { // vmalloc() / vfree(). for (i = 0; i < bnode->nr_records; i++) { trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); + "slab", bnode->records[i], 0); vfree(bnode->records[i]); } @@ -3417,7 +3417,7 @@ kvfree_rcu_list(struct rcu_head *head) next = head->next; debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback("slab", head, offset); if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) kvfree(ptr); -- cgit v1.2.3 From c18bcd85cea7f71ff956056347df87e559c3ad0f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:07 +0100 Subject: rcu/kvfree: Adjust a shrinker name Rename "rcu-kfree" to "slab-kvfree-rcu" since it goes to the slab_common.c file soon. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ab21655c248..b7ec998f360e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -5689,7 +5689,7 @@ void __init kvfree_rcu_init(void) krcp->initialized = true; } - kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); + kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu"); if (!kfree_rcu_shrinker) { pr_err("Failed to allocate kfree_rcu() shrinker!\n"); return; -- cgit v1.2.3 From bbe658d6580251d1832d408fa8a71ec254dc4416 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:08 +0100 Subject: mm/slab: Move kvfree_rcu() into SLAB Move kvfree_rcu() functionality to the slab_common.c file. The reason to have kvfree_rcu() functionality as part of SLAB is that there is a clear trend and need of closer integration. One of the recent example is creating a barrier function for SLAB caches. Another reason is to prevent of having several implementations of RCU machinery for reclaiming objects after a GP. As future steps, it can be more integrated(easier) with SLAB internals. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 1 - include/linux/slab.h | 1 + kernel/rcu/tree.c | 879 ---------------------------------------------- mm/slab_common.c | 880 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 881 insertions(+), 880 deletions(-) (limited to 'kernel/rcu') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index acb0095b4dbe..48e5c03df1dd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,7 +118,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) /* Internal to kernel */ void rcu_init(void); -void __init kvfree_rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); diff --git a/include/linux/slab.h b/include/linux/slab.h index 10a971c2bde3..09eedaecf120 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1099,5 +1099,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s); size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); +void __init kvfree_rcu_init(void); #endif /* _LINUX_SLAB_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7ec998f360e..6af042cde972 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -186,26 +186,6 @@ static int rcu_unlock_delay; module_param(rcu_unlock_delay, int, 0444); #endif -/* - * This rcu parameter is runtime-read-only. It reflects - * a minimum allowed number of objects which can be cached - * per-CPU. Object size is equal to one page. This value - * can be changed at boot time. - */ -static int rcu_min_cached_objs = 5; -module_param(rcu_min_cached_objs, int, 0444); - -// A page shrinker can ask for pages to be freed to make them -// available for other parts of the system. This usually happens -// under low memory conditions, and in that case we should also -// defer page-cache filling for a short time period. -// -// The default value is 5 seconds, which is long enough to reduce -// interference with the shrinker while it asks other systems to -// drain their caches. -static int rcu_delay_page_cache_fill_msec = 5000; -module_param(rcu_delay_page_cache_fill_msec, int, 0444); - /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -3191,816 +3171,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); -/* Maximum number of jiffies to wait before draining a batch. */ -#define KFREE_DRAIN_JIFFIES (5 * HZ) -#define KFREE_N_BATCHES 2 -#define FREE_N_CHANNELS 2 - -/** - * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers - * @list: List node. All blocks are linked between each other - * @gp_snap: Snapshot of RCU state for objects placed to this bulk - * @nr_records: Number of active pointers in the array - * @records: Array of the kvfree_rcu() pointers - */ -struct kvfree_rcu_bulk_data { - struct list_head list; - struct rcu_gp_oldstate gp_snap; - unsigned long nr_records; - void *records[] __counted_by(nr_records); -}; - -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kvfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KVFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) - -/** - * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period - * @head_free: List of kfree_rcu() objects waiting for a grace period - * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. - * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period - * @krcp: Pointer to @kfree_rcu_cpu structure - */ - -struct kfree_rcu_cpu_work { - struct rcu_work rcu_work; - struct rcu_head *head_free; - struct rcu_gp_oldstate head_free_gp_snap; - struct list_head bulk_head_free[FREE_N_CHANNELS]; - struct kfree_rcu_cpu *krcp; -}; - -/** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period - * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" - * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period - * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period - * @lock: Synchronize access to this structure - * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @initialized: The @rcu_work fields have been initialized - * @head_count: Number of objects in rcu_head singular list - * @bulk_count: Number of objects in bulk-list - * @bkvcache: - * A simple cache list that contains objects for reuse purpose. - * In order to save some per-cpu space the list is singular. - * Even though it is lockless an access has to be protected by the - * per-cpu lock. - * @page_cache_work: A work to refill the cache when it is empty - * @backoff_page_cache_fill: Delay cache refills - * @work_in_progress: Indicates that page_cache_work is running - * @hrtimer: A hrtimer for scheduling a page_cache_work - * @nr_bkv_objs: number of allocated objects at @bkvcache. - * - * This is a per-CPU structure. The reason that it is not included in - * the rcu_data structure is to permit this code to be extracted from - * the RCU files. Such extraction could allow further optimization of - * the interactions with the slab allocators. - */ -struct kfree_rcu_cpu { - // Objects queued on a linked list - // through their rcu_head structures. - struct rcu_head *head; - unsigned long head_gp_snap; - atomic_t head_count; - - // Objects queued on a bulk-list. - struct list_head bulk_head[FREE_N_CHANNELS]; - atomic_t bulk_count[FREE_N_CHANNELS]; - - struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - raw_spinlock_t lock; - struct delayed_work monitor_work; - bool initialized; - - struct delayed_work page_cache_work; - atomic_t backoff_page_cache_fill; - atomic_t work_in_progress; - struct hrtimer hrtimer; - - struct llist_head bkvcache; - int nr_bkv_objs; -}; - -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), -}; - -static __always_inline void -debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - int i; - - for (i = 0; i < bhead->nr_records; i++) - debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); -#endif -} - -static inline struct kfree_rcu_cpu * -krc_this_cpu_lock(unsigned long *flags) -{ - struct kfree_rcu_cpu *krcp; - - local_irq_save(*flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - raw_spin_lock(&krcp->lock); - - return krcp; -} - -static inline void -krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static inline struct kvfree_rcu_bulk_data * -get_cached_bnode(struct kfree_rcu_cpu *krcp) -{ - if (!krcp->nr_bkv_objs) - return NULL; - - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); - return (struct kvfree_rcu_bulk_data *) - llist_del_first(&krcp->bkvcache); -} - -static inline bool -put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode) -{ - // Check the limit. - if (krcp->nr_bkv_objs >= rcu_min_cached_objs) - return false; - - llist_add((struct llist_node *) bnode, &krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); - return true; -} - -static int -drain_page_cache(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - struct llist_node *page_list, *pos, *n; - int freed = 0; - - if (!rcu_min_cached_objs) - return 0; - - raw_spin_lock_irqsave(&krcp->lock, flags); - page_list = llist_del_all(&krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, 0); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - llist_for_each_safe(pos, n, page_list) { - free_page((unsigned long)pos); - freed++; - } - - return freed; -} - -static void -kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode, int idx) -{ - unsigned long flags; - int i; - - if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { - debug_rcu_bhead_unqueue(bnode); - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - "slab", bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - "slab", bnode->records[i], 0); - - vfree(bnode->records[i]); - } - } - rcu_lock_release(&rcu_callback_map); - } - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bnode)) - bnode = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bnode) - free_page((unsigned long) bnode); - - cond_resched_tasks_rcu_qs(); -} - -static void -kvfree_rcu_list(struct rcu_head *head) -{ - struct rcu_head *next; - - for (; head; head = next) { - void *ptr = (void *) head->func; - unsigned long offset = (void *) head - ptr; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback("slab", head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } -} - -/* - * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bulk_head_free or ->head_free. - */ -static void kfree_rcu_work(struct work_struct *work) -{ - unsigned long flags; - struct kvfree_rcu_bulk_data *bnode, *n; - struct list_head bulk_head[FREE_N_CHANNELS]; - struct rcu_head *head; - struct kfree_rcu_cpu *krcp; - struct kfree_rcu_cpu_work *krwp; - struct rcu_gp_oldstate head_gp_snap; - int i; - - krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); - krcp = krwp->krcp; - - raw_spin_lock_irqsave(&krcp->lock, flags); - // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) - list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); - - // Channel 3. - head = krwp->head_free; - krwp->head_free = NULL; - head_gp_snap = krwp->head_free_gp_snap; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - // Handle the first two channels. - for (i = 0; i < FREE_N_CHANNELS; i++) { - // Start from the tail page, so a GP is likely passed for it. - list_for_each_entry_safe(bnode, n, &bulk_head[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - /* - * This is used when the "bulk" path can not be used for the - * double-argument of kvfree_rcu(). This happens when the - * page-cache is empty, which means that objects are instead - * queued on a linked list through their rcu_head structures. - * This list is named "Channel 3". - */ - if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) - kvfree_rcu_list(head); -} - -static bool -need_offload_krc(struct kfree_rcu_cpu *krcp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krcp->bulk_head[i])) - return true; - - return !!READ_ONCE(krcp->head); -} - -static bool -need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krwp->bulk_head_free[i])) - return true; - - return !!krwp->head_free; -} - -static int krc_count(struct kfree_rcu_cpu *krcp) -{ - int sum = atomic_read(&krcp->head_count); - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - sum += atomic_read(&krcp->bulk_count[i]); - - return sum; -} - -static void -__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - long delay, delay_left; - - delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; - if (delayed_work_pending(&krcp->monitor_work)) { - delay_left = krcp->monitor_work.timer.expires - jiffies; - if (delay < delay_left) - mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); - return; - } - queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); -} - -static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&krcp->lock, flags); - __schedule_delayed_monitor_work(krcp); - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static void -kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) -{ - struct list_head bulk_ready[FREE_N_CHANNELS]; - struct kvfree_rcu_bulk_data *bnode, *n; - struct rcu_head *head_ready = NULL; - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&krcp->lock, flags); - for (i = 0; i < FREE_N_CHANNELS; i++) { - INIT_LIST_HEAD(&bulk_ready[i]); - - list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) - break; - - atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); - list_move(&bnode->list, &bulk_ready[i]); - } - } - - if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { - head_ready = krcp->head; - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - for (i = 0; i < FREE_N_CHANNELS; i++) { - list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - if (head_ready) - kvfree_rcu_list(head_ready); -} - -/* - * Return: %true if a work is queued, %false otherwise. - */ -static bool -kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - bool queued = false; - int i, j; - - raw_spin_lock_irqsave(&krcp->lock, flags); - - // Attempt to start a new batch. - for (i = 0; i < KFREE_N_BATCHES; i++) { - struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - - // Try to detach bulk_head or head and attach it, only when - // all channels are free. Any channel is not free means at krwp - // there is on-going rcu work to handle krwp's free business. - if (need_wait_for_krwp_work(krwp)) - continue; - - // kvfree_rcu_drain_ready() might handle this krcp, if so give up. - if (need_offload_krc(krcp)) { - // Channel 1 corresponds to the SLAB-pointer bulk path. - // Channel 2 corresponds to vmalloc-pointer bulk path. - for (j = 0; j < FREE_N_CHANNELS; j++) { - if (list_empty(&krwp->bulk_head_free[j])) { - atomic_set(&krcp->bulk_count[j], 0); - list_replace_init(&krcp->bulk_head[j], - &krwp->bulk_head_free[j]); - } - } - - // Channel 3 corresponds to both SLAB and vmalloc - // objects queued on the linked list. - if (!krwp->head_free) { - krwp->head_free = krcp->head; - get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - - // One work is per one batch, so there are three - // "free channels", the batch can handle. Break - // the loop since it is done with this CPU thus - // queuing an RCU work is _always_ success here. - queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); - WARN_ON_ONCE(!queued); - break; - } - } - - raw_spin_unlock_irqrestore(&krcp->lock, flags); - return queued; -} - -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. - */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); - - // Drain ready for reclaim. - kvfree_rcu_drain_ready(krcp); - - // Queue a batch for a rest. - kvfree_rcu_queue_batch(krcp); - - // If there is nothing to detach, it means that our job is - // successfully done here. In case of having at least one - // of the channels that is still busy we should rearm the - // work to repeat an attempt. Because previous batches are - // still in progress. - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); -} - -static void fill_page_cache_func(struct work_struct *work) -{ - struct kvfree_rcu_bulk_data *bnode; - struct kfree_rcu_cpu *krcp = - container_of(work, struct kfree_rcu_cpu, - page_cache_work.work); - unsigned long flags; - int nr_pages; - bool pushed; - int i; - - nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? - 1 : rcu_min_cached_objs; - - for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - - if (!bnode) - break; - - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (!pushed) { - free_page((unsigned long) bnode); - break; - } - } - - atomic_set(&krcp->work_in_progress, 0); - atomic_set(&krcp->backoff_page_cache_fill, 0); -} - -// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() -// state specified by flags. If can_alloc is true, the caller must -// be schedulable and not be holding any locks or mutexes that might be -// acquired by the memory allocator or anything that it might invoke. -// Returns true if ptr was successfully recorded, else the caller must -// use a fallback. -static inline bool -add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, - unsigned long *flags, void *ptr, bool can_alloc) -{ - struct kvfree_rcu_bulk_data *bnode; - int idx; - - *krcp = krc_this_cpu_lock(flags); - if (unlikely(!(*krcp)->initialized)) - return false; - - idx = !!is_vmalloc_addr(ptr); - bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], - struct kvfree_rcu_bulk_data, list); - - /* Check if a new block is required. */ - if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(*krcp); - if (!bnode && can_alloc) { - krc_this_cpu_unlock(*krcp, *flags); - - // __GFP_NORETRY - allows a light-weight direct reclaim - // what is OK from minimizing of fallback hitting point of - // view. Apart of that it forbids any OOM invoking what is - // also beneficial since we are about to release memory soon. - // - // __GFP_NOMEMALLOC - prevents from consuming of all the - // memory reserves. Please note we have a fallback path. - // - // __GFP_NOWARN - it is supposed that an allocation can - // be failed under low memory or high memory pressure - // scenarios. - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - raw_spin_lock_irqsave(&(*krcp)->lock, *flags); - } - - if (!bnode) - return false; - - // Initialize the new block and attach it. - bnode->nr_records = 0; - list_add(&bnode->list, &(*krcp)->bulk_head[idx]); - } - - // Finally insert and update the GP for this page. - bnode->nr_records++; - bnode->records[bnode->nr_records - 1] = ptr; - get_state_synchronize_rcu_full(&bnode->gp_snap); - atomic_inc(&(*krcp)->bulk_count[idx]); - - return true; -} - -#if !defined(CONFIG_TINY_RCU) - -static enum hrtimer_restart -schedule_page_work_fn(struct hrtimer *t) -{ - struct kfree_rcu_cpu *krcp = - container_of(t, struct kfree_rcu_cpu, hrtimer); - - queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); - return HRTIMER_NORESTART; -} - -static void -run_page_cache_worker(struct kfree_rcu_cpu *krcp) -{ - // If cache disabled, bail out. - if (!rcu_min_cached_objs) - return; - - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !atomic_xchg(&krcp->work_in_progress, 1)) { - if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_unbound_wq, - &krcp->page_cache_work, - msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); - } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); - } - } -} - -void __init kfree_rcu_scheduler_running(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - } -} - -/* - * Queue a request for lazy invocation of the appropriate free routine - * after a grace period. Please note that three paths are maintained, - * two for the common case using arrays of pointers and a third one that - * is used only when the main paths cannot be used, for example, due to - * memory pressure. - * - * Each kvfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will - * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. - */ -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - unsigned long flags; - struct kfree_rcu_cpu *krcp; - bool success; - - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ - if (!head) - might_sleep(); - - // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(ptr)) { - // Probable double kfree_rcu(), just leak. - WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", - __func__, head); - - // Mark as success and leave. - return; - } - - kasan_record_aux_stack_noalloc(ptr); - success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); - if (!success) { - run_page_cache_worker(krcp); - - if (head == NULL) - // Inline if kvfree_rcu(one_arg) call. - goto unlock_return; - - head->func = ptr; - head->next = krcp->head; - WRITE_ONCE(krcp->head, head); - atomic_inc(&krcp->head_count); - - // Take a snapshot for this krcp. - krcp->head_gp_snap = get_state_synchronize_rcu(); - success = true; - } - - /* - * The kvfree_rcu() caller considers the pointer freed at this point - * and likely removes any references to it. Since the actual slab - * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore - * this object (no scanning or false positives reporting). - */ - kmemleak_ignore(ptr); - - // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - __schedule_delayed_monitor_work(krcp); - -unlock_return: - krc_this_cpu_unlock(krcp, flags); - - /* - * Inline kvfree() after synchronize_rcu(). We can do - * it from might_sleep() context only, so the current - * CPU can pass the QS state. - */ - if (!success) { - debug_rcu_head_unqueue((struct rcu_head *) ptr); - synchronize_rcu(); - kvfree(ptr); - } -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); - -/** - * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. - * - * Note that a single argument of kvfree_rcu() call has a slow path that - * triggers synchronize_rcu() following by freeing a pointer. It is done - * before the return from the function. Therefore for any single-argument - * call that will result in a kfree() to a cache that is to be destroyed - * during module exit, it is developer's responsibility to ensure that all - * such calls have returned before the call to kmem_cache_destroy(). - */ -void kvfree_rcu_barrier(void) -{ - struct kfree_rcu_cpu_work *krwp; - struct kfree_rcu_cpu *krcp; - bool queued; - int i, cpu; - - /* - * Firstly we detach objects and queue them over an RCU-batch - * for all CPUs. Finally queued works are flushed for each CPU. - * - * Please note. If there are outstanding batches for a particular - * CPU, those have to be finished first following by queuing a new. - */ - for_each_possible_cpu(cpu) { - krcp = per_cpu_ptr(&krc, cpu); - - /* - * Check if this CPU has any objects which have been queued for a - * new GP completion. If not(means nothing to detach), we are done - * with it. If any batch is pending/running for this "krcp", below - * per-cpu flush_rcu_work() waits its completion(see last step). - */ - if (!need_offload_krc(krcp)) - continue; - - while (1) { - /* - * If we are not able to queue a new RCU work it means: - * - batches for this CPU are still in flight which should - * be flushed first and then repeat; - * - no objects to detach, because of concurrency. - */ - queued = kvfree_rcu_queue_batch(krcp); - - /* - * Bail out, if there is no need to offload this "krcp" - * anymore. As noted earlier it can run concurrently. - */ - if (queued || !need_offload_krc(krcp)) - break; - - /* There are ongoing batches. */ - for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); - flush_rcu_work(&krwp->rcu_work); - } - } - } - - /* - * Now we guarantee that all objects are flushed. - */ - for_each_possible_cpu(cpu) { - krcp = per_cpu_ptr(&krc, cpu); - - /* - * A monitor work can drain ready to reclaim objects - * directly. Wait its completion if running or pending. - */ - cancel_delayed_work_sync(&krcp->monitor_work); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); - flush_rcu_work(&krwp->rcu_work); - } - } -} -EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); - -#endif /* #if !defined(CONFIG_TINY_RCU) */ - -static unsigned long -kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu; - unsigned long count = 0; - - /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count += krc_count(krcp); - count += READ_ONCE(krcp->nr_bkv_objs); - atomic_set(&krcp->backoff_page_cache_fill, 1); - } - - return count == 0 ? SHRINK_EMPTY : count; -} - -static unsigned long -kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu, freed = 0; - - for_each_possible_cpu(cpu) { - int count; - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count = krc_count(krcp); - count += drain_page_cache(krcp); - kfree_rcu_monitor(&krcp->monitor_work.work); - - sc->nr_to_scan -= count; - freed += count; - - if (sc->nr_to_scan <= 0) - break; - } - - return freed == 0 ? SHRINK_STOP : freed; -} - /* * During early boot, any blocking grace-period wait automatically * implies a grace period. @@ -5652,55 +4822,6 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; -void __init kvfree_rcu_init(void) -{ - int cpu; - int i, j; - struct shrinker *kfree_rcu_shrinker; - - /* Clamp it to [0:100] seconds interval. */ - if (rcu_delay_page_cache_fill_msec < 0 || - rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { - - rcu_delay_page_cache_fill_msec = - clamp(rcu_delay_page_cache_fill_msec, 0, - (int) (100 * MSEC_PER_SEC)); - - pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", - rcu_delay_page_cache_fill_msec); - } - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); - krcp->krw_arr[i].krcp = krcp; - - for (j = 0; j < FREE_N_CHANNELS; j++) - INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); - } - - for (i = 0; i < FREE_N_CHANNELS; i++) - INIT_LIST_HEAD(&krcp->bulk_head[i]); - - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); - krcp->initialized = true; - } - - kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu"); - if (!kfree_rcu_shrinker) { - pr_err("Failed to allocate kfree_rcu() shrinker!\n"); - return; - } - - kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; - kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; - - shrinker_register(kfree_rcu_shrinker); -} - void __init rcu_init(void) { int cpu = smp_processor_id(); diff --git a/mm/slab_common.c b/mm/slab_common.c index a29457bef626..69f2d19010de 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -28,7 +28,9 @@ #include #include #include +#include +#include "../kernel/rcu/rcu.h" #include "internal.h" #include "slab.h" @@ -1282,3 +1284,881 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 5; +module_param(rcu_min_cached_objs, int, 0444); + +// A page shrinker can ask for pages to be freed to make them +// available for other parts of the system. This usually happens +// under low memory conditions, and in that case we should also +// defer page-cache filling for a short time period. +// +// The default value is 5 seconds, which is long enough to reduce +// interference with the shrinker while it asks other systems to +// drain their caches. +static int rcu_delay_page_cache_fill_msec = 5000; +module_param(rcu_delay_page_cache_fill_msec, int, 0444); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (5 * HZ) +#define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 + +/** + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk + * @nr_records: Number of active pointers in the array + * @records: Array of the kvfree_rcu() pointers + */ +struct kvfree_rcu_bulk_data { + struct list_head list; + struct rcu_gp_oldstate gp_snap; + unsigned long nr_records; + void *records[] __counted_by(nr_records); +}; + +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kvfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) + +/** + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. + * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; + struct list_head bulk_head_free[FREE_N_CHANNELS]; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @initialized: The @rcu_work fields have been initialized + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @page_cache_work: A work to refill the cache when it is empty + * @backoff_page_cache_fill: Delay cache refills + * @work_in_progress: Indicates that page_cache_work is running + * @hrtimer: A hrtimer for scheduling a page_cache_work + * @nr_bkv_objs: number of allocated objects at @bkvcache. + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + // Objects queued on a linked list + // through their rcu_head structures. + struct rcu_head *head; + unsigned long head_gp_snap; + atomic_t head_count; + + // Objects queued on a bulk-list. + struct list_head bulk_head[FREE_N_CHANNELS]; + atomic_t bulk_count[FREE_N_CHANNELS]; + + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; + raw_spinlock_t lock; + struct delayed_work monitor_work; + bool initialized; + + struct delayed_work page_cache_work; + atomic_t backoff_page_cache_fill; + atomic_t work_in_progress; + struct hrtimer hrtimer; + + struct llist_head bkvcache; + int nr_bkv_objs; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; + +static __always_inline void +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); +#endif +} + +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static inline struct kvfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); + return (struct kvfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); + return true; +} + +static int +drain_page_cache(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + struct llist_node *page_list, *pos, *n; + int freed = 0; + + if (!rcu_min_cached_objs) + return 0; + + raw_spin_lock_irqsave(&krcp->lock, flags); + page_list = llist_del_all(&krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, 0); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + llist_for_each_safe(pos, n, page_list) { + free_page((unsigned long)pos); + freed++; + } + + return freed; +} + +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode, int idx) +{ + unsigned long flags; + int i; + + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + "slab", bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + "slab", bnode->records[i], 0); + + vfree(bnode->records[i]); + } + } + rcu_lock_release(&rcu_callback_map); + } + + raw_spin_lock_irqsave(&krcp->lock, flags); + if (put_cached_bnode(krcp, bnode)) + bnode = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (bnode) + free_page((unsigned long) bnode); + + cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ + struct rcu_head *next; + + for (; head; head = next) { + void *ptr = (void *) head->func; + unsigned long offset = (void *) head - ptr; + + next = head->next; + debug_rcu_head_unqueue((struct rcu_head *)ptr); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kvfree_callback("slab", head, offset); + + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) + kvfree(ptr); + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->bulk_head_free or ->head_free. + */ +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct kvfree_rcu_bulk_data *bnode, *n; + struct list_head bulk_head[FREE_N_CHANNELS]; + struct rcu_head *head; + struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; + int i; + + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; + + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) + list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); + + // Channel 3. + head = krwp->head_free; + krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + // Handle the first two channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + // Start from the tail page, so a GP is likely passed for it. + list_for_each_entry_safe(bnode, n, &bulk_head[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + /* + * This is used when the "bulk" path can not be used for the + * double-argument of kvfree_rcu(). This happens when the + * page-cache is empty, which means that objects are instead + * queued on a linked list through their rcu_head structures. + * This list is named "Channel 3". + */ + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); +} + +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krcp->bulk_head[i])) + return true; + + return !!READ_ONCE(krcp->head); +} + +static bool +need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krwp->bulk_head_free[i])) + return true; + + return !!krwp->head_free; +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ + int sum = atomic_read(&krcp->head_count); + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + sum += atomic_read(&krcp->bulk_count[i]); + + return sum; +} + +static void +__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); +} + +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&krcp->lock, flags); + __schedule_delayed_monitor_work(krcp); + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ + struct list_head bulk_ready[FREE_N_CHANNELS]; + struct kvfree_rcu_bulk_data *bnode, *n; + struct rcu_head *head_ready = NULL; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&krcp->lock, flags); + for (i = 0; i < FREE_N_CHANNELS; i++) { + INIT_LIST_HEAD(&bulk_ready[i]); + + list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) + break; + + atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); + list_move(&bnode->list, &bulk_ready[i]); + } + } + + if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { + head_ready = krcp->head; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + for (i = 0; i < FREE_N_CHANNELS; i++) { + list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + if (head_ready) + kvfree_rcu_list(head_ready); +} + +/* + * Return: %true if a work is queued, %false otherwise. + */ +static bool +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + bool queued = false; + int i, j; + + raw_spin_lock_irqsave(&krcp->lock, flags); + + // Attempt to start a new batch. + for (i = 0; i < KFREE_N_BATCHES; i++) { + struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); + + // Try to detach bulk_head or head and attach it, only when + // all channels are free. Any channel is not free means at krwp + // there is on-going rcu work to handle krwp's free business. + if (need_wait_for_krwp_work(krwp)) + continue; + + // kvfree_rcu_drain_ready() might handle this krcp, if so give up. + if (need_offload_krc(krcp)) { + // Channel 1 corresponds to the SLAB-pointer bulk path. + // Channel 2 corresponds to vmalloc-pointer bulk path. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (list_empty(&krwp->bulk_head_free[j])) { + atomic_set(&krcp->bulk_count[j], 0); + list_replace_init(&krcp->bulk_head[j], + &krwp->bulk_head_free[j]); + } + } + + // Channel 3 corresponds to both SLAB and vmalloc + // objects queued on the linked list. + if (!krwp->head_free) { + krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + + // One work is per one batch, so there are three + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. + queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; + } + } + + raw_spin_unlock_irqrestore(&krcp->lock, flags); + return queued; +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + + // Queue a batch for a rest. + kvfree_rcu_queue_batch(krcp); + + // If there is nothing to detach, it means that our job is + // successfully done here. In case of having at least one + // of the channels that is still busy we should rearm the + // work to repeat an attempt. Because previous batches are + // still in progress. + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); +} + +static void fill_page_cache_func(struct work_struct *work) +{ + struct kvfree_rcu_bulk_data *bnode; + struct kfree_rcu_cpu *krcp = + container_of(work, struct kfree_rcu_cpu, + page_cache_work.work); + unsigned long flags; + int nr_pages; + bool pushed; + int i; + + nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? + 1 : rcu_min_cached_objs; + + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!bnode) + break; + + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; + } + } + + atomic_set(&krcp->work_in_progress, 0); + atomic_set(&krcp->backoff_page_cache_fill, 0); +} + +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() +// state specified by flags. If can_alloc is true, the caller must +// be schedulable and not be holding any locks or mutexes that might be +// acquired by the memory allocator or anything that it might invoke. +// Returns true if ptr was successfully recorded, else the caller must +// use a fallback. +static inline bool +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, + unsigned long *flags, void *ptr, bool can_alloc) +{ + struct kvfree_rcu_bulk_data *bnode; + int idx; + + *krcp = krc_this_cpu_lock(flags); + if (unlikely(!(*krcp)->initialized)) + return false; + + idx = !!is_vmalloc_addr(ptr); + bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], + struct kvfree_rcu_bulk_data, list); + + /* Check if a new block is required. */ + if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(*krcp); + if (!bnode && can_alloc) { + krc_this_cpu_unlock(*krcp, *flags); + + // __GFP_NORETRY - allows a light-weight direct reclaim + // what is OK from minimizing of fallback hitting point of + // view. Apart of that it forbids any OOM invoking what is + // also beneficial since we are about to release memory soon. + // + // __GFP_NOMEMALLOC - prevents from consuming of all the + // memory reserves. Please note we have a fallback path. + // + // __GFP_NOWARN - it is supposed that an allocation can + // be failed under low memory or high memory pressure + // scenarios. + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); + } + + if (!bnode) + return false; + + // Initialize the new block and attach it. + bnode->nr_records = 0; + list_add(&bnode->list, &(*krcp)->bulk_head[idx]); + } + + // Finally insert and update the GP for this page. + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; + get_state_synchronize_rcu_full(&bnode->gp_snap); + atomic_inc(&(*krcp)->bulk_count[idx]); + + return true; +} + +#if !defined(CONFIG_TINY_RCU) + +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); + return HRTIMER_NORESTART; +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + if (atomic_read(&krcp->backoff_page_cache_fill)) { + queue_delayed_work(system_unbound_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } + } +} + +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); + } +} + +/* + * Queue a request for lazy invocation of the appropriate free routine + * after a grace period. Please note that three paths are maintained, + * two for the common case using arrays of pointers and a third one that + * is used only when the main paths cannot be used, for example, due to + * memory pressure. + * + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will + * be free'd in workqueue context. This allows us to: batch requests together to + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + bool success; + + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + if (!head) + might_sleep(); + + // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(ptr)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + + // Mark as success and leave. + return; + } + + kasan_record_aux_stack_noalloc(ptr); + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); + if (!success) { + run_page_cache_worker(krcp); + + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + + head->func = ptr; + head->next = krcp->head; + WRITE_ONCE(krcp->head, head); + atomic_inc(&krcp->head_count); + + // Take a snapshot for this krcp. + krcp->head_gp_snap = get_state_synchronize_rcu(); + success = true; + } + + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) + __schedule_delayed_monitor_work(krcp); + +unlock_return: + krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. + * + * Please note. If there are outstanding batches for a particular + * CPU, those have to be finished first following by queuing a new. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * Check if this CPU has any objects which have been queued for a + * new GP completion. If not(means nothing to detach), we are done + * with it. If any batch is pending/running for this "krcp", below + * per-cpu flush_rcu_work() waits its completion(see last step). + */ + if (!need_offload_krc(krcp)) + continue; + + while (1) { + /* + * If we are not able to queue a new RCU work it means: + * - batches for this CPU are still in flight which should + * be flushed first and then repeat; + * - no objects to detach, because of concurrency. + */ + queued = kvfree_rcu_queue_batch(krcp); + + /* + * Bail out, if there is no need to offload this "krcp" + * anymore. As noted earlier it can run concurrently. + */ + if (queued || !need_offload_krc(krcp)) + break; + + /* There are ongoing batches. */ + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } + } + + /* + * Now we guarantee that all objects are flushed. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * A monitor work can drain ready to reclaim objects + * directly. Wait its completion if running or pending. + */ + cancel_delayed_work_sync(&krcp->monitor_work); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + +#endif /* #if !defined(CONFIG_TINY_RCU) */ + +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += krc_count(krcp); + count += READ_ONCE(krcp->nr_bkv_objs); + atomic_set(&krcp->backoff_page_cache_fill, 1); + } + + return count == 0 ? SHRINK_EMPTY : count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + + for_each_possible_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krc_count(krcp); + count += drain_page_cache(krcp); + kfree_rcu_monitor(&krcp->monitor_work.work); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed == 0 ? SHRINK_STOP : freed; +} + +void __init kvfree_rcu_init(void) +{ + int cpu; + int i, j; + struct shrinker *kfree_rcu_shrinker; + + /* Clamp it to [0:100] seconds interval. */ + if (rcu_delay_page_cache_fill_msec < 0 || + rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { + + rcu_delay_page_cache_fill_msec = + clamp(rcu_delay_page_cache_fill_msec, 0, + (int) (100 * MSEC_PER_SEC)); + + pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", + rcu_delay_page_cache_fill_msec); + } + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); + krcp->krw_arr[i].krcp = krcp; + + for (j = 0; j < FREE_N_CHANNELS; j++) + INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); + } + + for (i = 0; i < FREE_N_CHANNELS; i++) + INIT_LIST_HEAD(&krcp->bulk_head[i]); + + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); + krcp->initialized = true; + } + + kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); +} -- cgit v1.2.3 From d40797d6720e861196e848f3615bb09dae5be7ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Nov 2024 16:54:51 +0100 Subject: kasan: make kasan_record_aux_stack_noalloc() the default behaviour kasan_record_aux_stack_noalloc() was introduced to record a stack trace without allocating memory in the process. It has been added to callers which were invoked while a raw_spinlock_t was held. More and more callers were identified and changed over time. Is it a good thing to have this while functions try their best to do a locklessly setup? The only downside of having kasan_record_aux_stack() not allocate any memory is that we end up without a stacktrace if stackdepot runs out of memory and at the same stacktrace was not recorded before To quote Marco Elver from https://lore.kernel.org/all/CANpmjNPmQYJ7pv1N3cuU8cP18u7PP_uoZD8YxwZd4jtbof9nVQ@mail.gmail.com/ | I'd be in favor, it simplifies things. And stack depot should be | able to replenish its pool sufficiently in the "non-aux" cases | i.e. regular allocations. Worst case we fail to record some | aux stacks, but I think that's only really bad if there's a bug | around one of these allocations. In general the probabilities | of this being a regression are extremely small [...] Make the kasan_record_aux_stack_noalloc() behaviour default as kasan_record_aux_stack(). [bigeasy@linutronix.de: dressed the diff as patch] Link: https://lkml.kernel.org/r/20241122155451.Mb2pmeyJ@linutronix.de Fixes: 7cb3007ce2da ("kasan: generic: introduce kasan_record_aux_stack_noalloc()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Reported-by: syzbot+39f85d612b7c20d8db48@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67275485.050a0220.3c8d68.0a37.GAE@google.com Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Waiman Long Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ben Segall Cc: Boqun Feng Cc: Christoph Lameter Cc: David Rientjes Cc: Dietmar Eggemann Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes (Google) Cc: Joonsoo Kim Cc: Josh Triplett Cc: Juri Lelli Cc: Cc: Lai Jiangshan Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Neeraj Upadhyay Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt Cc: syzkaller-bugs@googlegroups.com Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 -- include/linux/task_work.h | 3 --- kernel/irq_work.c | 2 +- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 4 ++-- kernel/sched/core.c | 2 +- kernel/task_work.c | 14 +------------- kernel/workqueue.c | 2 +- mm/kasan/generic.c | 18 ++++++------------ mm/slub.c | 2 +- 10 files changed, 14 insertions(+), 37 deletions(-) (limited to 'kernel/rcu') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..1c1b3d39e7b6 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); -void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ @@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 2964171856e0..0646804860ff 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -19,9 +19,6 @@ enum task_work_notify_mode { TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, - - TWA_FLAGS = 0xff00, - TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 2f4fb336dda1..73f7e1fd4ab4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -147,7 +147,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b3b3ce34df63..4b3f31911465 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -250,7 +250,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); void kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) - kasan_record_aux_stack_noalloc(ptr); + kasan_record_aux_stack(ptr); __kvfree_call_rcu(head, ptr); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..3885aae5f9cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3083,7 +3083,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } head->func = func; head->next = NULL; - kasan_record_aux_stack_noalloc(head); + kasan_record_aux_stack(head); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); lazy = lazy_in && !rcu_async_should_hurry(); @@ -3817,7 +3817,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) return; } - kasan_record_aux_stack_noalloc(ptr); + kasan_record_aux_stack(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..755ae4659b64 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10590,7 +10590,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); + task_work_add(curr, work, TWA_RESUME); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/task_work.c b/kernel/task_work.c index c969f1f26be5..d1efec571a4a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -55,26 +55,14 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; - int flags = notify & TWA_FLAGS; - notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* - * Record the work call stack in order to print it in KASAN - * reports. - * - * Note that stack allocation can fail if TWAF_NO_ALLOC flag - * is set and new page is needed to expand the stack buffer. - */ - if (flags & TWAF_NO_ALLOC) - kasan_record_aux_stack_noalloc(work); - else - kasan_record_aux_stack(work); + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f7d8fc204579..77d8f672e175 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2180,7 +2180,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 8b9e348113b1..d54e89f8c3e7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) +/* + * This function avoids dynamic memory allocations and thus can be called from + * contexts that do not allow allocating memory. + */ +void kasan_record_aux_stack(void *addr) { struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); -} - -void kasan_record_aux_stack(void *addr) -{ - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); -} - -void kasan_record_aux_stack_noalloc(void *addr) -{ - return __kasan_record_aux_stack(addr, 0); + alloc_meta->aux_stack[0] = kasan_save_stack(0, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/slub.c b/mm/slub.c index cef25d9a476a..a8e9b5106f4c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2311,7 +2311,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, * We have to do this manually because the rcu_head is * not located inside the object. */ - kasan_record_aux_stack_noalloc(x); + kasan_record_aux_stack(x); delayed_free->object = x; call_rcu(&delayed_free->head, slab_free_after_rcu_debug); -- cgit v1.2.3