From dbcdcb62b59db2cf6a24113873b90da15c6f0b19 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:26 +0100
Subject: tracing/timers: Enhance timer_start tracepoint

When a timer is started, it is enqueued into a bucket of the timer wheel.
The bucket expiry is the de facto expiry of the timer, but it is not equal
to the timer expiry because of the increasing granularity when the bucket
is in a higher level of the wheel. To be able to figure out in a trace
whether a timer expired in time or not, the bucket expiry time is required
as well.

Add the bucket expiry time to the timer_start tracepoint and thereby
simplify the arguments.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-5-anna-maria@linutronix.de
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 63a8ce7177dd..a81d793a43d0 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -606,7 +606,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
 	__set_bit(idx, base->pending_map);
 	timer_set_idx(timer, idx);
 
-	trace_timer_start(timer, timer->expires, timer->flags);
+	trace_timer_start(timer, bucket_expiry);
 
 	/*
	 * Check whether this is the new first expiring timer. The
--
cgit v1.2.3
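Illustration (not part of the patch above): the bucket_expiry value passed to the
tracepoint is produced by the wheel index calculation, which rounds the timer
expiry up to the granularity of the selected wheel level, so bucket expiry and
timer expiry diverge in the higher levels. A simplified sketch of that rounding,
assuming the usual LVL_CLK_SHIFT of 3; the helper name is made up:

#define LVL_CLK_SHIFT	3
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)

/* Sketch of how the wheel index calculation derives the bucket expiry */
static unsigned long sketch_bucket_expiry(unsigned long expires, unsigned int lvl)
{
	/*
	 * Round up with the level granularity so the timer can never fire
	 * early; the coarser the level, the larger the gap between the
	 * bucket expiry and the requested expiry can become.
	 */
	return ((expires >> LVL_SHIFT(lvl)) + 1) << LVL_SHIFT(lvl);
}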
From b573c73101d8786446535b2ab28cbc8907bda9a9 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:27 +0100
Subject: tracing/timers: Add tracepoint for tracking timer base is_idle flag

When debugging timer code, the timer tracepoints are very important. There
is no tracepoint when the is_idle flag of the timer base changes. Instead
of always adding trace_printk() calls manually, add tracepoints which can
easily be enabled whenever required.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-6-anna-maria@linutronix.de
---
 include/trace/events/timer.h | 20 ++++++++++++++++++++
 kernel/time/timer.c          | 14 +++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 99ada928d445..1ef58a04fc57 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel,
 	TP_ARGS(timer)
 );
 
+TRACE_EVENT(timer_base_idle,
+
+	TP_PROTO(bool is_idle, unsigned int cpu),
+
+	TP_ARGS(is_idle, cpu),
+
+	TP_STRUCT__entry(
+		__field( bool,		is_idle	)
+		__field( unsigned int,	cpu	)
+	),
+
+	TP_fast_assign(
+		__entry->is_idle	= is_idle;
+		__entry->cpu		= cpu;
+	),
+
+	TP_printk("is_idle=%d cpu=%d",
+		  __entry->is_idle, __entry->cpu)
+);
+
 #define decode_clockid(type)				\
 	__print_symbolic(type,				\
 		{ CLOCK_REALTIME,	"CLOCK_REALTIME"	},	\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a81d793a43d0..ed8d6063d9ef 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1950,7 +1950,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-		base->is_idle = false;
+		if (base->is_idle) {
+			base->is_idle = false;
+			trace_timer_base_idle(false, base->cpu);
+		}
 	} else {
 		if (base->timers_pending)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
@@ -1961,8 +1964,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		 * logic is only maintained for the BASE_STD base, deferrable
 		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC && !base->is_idle) {
 			base->is_idle = true;
+			trace_timer_base_idle(true, base->cpu);
+		}
 	}
 	raw_spin_unlock(&base->lock);
 
@@ -1984,7 +1989,10 @@ void timer_clear_idle(void)
 	 * sending the IPI a few instructions smaller for the cost of taking
 	 * the lock in the exit from idle path.
 	 */
-	base->is_idle = false;
+	if (base->is_idle) {
+		base->is_idle = false;
+		trace_timer_base_idle(false, smp_processor_id());
+	}
 }
 
 #endif
--
cgit v1.2.3
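Illustration (not part of the patch): every call site above emits the new
tracepoint only when the is_idle state actually changes. Folded into one place,
the pattern looks roughly like the helper below; the helper name is made up and
the series open-codes it at each call site instead:

/* Hypothetical helper: trace only real is_idle transitions */
static inline void timer_base_set_idle(struct timer_base *base, bool idle)
{
	if (base->is_idle == idle)
		return;

	base->is_idle = idle;
	trace_timer_base_idle(idle, base->cpu);
}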
From d124c3393e798b1fb142ee728d5c8976d11e722d Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:28 +0100
Subject: timers: Do not IPI for deferrable timers

Deferrable timers do not prevent the CPU from going idle and are not taken
into account on the idle path. Sending an IPI to a remote CPU when a new
first deferrable timer is enqueued will wake up the remote CPU, but nothing
will be done regarding the deferrable timers.

Drop the IPI completely when a new first deferrable timer is enqueued.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-7-anna-maria@linutronix.de
---
 kernel/time/timer.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ed8d6063d9ef..91882059bf3d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
 static void
 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 {
-	if (!is_timers_nohz_active())
-		return;
-
 	/*
-	 * TODO: This wants some optimizing similar to the code below, but we
-	 * will do that when we switch from push to pull for deferrable timers.
+	 * Deferrable timers do not prevent the CPU from entering dynticks and
+	 * are not taken into account on the idle/nohz_full path. An IPI when a
+	 * new deferrable timer is enqueued will wake up the remote CPU but
+	 * nothing will be done with the deferrable timer base. Therefore skip
+	 * the remote IPI for deferrable timers completely.
 	 */
-	if (timer->flags & TIMER_DEFERRABLE) {
-		if (tick_nohz_full_cpu(base->cpu))
-			wake_up_nohz_cpu(base->cpu);
+	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
 		return;
-	}
 
 	/*
	 * We might have to IPI the remote CPU if the base is idle and the
--
cgit v1.2.3
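Illustration (not part of the patch): a deferrable timer is an ordinary
timer_list initialized with the TIMER_DEFERRABLE flag, typically used for
non-urgent housekeeping work. Such timers are exactly the ones that no longer
cause a remote wake-up after this change. All names below are made up:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list housekeeping_timer;	/* made-up example */

static void housekeeping_fn(struct timer_list *t)
{
	/* Non-urgent work; it is fine if this runs late, once the CPU is awake anyway */
	mod_timer(t, jiffies + 10 * HZ);
}

static void housekeeping_start(void)
{
	timer_setup(&housekeeping_timer, housekeeping_fn, TIMER_DEFERRABLE);
	mod_timer(&housekeeping_timer, jiffies + 10 * HZ);
}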
From b5e6f59888c7bde3c05f61b3ce06b78a86713fc0 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:29 +0100
Subject: timers: Move store of next event into __next_timer_interrupt()

Both call sites of __next_timer_interrupt() store the return value directly
in base->next_expiry. Move the store into __next_timer_interrupt() and, to
make its purpose clearer, rename the function to next_expiry_recalc().

No functional change.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-8-anna-maria@linutronix.de
---
 kernel/time/timer.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 91882059bf3d..490ff8e66fc2 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1800,8 +1800,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
 /*
  * Search the first expiring timer in the various clock levels. Caller must
  * hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
  */
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void next_expiry_recalc(struct timer_base *base)
 {
 	unsigned long clk, next, adj;
 	unsigned lvl, offset = 0;
@@ -1867,10 +1869,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 		clk += adj;
 	}
 
+	base->next_expiry = next;
 	base->next_expiry_recalc = false;
 	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
-
-	return next;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -1930,7 +1931,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	raw_spin_lock(&base->lock);
 	if (base->next_expiry_recalc)
-		base->next_expiry = __next_timer_interrupt(base);
+		next_expiry_recalc(base);
 	nextevt = base->next_expiry;
 
 	/*
@@ -2021,7 +2022,7 @@ static inline void __run_timers(struct timer_base *base)
 		WARN_ON_ONCE(!levels && !base->next_expiry_recalc &&
 			     base->timers_pending);
 		base->clk++;
-		base->next_expiry = __next_timer_interrupt(base);
+		next_expiry_recalc(base);
 
 		while (levels--)
 			expire_timers(base, heads + levels);
--
cgit v1.2.3

From 8a2c9c7e7848d7f63d38b698209148b5bb4ba7f3 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:30 +0100
Subject: timers: Clarify check in forward_timer_base()

The current check of whether a forward of the timer base is required can
be simplified by using an already existing comparison function which is
easier to read.

The related comment is outdated and was not updated when the check changed
in commit 36cd28a4cdd0 ("timers: Lower base clock forwarding threshold").

Use time_before_eq() for the check and replace the comment by copying the
comment from the same check inside get_next_timer_interrupt(). Move the
still valuable information of the outdated comment to its proper place in
__run_timers().

No functional change.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-9-anna-maria@linutronix.de
---
 kernel/time/timer.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 490ff8e66fc2..f75f932b128e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -944,11 +944,10 @@ static inline void forward_timer_base(struct timer_base *base)
 	unsigned long jnow = READ_ONCE(jiffies);
 
 	/*
-	 * No need to forward if we are close enough below jiffies.
-	 * Also while executing timers, base->clk is 1 offset ahead
-	 * of jiffies to avoid endless requeuing to current jiffies.
+	 * Check whether we can forward the base. We can only do that when
+	 * @basej is past base->clk otherwise we might rewind base->clk.
 	 */
-	if ((long)(jnow - base->clk) < 1)
+	if (time_before_eq(jnow, base->clk))
 		return;
 
 	/*
@@ -2021,6 +2020,10 @@ static inline void __run_timers(struct timer_base *base)
 		 */
 		WARN_ON_ONCE(!levels && !base->next_expiry_recalc &&
 			     base->timers_pending);
+		/*
+		 * While executing timers, base->clk is set 1 offset ahead of
+		 * jiffies to avoid endless requeuing to current jiffies.
+		 */
 		base->clk++;
 		next_expiry_recalc(base);
 
--
cgit v1.2.3
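Illustration (not part of the patch): the new check is the wraparound-safe
comparison macro spelled out; with the definitions from include/linux/jiffies.h
the old and the new condition reduce to the same test for any realistic distance
between jiffies and base->clk:

/*
 * Old: (long)(jnow - base->clk) < 1, i.e. (long)(jnow - base->clk) <= 0
 * New: time_before_eq(jnow, base->clk)
 *      == time_after_eq(base->clk, jnow)
 *      == (long)(base->clk - jnow) >= 0
 *
 * Both are true exactly when jnow is at or before base->clk, including
 * across a jiffies wraparound.
 */
if (time_before_eq(jnow, base->clk))
	return;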
From 1e490484aa3af42d4eeffabf96d6a02be69d586b Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:31 +0100
Subject: timers: Split out forward timer base functionality

Forwarding the timer base is done when the next expiry value is calculated
and when a new timer is enqueued. When the next expiry value is calculated,
the jiffies value is already available and does not need to be read a
second time.

Therefore split out the forward timer base functionality so that it can be
used in both contexts - those where jiffies is already known and those
where it still needs to be read.

No functional change.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-10-anna-maria@linutronix.de
---
 kernel/time/timer.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f75f932b128e..5b02e169ab23 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -939,30 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
 	return get_timer_this_cpu_base(tflags);
 }
 
-static inline void forward_timer_base(struct timer_base *base)
+static inline void __forward_timer_base(struct timer_base *base,
+					unsigned long basej)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
-
 	/*
 	 * Check whether we can forward the base. We can only do that when
 	 * @basej is past base->clk otherwise we might rewind base->clk.
 	 */
-	if (time_before_eq(jnow, base->clk))
+	if (time_before_eq(basej, base->clk))
 		return;
 
 	/*
 	 * If the next expiry value is > jiffies, then we fast forward to
 	 * jiffies otherwise we forward to the next expiry value.
 	 */
-	if (time_after(base->next_expiry, jnow)) {
-		base->clk = jnow;
+	if (time_after(base->next_expiry, basej)) {
+		base->clk = basej;
 	} else {
 		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
 			return;
 		base->clk = base->next_expiry;
 	}
+}
+
+static inline void forward_timer_base(struct timer_base *base)
+{
+	__forward_timer_base(base, READ_ONCE(jiffies));
 }
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
--
cgit v1.2.3

From 7a39a5080ef0e3cf233d92165f6a778f08a08244 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:32 +0100
Subject: timers: Use already existing function for forwarding timer base

There is an already existing function for forwarding the timer base, but
forwarding of the timer base is also implemented directly in
get_next_timer_interrupt().

Remove the code duplication and invoke __forward_timer_base() instead.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-11-anna-maria@linutronix.de
---
 kernel/time/timer.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 5b02e169ab23..1a73d396101b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1939,15 +1939,9 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	/*
 	 * We have a fresh next event. Check whether we can forward the
-	 * base. We can only do that when @basej is past base->clk
-	 * otherwise we might rewind base->clk.
+	 * base.
 	 */
-	if (time_after(basej, base->clk)) {
-		if (time_after(nextevt, basej))
-			base->clk = basej;
-		else if (time_after(nextevt, base->clk))
-			base->clk = nextevt;
-	}
+	__forward_timer_base(base, basej);
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
--
cgit v1.2.3
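For readability, the two helpers as they read after these two patches,
reassembled (with the comments shortened) from the hunks above:

static inline void __forward_timer_base(struct timer_base *base,
					unsigned long basej)
{
	/* Never rewind base->clk: only forward when basej is past it */
	if (time_before_eq(basej, base->clk))
		return;

	/* Fast forward to jiffies, or to the next expiry if that is earlier */
	if (time_after(base->next_expiry, basej)) {
		base->clk = basej;
	} else {
		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
			return;
		base->clk = base->next_expiry;
	}
}

/* Wrapper for call sites that have not read jiffies yet */
static inline void forward_timer_base(struct timer_base *base)
{
	__forward_timer_base(base, READ_ONCE(jiffies));
}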
From bb8caad5083f8fbba70faf41f1d3bab7cf09da6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 Dec 2023 10:26:33 +0100
Subject: timers: Rework idle logic

To improve readability of the code, split the base->is_idle calculation
and the expires calculation into separate parts. While at it, update the
comment about marking the timer base idle.

Thereby the following subtle change happens if the next event is just one
jiffy ahead and the tick was already stopped: originally base->is_idle
remains true in this situation, now base->is_idle turns false. This may
spare an IPI if a timer is enqueued remotely to an idle CPU that is going
to tick on the next jiffy.

Signed-off-by: Thomas Gleixner
Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-12-anna-maria@linutronix.de
---
 kernel/time/timer.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1a73d396101b..cf51655add64 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1924,6 +1924,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool was_idle;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1943,27 +1944,26 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	 */
 	__forward_timer_base(base, basej);
 
-	if (time_before_eq(nextevt, basej)) {
-		expires = basem;
-		if (base->is_idle) {
-			base->is_idle = false;
-			trace_timer_base_idle(false, base->cpu);
-		}
-	} else {
-		if (base->timers_pending)
-			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
-		/*
-		 * If we expect to sleep more than a tick, mark the base idle.
-		 * Also the tick is stopped so any added timer must forward
-		 * the base clk itself to keep granularity small. This idle
-		 * logic is only maintained for the BASE_STD base, deferrable
-		 * timers may still see large granularity skew (by design).
-		 */
-		if ((expires - basem) > TICK_NSEC && !base->is_idle) {
-			base->is_idle = true;
-			trace_timer_base_idle(true, base->cpu);
-		}
+	if (base->timers_pending) {
+		/* If we missed a tick already, force 0 delta */
+		if (time_before(nextevt, basej))
+			nextevt = basej;
+		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 	}
+
+	/*
+	 * Base is idle if the next event is more than a tick away.
+	 *
+	 * If the base is marked idle then any timer add operation must forward
+	 * the base clk itself to keep granularity small. This idle logic is
+	 * only maintained for the BASE_STD base, deferrable timers may still
+	 * see large granularity skew (by design).
+	 */
+	was_idle = base->is_idle;
+	base->is_idle = time_after(nextevt, basej + 1);
+	if (was_idle != base->is_idle)
+		trace_timer_base_idle(base->is_idle, base->cpu);
+
 	raw_spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
--
cgit v1.2.3
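Illustration (not part of the patch): the new idle decision is a single
wraparound-safe comparison; spelled out per case, with the middle case being the
behavioural change described above:

/*
 * base->is_idle = time_after(nextevt, basej + 1);
 *
 *   nextevt <= basej      -> event already due, base stays non-idle
 *   nextevt == basej + 1  -> the next tick handles it, base stays non-idle
 *                            (previously the base could remain marked idle here)
 *   nextevt  > basej + 1  -> more than a tick away, mark the base idle so
 *                            remote enqueues forward base->clk and may IPI
 */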
From da65f29dada7f7cbbf0d6375b88a0316f5f7d6f5 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen
Date: Fri, 1 Dec 2023 10:26:34 +0100
Subject: timers: Fix nextevt calculation when no timers are pending

When no timer is queued into an empty timer base, the next_expiry will not
be updated. It was originally calculated as

  base->clk + NEXT_TIMER_MAX_DELTA

When the timer base stays empty long enough (> NEXT_TIMER_MAX_DELTA), the
next_expiry value of the empty base suggests that there is a timer pending
soon. This is more of a theoretical problem, but the fix doesn't hurt.

Use the base->next_expiry value as nextevt only when timers are pending.
Otherwise nextevt will be jiffies + NEXT_TIMER_MAX_DELTA. As all
information is in place anyway, update the base->next_expiry value of the
empty timer base as well.

Signed-off-by: Anna-Maria Behnsen
Signed-off-by: Thomas Gleixner
Reviewed-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20231201092654.34614-13-anna-maria@linutronix.de
---
 kernel/time/timer.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel/time/timer.c')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index cf51655add64..352b161113cd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1922,8 +1922,8 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
 	u64 expires = KTIME_MAX;
-	unsigned long nextevt;
 	bool was_idle;
 
 	/*
@@ -1936,7 +1936,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	raw_spin_lock(&base->lock);
 	if (base->next_expiry_recalc)
 		next_expiry_recalc(base);
-	nextevt = base->next_expiry;
 
 	/*
 	 * We have a fresh next event. Check whether we can forward the
@@ -1945,10 +1944,20 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	__forward_timer_base(base, basej);
 
 	if (base->timers_pending) {
+		nextevt = base->next_expiry;
+
 		/* If we missed a tick already, force 0 delta */
 		if (time_before(nextevt, basej))
 			nextevt = basej;
 		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
+	} else {
+		/*
+		 * Move next_expiry for the empty base into the future to
+		 * prevent an unnecessary raise of the timer softirq when the
+		 * next_expiry value will be reached even if there is no timer
+		 * pending.
+		 */
+		base->next_expiry = nextevt;
 	}
 
 	/*
--
cgit v1.2.3
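Illustration (not part of the patch): the "unnecessary raise of the timer
softirq" mentioned in the new comment comes from the tick-side check, which
fires once jiffies reaches base->next_expiry regardless of whether a timer is
actually queued. A simplified sketch of that check (function name made up,
modelled on run_local_timers()):

/* Simplified sketch of the tick-side check that this patch keeps quiet */
static void sketch_run_local_timers(struct timer_base *base)
{
	/*
	 * Once jiffies catches up with base->next_expiry the timer softirq
	 * is raised. With a stale next_expiry on an empty base this would
	 * happen although there is nothing to expire; pushing next_expiry
	 * out to basej + NEXT_TIMER_MAX_DELTA avoids the spurious raise.
	 */
	if (!time_before(jiffies, base->next_expiry))
		raise_softirq(TIMER_SOFTIRQ);
}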