From 102f085d84607462234ac60f6027973b45a9bde2 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:21 +0200 Subject: timers: Rename usleep_idle_range() to usleep_range_idle() usleep_idle_range() is a variant of usleep_range(). Both are using usleep_range_state() as a base. To be able to find all the related functions in one go, rename it usleep_idle_range() to usleep_range_idle(). No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: SeongJae Park Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-4-dc8b907cb62f@linutronix.de --- include/linux/delay.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/delay.h') diff --git a/include/linux/delay.h b/include/linux/delay.h index ff9cda975e30..2bc586aa2068 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ static inline void usleep_range(unsigned long min, unsigned long max) usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } -static inline void usleep_idle_range(unsigned long min, unsigned long max) +static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } -- cgit v1.2.3 From f36eb171410839325fff9cd9b7b7400f7e606962 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:22 +0200 Subject: timers: Update function descriptions of sleep/delay related functions A lot of commonly used functions for inserting a sleep or delay lack a proper function description. Add function descriptions to all of them to have important information in a central place close to the code. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-5-dc8b907cb62f@linutronix.de --- include/asm-generic/delay.h | 41 +++++++++++++++++++++++++++++++---- include/linux/delay.h | 48 ++++++++++++++++++++++++++++++---------- kernel/time/sleep_timeout.c | 53 ++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 120 insertions(+), 22 deletions(-) (limited to 'include/linux/delay.h') diff --git a/include/asm-generic/delay.h b/include/asm-generic/delay.h index e448ac61430c..a8cee41cc51b 100644 --- a/include/asm-generic/delay.h +++ b/include/asm-generic/delay.h @@ -12,11 +12,39 @@ extern void __const_udelay(unsigned long xloops); extern void __delay(unsigned long loops); /* - * The weird n/20000 thing suppresses a "comparison is always false due to - * limited range of data type" warning with non-const 8-bit arguments. + * Implementation details: + * + * * The weird n/20000 thing suppresses a "comparison is always false due to + * limited range of data type" warning with non-const 8-bit arguments. + * * 0x10c7 is 2**32 / 1000000 (rounded up) -> udelay + * * 0x5 is 2**32 / 1000000000 (rounded up) -> ndelay */ -/* 0x10c7 is 2**32 / 1000000 (rounded up) */ +/** + * udelay - Inserting a delay based on microseconds with busy waiting + * @usec: requested delay in microseconds + * + * When delaying in an atomic context ndelay(), udelay() and mdelay() are the + * only valid variants of delaying/sleeping to go with. + * + * When inserting delays in non atomic context which are shorter than the time + * which is required to queue e.g. an hrtimer and to enter then the scheduler, + * it is also valuable to use udelay(). But it is not simple to specify a + * generic threshold for this which will fit for all systems. An approximation + * is a threshold for all delays up to 10 microseconds. + * + * When having a delay which is larger than the architecture specific + * %MAX_UDELAY_MS value, please make sure mdelay() is used. Otherwise a overflow + * risk is given. + * + * Please note that ndelay(), udelay() and mdelay() may return early for several + * reasons (https://lists.openwall.net/linux-kernel/2011/01/09/56): + * + * #. computed loops_per_jiffy too low (due to the time taken to execute the + * timer interrupt.) + * #. cache behaviour affecting the time it takes to execute the loop function. + * #. CPU clock rate changes. + */ #define udelay(n) \ ({ \ if (__builtin_constant_p(n)) { \ @@ -29,7 +57,12 @@ extern void __delay(unsigned long loops); } \ }) -/* 0x5 is 2**32 / 1000000000 (rounded up) */ +/** + * ndelay - Inserting a delay based on nanoseconds with busy waiting + * @nsec: requested delay in nanoseconds + * + * See udelay() for basic information about ndelay() and it's variants. + */ #define ndelay(n) \ ({ \ if (__builtin_constant_p(n)) { \ diff --git a/include/linux/delay.h b/include/linux/delay.h index 2bc586aa2068..2de509e4adce 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -6,17 +6,7 @@ * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. - * - * Please note that ndelay(), udelay() and mdelay() may return early for - * several reasons: - * 1. computed loops_per_jiffy too low (due to the time taken to - * execute the timer interrupt.) - * 2. cache behaviour affecting the time it takes to execute the - * loop function. - * 3. CPU clock rate changes. - * - * Please see this thread: - * https://lists.openwall.net/linux-kernel/2011/01/09/56 + * Sleep routines using timer list timers or hrtimers. */ #include @@ -35,12 +25,21 @@ extern unsigned long loops_per_jiffy; * The 2nd mdelay() definition ensures GCC will optimize away the * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G. */ - #ifndef MAX_UDELAY_MS #define MAX_UDELAY_MS 5 #endif #ifndef mdelay +/** + * mdelay - Inserting a delay based on milliseconds with busy waiting + * @n: requested delay in milliseconds + * + * See udelay() for basic information about mdelay() and it's variants. + * + * Please double check, whether mdelay() is the right way to go or whether a + * refactoring of the code is the better variant to be able to use msleep() + * instead. + */ #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) @@ -63,16 +62,41 @@ unsigned long msleep_interruptible(unsigned int msecs); void usleep_range_state(unsigned long min, unsigned long max, unsigned int state); +/** + * usleep_range - Sleep for an approximate time + * @min: Minimum time in microseconds to sleep + * @max: Maximum time in microseconds to sleep + * + * For basic information please refere to usleep_range_state(). + * + * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. + */ static inline void usleep_range(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } +/** + * usleep_range_idle - Sleep for an approximate time with idle time accounting + * @min: Minimum time in microseconds to sleep + * @max: Maximum time in microseconds to sleep + * + * For basic information please refere to usleep_range_state(). + * + * The sleeping task has the state TASK_IDLE during the sleep to prevent + * contribution to the load avarage. + */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } +/** + * ssleep - wrapper for seconds around msleep + * @seconds: Requested sleep duration in seconds + * + * Please refere to msleep() for detailed information. + */ static inline void ssleep(unsigned int seconds) { msleep(seconds * 1000); diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index 560d17c30aa5..f3f246e4c8d1 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -281,7 +281,34 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for + * @msecs: Requested sleep duration in milliseconds + * + * msleep() uses jiffy based timeouts for the sleep duration. Because of the + * design of the timer wheel, the maximum additional percentage delay (slack) is + * 12.5%. This is only valid for timers which will end up in level 1 or a higher + * level of the timer wheel. For explanation of those 12.5% please check the + * detailed description about the basics of the timer wheel. + * + * The slack of timers which will end up in level 0 depends on sleep duration + * (msecs) and HZ configuration and can be calculated in the following way (with + * the timer wheel design restriction that the slack is not less than 12.5%): + * + * ``slack = MSECS_PER_TICK / msecs`` + * + * When the allowed slack of the callsite is known, the calculation could be + * turned around to find the minimal allowed sleep duration to meet the + * constraints. For example: + * + * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: + * all sleep durations greater or equal 4ms will meet the constraints. + * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: + * all sleep durations greater or equal 8ms will meet the constraints. + * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: + * all sleep durations greater or equal 16ms will meet the constraints. + * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: + * all sleep durations greater or equal 32ms will meet the constraints. + * + * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { @@ -294,7 +321,15 @@ EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for + * @msecs: Requested sleep duration in milliseconds + * + * See msleep() for some basic information. + * + * The difference between msleep() and msleep_interruptible() is that the sleep + * could be interrupted by a signal delivery and then returns early. + * + * Returns: The remaining time of the sleep duration transformed to msecs (see + * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { @@ -312,11 +347,17 @@ EXPORT_SYMBOL(msleep_interruptible); * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * + * usleep_range_state() sleeps at least for the minimum specified time but not + * longer than the maximum specified amount of time. The range might reduce + * power usage by allowing hrtimers to coalesce an already scheduled interrupt + * with this hrtimer. In the worst case, an interrupt is scheduled for the upper + * bound. + * + * The sleeping task is set to the specified state before starting the sleep. + * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range_state() instead of udelay(). The sleep improves responsiveness - * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces - * power usage by allowing hrtimers to take advantage of an already- - * scheduled interrupt instead of scheduling a new one just for this sleep. + * usleep_range() or its variants instead of udelay(). The sleep improves + * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { -- cgit v1.2.3 From 82e11e47c1880362e05c065bef7dbe28a749555c Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:24 +0200 Subject: timers: Adjust flseep() to reflect reality fsleep() simply implements the recommendations of the outdated documentation in "Documentation/timers/timers-howto.rst". This should be a user friendly interface to choose always the best timeout function approach: - udelay() for very short sleep durations shorter than 10 microseconds - usleep_range() for sleep durations until 20 milliseconds - msleep() for the others The actual implementation has several problems: - It does not take into account that HZ resolution also has an impact on granularity of jiffies and has also an impact on the granularity of the buckets of timer wheel levels. This means that accuracy for the timeout does not have an upper limit. When executing fsleep(20000) on a HZ=100 system, the possible additional slack will be 50% as the granularity of the buckets in the lowest level is 10 milliseconds. - The upper limit of usleep_range() is twice the requested timeout. When no other interrupts occur in this range, the maximum value is used. This means that the requested sleep length has then an additional delay of 100%. Change the thresholds for the decisions in fsleep() to make sure the maximum slack which is added to the sleep duration is 25%. Note: Outdated documentation will be updated in a followup patch. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-7-dc8b907cb62f@linutronix.de --- include/linux/delay.h | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux/delay.h') diff --git a/include/linux/delay.h b/include/linux/delay.h index 2de509e4adce..89866bab100d 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -11,6 +11,7 @@ #include #include +#include extern unsigned long loops_per_jiffy; @@ -102,15 +103,35 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } -/* see Documentation/timers/timers-howto.rst for the thresholds */ +static const unsigned int max_slack_shift = 2; +#define USLEEP_RANGE_UPPER_BOUND ((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC) + +/** + * fsleep - flexible sleep which autoselects the best mechanism + * @usecs: requested sleep duration in microseconds + * + * flseep() selects the best mechanism that will provide maximum 25% slack + * to the requested sleep duration. Therefore it uses: + * + * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer + * overhead for really short sleep durations. + * * usleep_range() for sleep durations which would lead with the usage of + * msleep() to a slack larger than 25%. This depends on the granularity of + * jiffies. + * * msleep() for all other sleep durations. + * + * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with + * the granularity of jiffies and the slack might exceed 25% especially for + * short sleep durations. + */ static inline void fsleep(unsigned long usecs) { if (usecs <= 10) udelay(usecs); - else if (usecs <= 20000) - usleep_range(usecs, 2 * usecs); + else if (usecs < USLEEP_RANGE_UPPER_BOUND) + usleep_range(usecs, usecs + (usecs >> max_slack_shift)); else - msleep(DIV_ROUND_UP(usecs, 1000)); + msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC)); } #endif /* defined(_LINUX_DELAY_H) */ -- cgit v1.2.3