From 5bb962269c29cbb878414cddf0ebdff8c5cdef0a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Oct 2012 02:03:27 +0200 Subject: tick: Consolidate timekeeping handling code Unify the duplicated timekeeping handling code of low and high res tick sched handlers. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/time/tick-sched.c | 54 +++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..360674c485f5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -98,6 +98,28 @@ static ktime_t tick_init_jiffy_update(void) return period; } + +static void tick_sched_do_timer(ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); +} + /* * NOHZ - aka dynamic tick functionality */ @@ -648,24 +670,11 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. 
- */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); + tick_sched_do_timer(now); /* * When we are idle and the tick is stopped, we have to touch @@ -802,23 +811,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; -#endif - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); + tick_sched_do_timer(now); /* * Do not call, when we are not in irq context and have -- cgit v1.2.3 From 9e8f559b08cbc1cfcbf093840a2a760a946cb90f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Oct 2012 02:43:03 +0200 Subject: tick: Consolidate tick handling for high and low res handlers Besides unifying code, this also adds the idle check before processing idle accounting specifics on the low res handler. This way we also generalize this part of the nohz code for !CONFIG_HIGH_RES_TIMERS to prepare for the adaptive tickless features. 
Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/time/tick-sched.c | 55 +++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 33 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 360674c485f5..68a873af09a8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -120,6 +120,25 @@ static void tick_sched_do_timer(ktime_t now) tick_do_update_jiffies64(now); } +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + if (is_idle_task(current)) + ts->idle_jiffies++; + } + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} + /* * NOHZ - aka dynamic tick functionality */ @@ -675,22 +694,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; tick_sched_do_timer(now); - - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start - * of idle" jiffy stamp so the idle accounting adjustment we - * do when we go busy again does not account too much ticks. 
- */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); + tick_sched_handle(ts, regs); while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); @@ -818,23 +822,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * Do not call, when we are not in irq context and have * no valid regs pointer */ - if (regs) { - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start of - * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - if (is_idle_task(current)) - ts->idle_jiffies++; - } - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - } + if (regs) + tick_sched_handle(ts, regs); hrtimer_forward(timer, now, tick_period); -- cgit v1.2.3 From 94a571402012e0dfaa23bbbdd64d033f48477d86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Oct 2012 16:17:16 +0200 Subject: tick: Conditionally build nohz specific code in tick handler This optimizes the high res tick sched handler a bit.
Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/time/tick-sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 68a873af09a8..766d4c47a4a4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -122,6 +122,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { +#ifdef CONFIG_NO_HZ /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long @@ -135,6 +136,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) if (is_idle_task(current)) ts->idle_jiffies++; } +#endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } -- cgit v1.2.3 From 351f181f9134d71efd46ddf0c0abca31b58cd79b Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 25 Oct 2012 01:07:35 +0800 Subject: timers, sched: Correct the comments for tick_sched_timer() In the comments of function tick_sched_timer(), the sentence "timer->base->cpu_base->lock held" is not right. In function __run_hrtimer(), before call timer->function(), the cpu_base->lock has been unlocked. Signed-off-by: liu chuansheng Cc: fei.li@intel.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1351098455.15558.1421.camel@cliu38-desktop-build Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..2bc73d3bf7fa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -794,7 +794,7 @@ void tick_check_idle(int cpu) #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled. 
*/ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { -- cgit v1.2.3 From b8f61116c1ce342804a0897b0a80eb4df5f19453 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 25 Oct 2012 01:07:35 +0800 Subject: tick: Correct the comments for tick_sched_timer() In the comments of function tick_sched_timer(), the sentence "timer->base->cpu_base->lock held" is not right. In function __run_hrtimer(), before call timer->function(), the cpu_base->lock has been unlocked. Signed-off-by: liu chuansheng Cc: fei.li@intel.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1351098455.15558.1421.camel@cliu38-desktop-build Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 766d4c47a4a4..77729cc3750b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -809,7 +809,7 @@ void tick_check_idle(int cpu) #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { -- cgit v1.2.3 From 65f8f9a1c1db831e5159e3e3e50912d1f214cd0c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 31 Oct 2012 06:27:25 +0000 Subject: time: remove the timecompare code. This patch removes the timecompare code from the kernel. The top five reasons to do this are: 1. There are no more users of this code. 2. The original idea was a bit weak. 3. The original author has disappeared. 4. The code was not general purpose but tuned to a particular hardware, 5. There are better ways to accomplish clock synchronization. Signed-off-by: Richard Cochran Acked-by: John Stultz Tested-by: Bob Liu Signed-off-by: David S. 
Miller --- include/linux/timecompare.h | 125 ---------------------------- kernel/time/Makefile | 2 +- kernel/time/timecompare.c | 193 -------------------------------------------- 3 files changed, 1 insertion(+), 319 deletions(-) delete mode 100644 include/linux/timecompare.h delete mode 100644 kernel/time/timecompare.c (limited to 'kernel/time') diff --git a/include/linux/timecompare.h b/include/linux/timecompare.h deleted file mode 100644 index 546e2234e4b3..000000000000 --- a/include/linux/timecompare.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Utility code which helps transforming between two different time - * bases, called "source" and "target" time in this code. - * - * Source time has to be provided via the timecounter API while target - * time is accessed via a function callback whose prototype - * intentionally matches ktime_get() and ktime_get_real(). These - * interfaces where chosen like this so that the code serves its - * initial purpose without additional glue code. - * - * This purpose is synchronizing a hardware clock in a NIC with system - * time, in order to implement the Precision Time Protocol (PTP, - * IEEE1588) with more accurate hardware assisted time stamping. In - * that context only synchronization against system time (= - * ktime_get_real()) is currently needed. But this utility code might - * become useful in other situations, which is why it was written as - * general purpose utility code. - * - * The source timecounter is assumed to return monotonically - * increasing time (but this code does its best to compensate if that - * is not the case) whereas target time may jump. - * - * The target time corresponding to a source time is determined by - * reading target time, reading source time, reading target time - * again, then assuming that average target time corresponds to source - * time. 
In other words, the assumption is that reading the source - * time is slow and involves equal time for sending the request and - * receiving the reply, whereas reading target time is assumed to be - * fast. - * - * Copyright (C) 2009 Intel Corporation. - * Author: Patrick Ohly - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ -#ifndef _LINUX_TIMECOMPARE_H -#define _LINUX_TIMECOMPARE_H - -#include -#include - -/** - * struct timecompare - stores state and configuration for the two clocks - * - * Initialize to zero, then set source/target/num_samples. 
- * - * Transformation between source time and target time is done with: - * target_time = source_time + offset + - * (source_time - last_update) * skew / - * TIMECOMPARE_SKEW_RESOLUTION - * - * @source: used to get source time stamps via timecounter_read() - * @target: function returning target time (for example, ktime_get - * for monotonic time, or ktime_get_real for wall clock) - * @num_samples: number of times that source time and target time are to - * be compared when determining their offset - * @offset: (target time - source time) at the time of the last update - * @skew: average (target time - source time) / delta source time * - * TIMECOMPARE_SKEW_RESOLUTION - * @last_update: last source time stamp when time offset was measured - */ -struct timecompare { - struct timecounter *source; - ktime_t (*target)(void); - int num_samples; - - s64 offset; - s64 skew; - u64 last_update; -}; - -/** - * timecompare_transform - transform source time stamp into target time base - * @sync: context for time sync - * @source_tstamp: the result of timecounter_read() or - * timecounter_cyc2time() - */ -extern ktime_t timecompare_transform(struct timecompare *sync, - u64 source_tstamp); - -/** - * timecompare_offset - measure current (target time - source time) offset - * @sync: context for time sync - * @offset: average offset during sample period returned here - * @source_tstamp: average source time during sample period returned here - * - * Returns number of samples used. Might be zero (= no result) in the - * unlikely case that target time was monotonically decreasing for all - * samples (= broken). 
- */ -extern int timecompare_offset(struct timecompare *sync, - s64 *offset, - u64 *source_tstamp); - -extern void __timecompare_update(struct timecompare *sync, - u64 source_tstamp); - -/** - * timecompare_update - update offset and skew by measuring current offset - * @sync: context for time sync - * @source_tstamp: the result of timecounter_read() or - * timecounter_cyc2time(), pass zero to force update - * - * Updates are only done at most once per second. - */ -static inline void timecompare_update(struct timecompare *sync, - u64 source_tstamp) -{ - if (!source_tstamp || - (s64)(source_tstamp - sync->last_update) >= NSEC_PER_SEC) - __timecompare_update(sync, source_tstamp); -} - -#endif /* _LINUX_TIMECOMPARE_H */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c2..ff7d9d2ab504 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index a9ae369925ce..000000000000 --- a/kernel/time/timecompare.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 2009 Intel Corporation. - * Author: Patrick Ohly - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -/* - * fixed point arithmetic scale factor for skew - * - * Usually one would measure skew in ppb (parts per billion, 1e9), but - * using a factor of 2 simplifies the math. - */ -#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) - -ktime_t timecompare_transform(struct timecompare *sync, - u64 source_tstamp) -{ - u64 nsec; - - nsec = source_tstamp + sync->offset; - nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / - TIMECOMPARE_SKEW_RESOLUTION; - - return ns_to_ktime(nsec); -} -EXPORT_SYMBOL_GPL(timecompare_transform); - -int timecompare_offset(struct timecompare *sync, - s64 *offset, - u64 *source_tstamp) -{ - u64 start_source = 0, end_source = 0; - struct { - s64 offset; - s64 duration_target; - } buffer[10], sample, *samples; - int counter = 0, i; - int used; - int index; - int num_samples = sync->num_samples; - - if (num_samples > ARRAY_SIZE(buffer)) { - samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); - if (!samples) { - samples = buffer; - num_samples = ARRAY_SIZE(buffer); - } - } else { - samples = buffer; - } - - /* run until we have enough valid samples, but do not try forever */ - i = 0; - counter = 0; - while (1) { - u64 ts; - ktime_t start, end; - - start = sync->target(); - ts = timecounter_read(sync->source); - end = sync->target(); - - if (!i) - start_source = ts; - - /* ignore negative durations */ - sample.duration_target = ktime_to_ns(ktime_sub(end, start)); - if (sample.duration_target >= 0) { - /* - * assume symetric delay to and from source: - * average target time corresponds to measured - * source time - */ - sample.offset = - (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - - ts; - - /* simple insertion sort based on duration */ - index = counter - 1; - while 
(index >= 0) { - if (samples[index].duration_target < - sample.duration_target) - break; - samples[index + 1] = samples[index]; - index--; - } - samples[index + 1] = sample; - counter++; - } - - i++; - if (counter >= num_samples || i >= 100000) { - end_source = ts; - break; - } - } - - *source_tstamp = (end_source + start_source) / 2; - - /* remove outliers by only using 75% of the samples */ - used = counter * 3 / 4; - if (!used) - used = counter; - if (used) { - /* calculate average */ - s64 off = 0; - for (index = 0; index < used; index++) - off += samples[index].offset; - *offset = div_s64(off, used); - } - - if (samples && samples != buffer) - kfree(samples); - - return used; -} -EXPORT_SYMBOL_GPL(timecompare_offset); - -void __timecompare_update(struct timecompare *sync, - u64 source_tstamp) -{ - s64 offset; - u64 average_time; - - if (!timecompare_offset(sync, &offset, &average_time)) - return; - - if (!sync->last_update) { - sync->last_update = average_time; - sync->offset = offset; - sync->skew = 0; - } else { - s64 delta_nsec = average_time - sync->last_update; - - /* avoid division by negative or small deltas */ - if (delta_nsec >= 10000) { - s64 delta_offset_nsec = offset - sync->offset; - s64 skew; /* delta_offset_nsec * - TIMECOMPARE_SKEW_RESOLUTION / - delta_nsec */ - u64 divisor; - - /* div_s64() is limited to 32 bit divisor */ - skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; - divisor = delta_nsec; - while (unlikely(divisor >= ((s64)1) << 32)) { - /* divide both by 2; beware, right shift - of negative value has undefined - behavior and can only be used for - the positive divisor */ - skew = div_s64(skew, 2); - divisor >>= 1; - } - skew = div_s64(skew, divisor); - - /* - * Calculate new overall skew as 4/16 the - * old value and 12/16 the new one. This is - * a rather arbitrary tradeoff between - * only using the latest measurement (0/16 and - * 16/16) and even more weight on past measurements. 
- */ -#define TIMECOMPARE_NEW_SKEW_PER_16 12 - sync->skew = - div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * - sync->skew + - TIMECOMPARE_NEW_SKEW_PER_16 * skew, - 16); - sync->last_update = average_time; - sync->offset = offset; - } - } -} -EXPORT_SYMBOL_GPL(__timecompare_update); -- cgit v1.2.3 From f95a985781e9e986992351c971af7f7e46e06ed5 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 18 Oct 2012 11:34:41 +0200 Subject: time/jiffies: Make clocksource_jiffies static Commit f1b8274 ("clocksource: Cleanup clocksource selection") removed all external references to clocksource_jiffies so there is no need to have the symbol globally visible. Fixes the following sparse warning: CHECK kernel/time/jiffies.c kernel/time/jiffies.c:61:20: warning: symbol 'clocksource_jiffies' was not declared. Should it be static? Signed-off-by: Lars-Peter Clausen Signed-off-by: John Stultz --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 6629bf7b5285..25f5b2699d37 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs) return (cycle_t) jiffies; } -struct clocksource clocksource_jiffies = { +static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, -- cgit v1.2.3 From d6ad418763888f617ac5b4849823e4cd670df1dd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 28 Feb 2012 16:50:11 -0800 Subject: time: Kill xtime_lock, replacing it with jiffies_lock Now that timekeeping is protected by its own locks, rename the xtime_lock to jiffies_lock to better describe what it protects.
CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- drivers/clocksource/i8253.c | 2 +- include/linux/jiffies.h | 3 ++- kernel/time/jiffies.c | 6 ++++-- kernel/time/tick-common.c | 8 ++++---- kernel/time/tick-internal.h | 1 - kernel/time/tick-sched.c | 22 +++++++++++----------- kernel/time/timekeeping.c | 14 +++----------- 7 files changed, 25 insertions(+), 31 deletions(-) (limited to 'kernel/time') diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index e7cab2da910f..14ee3efcc404 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -35,7 +35,7 @@ static cycle_t i8253_read(struct clocksource *cs) raw_spin_lock_irqsave(&i8253_lock, flags); /* - * Although our caller may have the read side of xtime_lock, + * Although our caller may have the read side of jiffies_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if * there is a collision on the seqlock and our caller has to diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 6b87413da9d6..82ed068b1ebe 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -70,11 +70,12 @@ extern int register_refined_jiffies(long clock_tick_rate); /* * The 64-bit value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. + * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. 
*/ extern u64 __jiffy_data jiffies_64; extern unsigned long volatile __jiffy_data jiffies; +extern seqlock_t jiffies_lock; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 25f5b2699d37..7a925ba456fb 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -67,6 +67,8 @@ static struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { @@ -74,9 +76,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ecad4e4..b1600a6973f4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -63,13 +63,13 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } update_process_times(user_mode(get_irq_regs())); @@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e265b901fed..cf3e59ed6dc0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,4 +141,3 @@ static inline int 
tick_device_is_functional(struct clock_event_device *dev) #endif extern void do_timer(unsigned long ticks); -extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..a678046c3e5e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* - * The time, when the last jiffy update happened. Protected by xtime_lock. + * The time, when the last jiffy update happened. Protected by jiffies_lock. */ static ktime_t last_jiffies_update; @@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta; /* - * Do a quick check without holding xtime_lock: + * Do a quick check without holding jiffies_lock: */ delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); + /* Reevalute with jiffies_lock held */ + write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } /* @@ -89,12 +89,12 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? 
*/ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); return period; } @@ -282,11 +282,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { @@ -658,7 +658,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * concurrency: This happens only when the cpu in charge went * into a long sleep. If two cpus happen to assign themself to * this duty, then the jiffies update is still serialized by - * xtime_lock. + * jiffies_lock. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; @@ -810,7 +810,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * concurrency: This happens only when the cpu in charge went * into a long sleep. If two cpus happen to assign themself to * this duty, then the jiffies update is still serialized by - * xtime_lock. + * jiffies_lock. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970bb562..4c7de02eacdc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,12 +25,6 @@ static struct timekeeper timekeeper; -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. 
- */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -1299,9 +1293,7 @@ struct timespec get_monotonic_coarse(void) } /* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... + * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { @@ -1389,7 +1381,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ void xtime_update(unsigned long ticks) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); do_timer(ticks); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } -- cgit v1.2.3 From 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Fri, 26 Oct 2012 12:26:41 +0200 Subject: cpuidle: Quickly notice prediction failure for repeat mode Predicting the future is difficult, and when the cpuidle governor's prediction fails the governor may choose a shallower C-state than it should. Quickly noticing such a failure is therefore important for power saving. The cpuidle menu governor has a method to predict a repeat pattern: if 8 consecutive C-state residencies are the same or very close, it predicts that the next C-state residency will have the same residency time. A real case is the turbostat utility (tools/power/x86/turbostat) in kernel 3.3 or earlier. On Sandybridge, turbostat reads 10 registers one by one, so it generates 10 IPIs to wake up idle CPUs. The cpuidle menu governor then predicts repeat mode and expects another IPI to wake up the idle CPU soon, so it keeps the idle CPU in the C1 state even though the CPU is totally idle. However, after reading the 10 registers turbostat sleeps for 5 seconds by default, so the idle CPU stays in C1 for a long time, even though it is idle, until a break event occurs.
On an idle Sandybridge system, run "./turbostat -v" and the deep C-state residency dangles between "70% ~ 99%". After patching the kernel, the deep C-state residency stays at >99.98%. In the patch, a timer is added when the menu governor detects repeat mode and chooses a shallow C-state. The timer is set to a timeout value that is greater than the predicted time, and we conclude that the repeat mode prediction failed if the timer is triggered. When repeat mode happens as expected, the timer is not triggered; the CPU wakes up from the C-state and cancels the timer itself. When repeat mode does not happen, the timer times out and the menu governor quickly notices that the repeat mode prediction failed and then re-evaluates the possibility of deeper C-states. Below is another case which clearly shows the benefit of the patch: #include #include #include #include #include #include #include volatile int * shutdown; volatile long * count; int delay = 20; int loop = 8; void usage(void) { fprintf(stderr, "Usage: idle_predict [options]\n" " --help -h Print this help\n" " --thread -n Thread number\n" " --loop -l Loop times in shallow Cstate\n" " --delay -t Sleep time (uS)in shallow Cstate\n"); } void *simple_loop() { int idle_num = 1; while (!(*shutdown)) { *count = *count + 1; if (idle_num % loop) usleep(delay); else { /* sleep 1 second */ usleep(1000000); idle_num = 0; } idle_num++; } } static void sighand(int sig) { *shutdown = 1; } int main(int argc, char *argv[]) { sigset_t sigset; int signum = SIGALRM; int i, c, er = 0, thread_num = 8; pthread_t pt[1024]; static char optstr[] = "n:l:t:h:"; while ((c = getopt(argc, argv, optstr)) != EOF) switch (c) { case 'n': thread_num = atoi(optarg); break; case 'l': loop = atoi(optarg); break; case 't': delay = atoi(optarg); break; case 'h': default: usage(); exit(1); } printf("thread=%d,loop=%d,delay=%d\n",thread_num,loop,delay); count = malloc(sizeof(long)); shutdown = malloc(sizeof(int)); *count = 0; *shutdown = 0; sigemptyset(&sigset);
sigaddset(&sigset, signum); sigprocmask (SIG_BLOCK, &sigset, NULL); signal(SIGINT, sighand); signal(SIGTERM, sighand); for(i = 0; i < thread_num ; i++) pthread_create(&pt[i], NULL, simple_loop, NULL); for (i = 0; i < thread_num; i++) pthread_join(pt[i], NULL); exit(0); } Get powertop V2 from git://github.com/fenrus75/powertop, build powertop. After building the above test application, run it. The test platform can be Intel Sandybridge or other recent platforms. #./idle_predict -l 10 & #./powertop We will find that deep C-state will dangle between 40%~100% and much time is spent in the C1 state. This is because the menu governor wrongly predicts that repeat mode is kept, so it will choose the shallow C1 C-state even though it has the chance to sleep 1 second in a deep C-state. After patching the kernel, we find that deep C-state will keep >99.6%. Signed-off-by: Rik van Riel Signed-off-by: Youquan Song Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 75 +++++++++++++++++++++++++++++++++++++--- include/linux/tick.h | 6 ++++ kernel/time/tick-sched.c | 4 +++ 3 files changed, 80 insertions(+), 5 deletions(-) (limited to 'kernel/time') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 5b1f2c372c1f..37c0ff6c805c 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -28,6 +28,13 @@ #define MAX_INTERESTING 50000 #define STDDEV_THRESH 400 +/* 60 * 60 > STDDEV_THRESH * INTERVALS = 400 * 8 */ +#define MAX_DEVIATION 60 + +static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer); +static DEFINE_PER_CPU(int, hrtimer_status); +/* menu hrtimer mode */ +enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT}; /* * Concepts and ideas behind the menu governor @@ -191,17 +198,42 @@ static u64 div_round64(u64 dividend, u32 divisor) return div_u64(dividend + (divisor / 2), divisor); } +/* Cancel the hrtimer if it is not triggered yet */ +void menu_hrtimer_cancel(void) +{ + int cpu = smp_processor_id(); + struct hrtimer *hrtmr =
&per_cpu(menu_hrtimer, cpu); + + /* The timer is still not time out*/ + if (per_cpu(hrtimer_status, cpu)) { + hrtimer_cancel(hrtmr); + per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP; + } +} +EXPORT_SYMBOL_GPL(menu_hrtimer_cancel); + +/* Call back for hrtimer is triggered */ +static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer) +{ + int cpu = smp_processor_id(); + + per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP; + + return HRTIMER_NORESTART; +} + /* * Try detecting repeating patterns by keeping track of the last 8 * intervals, and checking if the standard deviation of that set * of points is below a threshold. If it is... then use the * average of these 8 points as the estimated value. */ -static void detect_repeating_patterns(struct menu_device *data) +static int detect_repeating_patterns(struct menu_device *data) { int i; uint64_t avg = 0; uint64_t stddev = 0; /* contains the square of the std deviation */ + int ret = 0; /* first calculate average and standard deviation of the past */ for (i = 0; i < INTERVALS; i++) @@ -210,7 +242,7 @@ static void detect_repeating_patterns(struct menu_device *data) /* if the avg is beyond the known next tick, it's worthless */ if (avg > data->expected_us) - return; + return 0; for (i = 0; i < INTERVALS; i++) stddev += (data->intervals[i] - avg) * @@ -223,8 +255,12 @@ static void detect_repeating_patterns(struct menu_device *data) * repeating pattern and predict we keep doing this. 
*/ - if (avg && stddev < STDDEV_THRESH) + if (avg && stddev < STDDEV_THRESH) { data->predicted_us = avg; + ret = 1; + } + + return ret; } /** @@ -240,6 +276,9 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) int i; int multiplier; struct timespec t; + int repeat = 0, low_predicted = 0; + int cpu = smp_processor_id(); + struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu); if (data->needs_update) { menu_update(drv, dev); @@ -274,7 +313,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket], RESOLUTION * DECAY); - detect_repeating_patterns(data); + repeat = detect_repeating_patterns(data); /* * We want to default to C1 (hlt), not to busy polling @@ -295,8 +334,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (s->disabled || su->disable) continue; - if (s->target_residency > data->predicted_us) + if (s->target_residency > data->predicted_us) { + low_predicted = 1; continue; + } if (s->exit_latency > latency_req) continue; if (s->exit_latency * multiplier > data->predicted_us) @@ -309,6 +350,27 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } + /* not deepest C-state chosen for low predicted residency */ + if (low_predicted) { + unsigned int timer_us = 0; + + /* + * Set a timer to detect whether this sleep is much + * longer than repeat mode predicted. If the timer + * triggers, the code will evaluate whether to put + * the CPU into a deeper C-state. + * The timer is cancelled on CPU wakeup. 
+ */ + timer_us = 2 * (data->predicted_us + MAX_DEVIATION); + + if (repeat && (4 * timer_us < data->expected_us)) { + hrtimer_start(hrtmr, ns_to_ktime(1000 * timer_us), + HRTIMER_MODE_REL_PINNED); + /* In repeat case, menu hrtimer is started */ + per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT; + } + } + return data->last_state_idx; } @@ -399,6 +461,9 @@ static int menu_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + struct hrtimer *t = &per_cpu(menu_hrtimer, dev->cpu); + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->function = menu_hrtimer_notify; memset(data, 0, sizeof(struct menu_device)); diff --git a/include/linux/tick.h b/include/linux/tick.h index f37fceb69b73..1a6567b48492 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -142,4 +142,10 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ +# ifdef CONFIG_CPU_IDLE_GOV_MENU +extern void menu_hrtimer_cancel(void); +# else +static inline void menu_hrtimer_cancel(void) {} +# endif /* CONFIG_CPU_IDLE_GOV_MENU */ + #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..6f337068dc4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -526,6 +526,8 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); __tick_nohz_idle_enter(ts); } @@ -621,6 +623,8 @@ void tick_nohz_idle_exit(void) ts->inidle = 0; + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); if (ts->idle_active || ts->tick_stopped) now = ktime_get(); -- cgit v1.2.3 From e0b306fef90556233797d2e1747bd6a3ae35ea93 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:59 -0200 Subject: time: export time 
information for KVM pvclock As suggested by John, export time data similarly to how it's done by vsyscall support. This allows KVM to retrieve necessary information to implement vsyscall support in KVM guests. Acked-by: John Stultz Signed-off-by: Marcelo Tosatti --- include/linux/pvclock_gtod.h | 9 ++++++++ kernel/time/timekeeping.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 include/linux/pvclock_gtod.h (limited to 'kernel/time') diff --git a/include/linux/pvclock_gtod.h b/include/linux/pvclock_gtod.h new file mode 100644 index 000000000000..0ca75825b60d --- /dev/null +++ b/include/linux/pvclock_gtod.h @@ -0,0 +1,9 @@ +#ifndef _PVCLOCK_GTOD_H +#define _PVCLOCK_GTOD_H + +#include <linux/notifier.h> + +extern int pvclock_gtod_register_notifier(struct notifier_block *nb); +extern int pvclock_gtod_unregister_notifier(struct notifier_block *nb); + +#endif /* _PVCLOCK_GTOD_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970bb562..69f5342e8d1c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include <linux/time.h> #include <linux/tick.h> #include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> static struct timekeeper timekeeper; @@ -180,6 +181,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + /* update timekeeping data */ + update_pvclock_gtod(tk); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +}
+EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { @@ -188,6 +237,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) ntp_clear(); } update_vsyscall(tk); + update_pvclock_gtod(tk); } /** -- cgit v1.2.3