diff options
| author | Namhyung Kim <namhyung@kernel.org> | 2025-02-05 14:57:18 -0800 |
|---|---|---|
| committer | Namhyung Kim <namhyung@kernel.org> | 2025-02-05 14:57:18 -0800 |
| commit | 9e676a024fa1fa2bd8150c2d2ba85478280353bc (patch) | |
| tree | 5cf0e1d4ab27002fcafdc7dc5bdfdd9ff3f3c9f1 /kernel/time | |
| parent | 357b965deba9fb71467413e473764ec4e1694d8d (diff) | |
| parent | 2014c95afecee3e76ca4a56956a936e23283f05b (diff) | |
Merge tag 'v6.14-rc1' into perf-tools-next
To get the various fixes in the current master.
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Diffstat (limited to 'kernel/time')
| -rw-r--r-- | kernel/time/clocksource-wdtest.c | 3 | ||||
| -rw-r--r-- | kernel/time/hrtimer.c | 18 | ||||
| -rw-r--r-- | kernel/time/posix-timers.c | 2 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast.c | 2 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 77 | ||||
| -rw-r--r-- | kernel/time/timer.c | 18 | ||||
| -rw-r--r-- | kernel/time/timer_migration.c | 68 | ||||
| -rw-r--r-- | kernel/time/timer_migration.h | 21 |
8 files changed, 98 insertions, 111 deletions
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 62e73444ffe4..38dae590b29f 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -137,7 +137,8 @@ static int wdtest_func(void *arg) udelay(1); j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), + "Expected at least 1000ns, got %lu.\n", j2 - j1); /* Verify tsc-like stability with various numbers of errors injected. */ max_retries = clocksource_get_max_watchdog_retry(); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 80fe3749d2db..f6d8df94045c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1067,11 +1067,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree. */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - enum hrtimer_mode mode) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); @@ -2202,6 +2201,15 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +int hrtimers_cpu_starting(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + + /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; @@ -2210,7 +2218,6 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; - hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2286,5 +2293,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 881a9ce96af7..1b675aee99a9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -538,7 +538,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * When the reference count reaches zero, the timer is scheduled * for RCU removal after the grace period. * - * Holding rcu_read_lock() accross the lookup ensures that + * Holding rcu_read_lock() across the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ed58eebb4e8f..0207868c8b4d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1020,6 +1020,8 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device + * @bc: the broadcast device + * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d128825d343..1e67d076f195 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -485,91 +485,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. - * - * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. - */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. - * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5860bf6d16f..c8f776dc6ee0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, @@ -956,33 +956,29 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); - return base; + index = BASE_DEF; + + return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = this_cpu_ptr(&timer_bases[BASE_DEF]); - return base; + index = BASE_DEF; + + return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 8d57f7686bb0..9cb9b6584ea1 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, break; child = group; - group = group->parent; + /* + * Pairs with the store release on group connection + * to make sure group initialization is visible. + */ + group = READ_ONCE(group->parent); data->childmask = child->groupmask; + WARN_ON_ONCE(!data->childmask); } while (group); } @@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) while ((node = timerqueue_getnext(&group->events))) { evt = container_of(node, struct tmigr_event, nextevt); - if (!evt->ignore) { + if (!READ_ONCE(evt->ignore)) { WRITE_ONCE(group->next_expiry, evt->nextevt.expires); return evt; } @@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group, * lock is held while updating the ignore flag in idle path. So this * state change will not be lost. */ - group->groupevt.ignore = true; + WRITE_ONCE(group->groupevt.ignore, true); return walk_done; } @@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, union tmigr_state childstate, groupstate; bool remote = data->remote; bool walk_done = false; + bool ignore; u64 nextexp; if (child) { @@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, nextexp = child->next_expiry; evt = &child->groupevt; - evt->ignore = (nextexp == KTIME_MAX) ? true : false; + /* + * This can race with concurrent idle exit (activate). + * If the current writer wins, a useless remote expiration may + * be scheduled. If the activate wins, the event is properly + * ignored. + */ + ignore = (nextexp == KTIME_MAX) ? true : false; + WRITE_ONCE(evt->ignore, ignore); } else { nextexp = data->nextexp; first_childevt = evt = data->evt; + ignore = evt->ignore; /* * Walking the hierarchy is required in any case when a @@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * first event information of the group is updated properly and * also handled properly, so skip this fast return path. */ - if (evt->ignore && !remote && group->parent) + if (ignore && !remote && group->parent) return true; raw_spin_lock(&group->lock); @@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + if ((evt->nextevt.expires == nextexp) && !ignore) { /* Make sure not to miss a new CPU event with the same expiry */ evt->cpu = first_childevt->cpu; goto check_toplvl; @@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, WRITE_ONCE(group->next_expiry, KTIME_MAX); } - if (evt->ignore) { + if (ignore) { /* * When the next child event could be ignored (nextexp is * KTIME_MAX) and there was no remote timer handling before or @@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); + /* + * If this is a new top-level, prepare its groupmask in advance. + * This avoids accidents where yet another new top-level is + * created in the future and made visible before the current groupmask. + */ + if (list_empty(&tmigr_level_list[lvl])) { + group->groupmask = BIT(0); + /* + * The previous top level has prepared its groupmask already, + * simply account it as the first child. + */ + if (lvl > 0) + group->num_children = 1; + } + timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); - child->parent = parent; - child->groupmask = BIT(parent->num_children++); + if (activate) { + /* + * @child is the old top and @parent the new one. In this + * case groupmask is pre-initialized and @child already + * accounted, along with its new sibling corresponding to the + * CPU going up. + */ + WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + } else { + /* Adding @child for the CPU going up to @parent. */ + child->groupmask = BIT(parent->num_children++); + } + + /* + * Make sure parent initialization is visible before publishing it to a + * racing CPU entering/exiting idle. This RELEASE barrier enforces an + * address dependency that pairs with the READ_ONCE() in __walk_groups(). + */ + smp_store_release(&child->parent, parent); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); @@ -1624,9 +1670,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * be different from tmigr_hierarchy_levels, contains only a * single group. */ - if (group->parent || i == tmigr_hierarchy_levels || - (list_empty(&tmigr_level_list[i]) && - list_is_singular(&tmigr_level_list[i - 1]))) + if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) break; } while (i < tmigr_hierarchy_levels); diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 154accc7a543..ae19f70f8170 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -110,22 +110,19 @@ struct tmigr_cpu { * union tmigr_state - state of tmigr_group * @state: Combined version of the state - only used for atomic * read/cmpxchg function - * @struct: Split version of the state - only use the struct members to + * &anon struct: Split version of the state - only use the struct members to * update information to stay independent of endianness + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator + * @seq: Sequence counter needs to be increased when an update + * to the tmigr_state is done. It prevents a race when + * updates in the child groups are propagated in changed + * order. Detailed information about the scenario is + * given in the documentation at the begin of + * timer_migration.c. */ union tmigr_state { u32 state; - /** - * struct - split state of tmigr_group - * @active: Contains each mask bit of the active children - * @migrator: Contains mask of the child which is migrator - * @seq: Sequence counter needs to be increased when an update - * to the tmigr_state is done. It prevents a race when - * updates in the child groups are propagated in changed - * order. Detailed information about the scenario is - * given in the documentation at the begin of - * timer_migration.c. - */ struct { u8 active; u8 migrator; |
