| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-10 12:00:46 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-10 12:00:46 -0800 |
| commit | 4d84667627c4ff70826b349c449bbaf63b9af4e5 | |
| tree | 27340dbd3ed4e7bbb8b95f53c1af4c4733331a47 | |
| parent | a9aabb3b839aba094ed80861054993785c61462c | |
| parent | 7db06e329af30dcb170a6782c1714217ad65033d | |
Merge tag 'perf-core-2026-02-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance event updates from Ingo Molnar:
"x86 PMU driver updates:
- Add support for the core PMU for Intel Diamond Rapids (DMR) CPUs
(Dapeng Mi)
Compared to previous iterations of the Intel PMU code, there's been
a lot of changes, which center around three main areas:
- Introduce the Off-Module Response (OMR) facility to replace the
Off-Core Response (OCR) facility
- New PEBS data source encoding layout
- Support the new "RDPMC user disable" feature
- Likewise, a large series adds uncore PMU support for Intel Diamond
Rapids (DMR) CPUs (Zide Chen)
This centers around these four main areas:
- DMR may have two Integrated I/O and Memory Hub (IMH) dies,
separate from the compute tile (CBB) dies. Each CBB and each IMH
die has its own discovery domain.
- Unlike prior CPUs that retrieve the global discovery table
portal exclusively via PCI or MSR, DMR uses PCI for IMH PMON
discovery and MSR for CBB PMON discovery.
- DMR introduces several new PMON types: SCA, HAMVF, D2D_ULA, UBR,
PCIE4, CRS, CPC, ITC, OTC, CMS, and PCIE6.
- IIO free-running counters in DMR are MMIO-based, unlike SPR.
- Also add missing PMON units for Intel Panther Lake, and add support
for Nova Lake (NVL), which largely maps to Panther Lake (Zide Chen)
- KVM integration: Add support for mediated vPMUs (by Kan Liang and
Sean Christopherson, with fixes and cleanups by Peter Zijlstra,
Sandipan Das and Mingwei Zhang)
- Add Intel cstate driver support for Wildcat Lake (WCL) CPUs, which
are a low-power variant of Panther Lake (Zide Chen)
- Add core, cstate and MSR PMU support for the Airmont NP Intel CPU
(aka MaxLinear Lightning Mountain), which maps to the existing
Airmont code (Martin Schiller)
Performance enhancements:
- Speed up kexec shutdown by avoiding unnecessary cross CPU calls
(Jan H. Schönherr)
- Fix slow perf_event_task_exit() with LBR callstacks (Namhyung Kim)
User-space stack unwinding support:
- Various cleanups and refactorings in preparation to generalize the
unwinding code for other architectures (Jens Remus)
Uprobes updates:
- Transition from kmap_atomic to kmap_local_page (Keke Ming)
- Fix incorrect lockdep condition in filter_chain() (Breno Leitao)
- Fix XOL allocation failure for 32-bit tasks (Oleg Nesterov)
Misc fixes and cleanups:
- s390: Remove kvm_types.h from Kbuild (Randy Dunlap)
- x86/intel/uncore: Convert comma to semicolon (Chen Ni)
- x86/uncore: Clean up const mismatch (Greg Kroah-Hartman)
- x86/ibs: Fix typo in dc_l2tlb_miss comment (Xiang-Bin Shi)"
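The uprobes item above moves temporary page mappings from kmap_atomic() to kmap_local_page(). A minimal sketch of the resulting pattern, mirroring uprobe_copy_from_page() in the kernel/events/uprobes.c hunks further down (the function name here is illustrative):

```c
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

/*
 * Sketch of the kmap_atomic() -> kmap_local_page() conversion: the
 * local mapping is only valid in the calling thread, and unlike
 * kmap_atomic() it does not disable preemption or pagefaults.
 */
static void copy_from_page_sketch(struct page *page, unsigned long vaddr,
				  void *dst, int len)
{
	void *kaddr = kmap_local_page(page);

	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
	kunmap_local(kaddr);
}
```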
* tag 'perf-core-2026-02-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (58 commits)
s390: remove kvm_types.h from Kbuild
uprobes: Fix incorrect lockdep condition in filter_chain()
x86/ibs: Fix typo in dc_l2tlb_miss comment
x86/uprobes: Fix XOL allocation failure for 32-bit tasks
perf/x86/intel/uncore: Convert comma to semicolon
perf/x86/intel: Add support for rdpmc user disable feature
perf/x86: Use macros to replace magic numbers in attr_rdpmc
perf/x86/intel: Add core PMU support for Novalake
perf/x86/intel: Add support for PEBS memory auxiliary info field in NVL
perf/x86/intel: Add core PMU support for DMR
perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
perf/x86/intel: Support the 4 new OMR MSRs introduced in DMR and NVL
perf/core: Fix slow perf_event_task_exit() with LBR callstacks
perf/core: Speed up kexec shutdown by avoiding unnecessary cross CPU calls
uprobes: use kmap_local_page() for temporary page mappings
arm/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol()
mips/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol()
arm64/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol()
riscv/uprobes: use kmap_local_page() in arch_uprobe_copy_ixol()
perf/x86/intel/uncore: Add Nova Lake support
...
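For the mediated-vPMU integration listed above, kernel/events/core.c (diffed below) enforces that a VM with a mediated vPMU and a !exclude_guest event on a PERF_PMU_CAP_MEDIATED_VPMU PMU never coexist. A condensed sketch of that accounting, with the counter and function names taken from the patch (PERF_PMU_CAP_MEDIATED_VPMU is a capability flag added by this series):

```c
#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/perf_event.h>

static atomic_t nr_include_guest_events;
static atomic_t nr_mediated_pmu_vms;
static DEFINE_MUTEX(perf_mediated_pmu_mutex);

/* !exclude_guest event on a PMU with PERF_PMU_CAP_MEDIATED_VPMU */
static bool is_include_guest_event(struct perf_event *event)
{
	return (event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) &&
	       !event->attr.exclude_guest;
}

/* Called at VM creation: fails if a conflicting event already exists. */
int perf_create_mediated_pmu(void)
{
	/* Fast path: another mediated-PMU VM already holds a reference. */
	if (atomic_inc_not_zero(&nr_mediated_pmu_vms))
		return 0;

	guard(mutex)(&perf_mediated_pmu_mutex);
	if (atomic_read(&nr_include_guest_events))
		return -EBUSY;

	atomic_inc(&nr_mediated_pmu_vms);
	return 0;
}

/* Called at event creation: fails if a mediated-PMU VM already exists. */
static int mediated_pmu_account_event(struct perf_event *event)
{
	if (!is_include_guest_event(event))
		return 0;

	if (atomic_inc_not_zero(&nr_include_guest_events))
		return 0;

	guard(mutex)(&perf_mediated_pmu_mutex);
	if (atomic_read(&nr_mediated_pmu_vms))
		return -EOPNOTSUPP;

	atomic_inc(&nr_include_guest_events);
	return 0;
}
```

Both counters use atomic_inc_not_zero() as an opportunistic fast path; only the 0 -> 1 transition takes the mutex, so the common case of many events or many VMs stays lock-free.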
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/events/core.c | 544 |
| -rw-r--r-- | kernel/events/uprobes.c | 24 |
| -rw-r--r-- | kernel/unwind/user.c | 12 |
3 files changed, 440 insertions, 140 deletions
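Much of the kernel/events/core.c churn below is bookkeeping around the new struct perf_time_ctx. The patch's own comment explains the trick: time' = time + (now - stamp) rearranges to now + (time - stamp), so a single published offset lets readers, including NMI context, compute event time without taking ctx->lock. A standalone illustration of that identity (the helper names here are illustrative, not the in-tree functions):

```c
#include <linux/compiler.h>
#include <linux/types.h>

/* Field layout follows the struct perf_time_ctx used in the patch. */
struct perf_time_ctx {
	u64 time;	/* accumulated time                   */
	u64 stamp;	/* timestamp of the last update       */
	u64 offset;	/* time - stamp, published to readers */
};

/* Writer side; called with ctx->lock held in the real code. */
static void sketch_update_time(struct perf_time_ctx *t, u64 now, bool adv)
{
	if (adv)
		t->time += now - t->stamp;
	t->stamp = now;
	WRITE_ONCE(t->offset, t->time - t->stamp);
}

/* Lock-free reader side: accumulated time as of "now". */
static u64 sketch_time_now(struct perf_time_ctx *t, u64 now)
{
	return now + READ_ONCE(t->offset);
}
```

For host-only (exclude_guest) events while a mediated guest is loaded, the patch keeps a second, guest-only instance of this structure and subtracts it, so the "now" terms cancel and the result reduces to total.offset - guest.offset, as __perf_event_time_ctx_now() in the diff shows.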
diff --git a/kernel/events/core.c b/kernel/events/core.c index 8cca80094624..e18119f30c29 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -57,6 +57,7 @@ #include <linux/task_work.h> #include <linux/percpu-rwsem.h> #include <linux/unwind_deferred.h> +#include <linux/kvm_types.h> #include "internal.h" @@ -166,6 +167,18 @@ enum event_type_t { EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, + /* + * EVENT_GUEST is set when scheduling in/out events between the host + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST + * is used: + * + * - In for_each_epc() to skip PMUs that don't support events in a + * MEDIATED_VPMU guest, i.e. don't need to be context switched. + * - To indicate the start/end point of the events in a guest. Guest + * running time is deducted for host-only (exclude_guest) events. + */ + EVENT_GUEST = 0x40, + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -458,6 +471,20 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static DEFINE_PER_CPU(bool, guest_ctx_loaded); + +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return __this_cpu_read(guest_ctx_loaded); +} +#else +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return false; +} +#endif + /* * perf event paranoia level: * -1 - not paranoid at all @@ -779,33 +806,97 @@ do { \ ___p; \ }) -#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ +static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) + return true; + if ((event_type & EVENT_GUEST) && + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) + return true; + return false; +} + +#define for_each_epc(_epc, _ctx, _pmu, _event_type) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ - if (_cgroup && !_epc->nr_cgroups) \ + if (perf_skip_pmu_ctx(_epc, _event_type)) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else -static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_disable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_disable(pmu_ctx->pmu); } -static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_enable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + if (adv) + time->time += now - time->stamp; + time->stamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. 
+ * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(time->offset, time->time - time->stamp); +} + +static_assert(offsetof(struct perf_event_context, timeguest) - + offsetof(struct perf_event_context, time) == + sizeof(struct perf_time_ctx)); + +#define T_TOTAL 0 +#define T_GUEST 1 + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[T_TOTAL].time; + + if (event->attr.exclude_guest) + time -= times[T_GUEST].time; + + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { + /* + * (now + times[total].offset) - (now + times[guest].offset) := + * times[total].offset - times[guest].offset + */ + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); + } + + return now + READ_ONCE(times[T_TOTAL].offset); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -842,12 +933,16 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } +static_assert(offsetof(struct perf_cgroup_info, timeguest) - + offsetof(struct perf_cgroup_info, time) == + sizeof(struct perf_time_ctx)); + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -856,20 +951,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time; - now += READ_ONCE(t->timeoffset); - return now; + return __perf_event_time_ctx(event, &t->time); + + return __perf_event_time_ctx_now(event, &t->time, now); } -static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) { - if (adv) - info->time += now - info->timestamp; - info->timestamp = now; - /* - * see update_context_time() - */ - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); + update_perf_time_ctx(&info->timeguest, now, adv); +} + +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) +{ + update_perf_time_ctx(&info->time, now, true); + if (is_guest_mediated_pmu_loaded()) + __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -885,7 +981,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, now, true); + update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } @@ -908,11 +1004,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - __update_cgrp_time(info, perf_clock(), true); + update_cgrp_time(info, perf_clock()); } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; 
struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -932,8 +1028,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, ctx->timestamp, false); - __store_release(&info->active, 1); + if (guest) { + __update_cgrp_guest_time(info, ctx->time.stamp, false); + } else { + update_perf_time_ctx(&info->time, ctx->time.stamp, false); + __store_release(&info->active, 1); + } } } @@ -964,8 +1064,7 @@ static void perf_cgroup_switch(struct task_struct *task) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_disable(&cpuctx->ctx, true); + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* @@ -981,7 +1080,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1138,7 +1237,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } @@ -1550,29 +1649,24 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - u64 now = perf_clock(); - lockdep_assert_held(&ctx->lock); - if (adv) - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; + update_perf_time_ctx(&ctx->time, perf_clock(), adv); +} - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + (time - timestamp), which gives a single value - * offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. 
- */ - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) +{ + lockdep_assert_held(&ctx->lock); + + /* must be called after __update_context_time(); */ + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); + if (is_guest_mediated_pmu_loaded()) + __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1585,7 +1679,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1599,10 +1693,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); - now += READ_ONCE(ctx->timeoffset); - return now; + return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) @@ -2422,20 +2515,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, + bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; + update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, final); + /* vPMU should not stop time */ + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - __ctx_time_update(cpuctx, ctx, false); + __ctx_time_update(cpuctx, ctx, false, 0); } /* @@ -2861,14 +2957,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* @@ -2902,11 +2999,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); @@ -2924,13 +3021,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { - 
for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } @@ -3479,11 +3576,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3507,14 +3603,14 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); - ctx->is_active &= ~event_type; + ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* @@ -3533,9 +3629,20 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule out all exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = EVENT_ALL; + __update_context_guest_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx, true); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3691,7 +3798,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || @@ -3715,7 +3822,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not @@ -3739,13 +3846,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } @@ -3992,10 +4099,15 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } +struct merge_sched_data { + int can_add_hw; + enum event_type_t event_type; +}; + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - int *can_add_hw = data; + struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4003,13 +4115,22 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, *can_add_hw)) { + /* + * Don't schedule in any host events from PMU with + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 
+ */ + if (is_guest_mediated_pmu_loaded() && + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && + !(msd->event_type & EVENT_GUEST)) + return 0; + + if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - *can_add_hw = 0; + msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4032,11 +4153,15 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - int can_add_hw = 1; + struct merge_sched_data msd = { + .can_add_hw = 1, + .event_type = event_type, + }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); + merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, @@ -4045,20 +4170,18 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4066,9 +4189,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t return; if (!(is_active & EVENT_TIME)) { + /* EVENT_TIME should be active while the guest runs */ + WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx); + perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -4076,7 +4201,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t barrier(); } - ctx->is_active |= (event_type | EVENT_TIME); + ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; @@ -4084,21 +4209,37 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule in the required exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = event_type & EVENT_ALL; + + /* + * Update ctx time to set the new start time for + * the exclude_guest events. + */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ if (is_active & EVENT_PINNED) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } @@ -4114,11 +4255,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4131,7 +4272,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4141,18 +4282,18 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, false); + perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, NULL); + perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, false); + perf_ctx_enable(&cpuctx->ctx, 0); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -5280,9 +5421,20 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, return -ENOMEM; for (;;) { - if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) { if (old) perf_free_ctx_data_rcu(old); + /* + * Above try_cmpxchg() pairs with try_cmpxchg() from + * detach_task_ctx_data() such that + * if we race with perf_event_exit_task(), we must + * observe PF_EXITING. 
+ */ + if (task->flags & PF_EXITING) { + /* detach_task_ctx_data() may free it already */ + if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); + } return 0; } @@ -5328,6 +5480,8 @@ again: /* Allocate everything */ scoped_guard (rcu) { for_each_process_thread(g, p) { + if (p->flags & PF_EXITING) + continue; cd = rcu_dereference(p->perf_ctx_data); if (cd && !cd->global) { cd->global = 1; @@ -5594,6 +5748,8 @@ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; + security_perf_event_free(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5647,6 +5803,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void mediated_pmu_unaccount_event(struct perf_event *event); + DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ @@ -5656,8 +5814,7 @@ static void _free_event(struct perf_event *event) irq_work_sync(&event->pending_disable_irq); unaccount_event(event); - - security_perf_event_free(event); + mediated_pmu_unaccount_event(event); if (event->rb) { /* @@ -6180,6 +6337,138 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static atomic_t nr_include_guest_events __read_mostly; + +static atomic_t nr_mediated_pmu_vms __read_mostly; +static DEFINE_MUTEX(perf_mediated_pmu_mutex); + +/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ +static inline bool is_include_guest_event(struct perf_event *event) +{ + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && + !event->attr.exclude_guest) + return true; + + return false; +} + +static int mediated_pmu_account_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return 0; + + if (atomic_inc_not_zero(&nr_include_guest_events)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_mediated_pmu_vms)) + return -EOPNOTSUPP; + + atomic_inc(&nr_include_guest_events); + return 0; +} + +static void mediated_pmu_unaccount_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return; + + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) + return; + + atomic_dec(&nr_include_guest_events); +} + +/* + * Currently invoked at VM creation to + * - Check whether there are existing !exclude_guest events of PMU with + * PERF_PMU_CAP_MEDIATED_VPMU + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU + * + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf + * still owns all the PMU resources. + */ +int perf_create_mediated_pmu(void) +{ + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_include_guest_events)) + return -EBUSY; + + atomic_inc(&nr_mediated_pmu_vms); + return 0; +} +EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); + +void perf_release_mediated_pmu(void) +{ + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) + return; + + atomic_dec(&nr_mediated_pmu_vms); +} +EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); + +/* When loading a guest's mediated PMU, schedule out all exclude_guest events. 
*/ +void perf_load_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); + if (cpuctx->task_ctx) { + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); + } + + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, true); +} +EXPORT_SYMBOL_GPL(perf_load_guest_context); + +void perf_put_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); + + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, false); +} +EXPORT_SYMBOL_GPL(perf_put_guest_context); +#else +static int mediated_pmu_account_event(struct perf_event *event) { return 0; } +static void mediated_pmu_unaccount_event(struct perf_event *event) {} +#endif + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -6548,22 +6837,22 @@ void perf_event_update_userpage(struct perf_event *event) goto unlock; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. + */ + preempt_disable(); + + /* + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we can be called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; - /* - * Disable preemption to guarantee consistent time stamps are stored to - * the user page. 
- */ - preempt_disable(); + ++userpg->lock; barrier(); userpg->index = perf_event_index(event); @@ -7383,6 +7672,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7397,6 +7687,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); + + if (cbs->handle_mediated_pmi) + static_call_update(__perf_guest_handle_mediated_pmi, + cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7408,8 +7702,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, - (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); @@ -7869,13 +8163,11 @@ static void perf_output_read(struct perf_output_handle *handle, u64 read_format = event->attr.read_format; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. 
*/ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); @@ -12043,7 +12335,7 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; - local64_set(&event->hw.prev_count, event->ctx->time); + local64_set(&event->hw.prev_count, event->ctx->time.time); perf_swevent_start_hrtimer(event); } @@ -12052,7 +12344,7 @@ static void task_clock_event_stop(struct perf_event *event, int flags) event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) - task_clock_event_update(event, event->ctx->time); + task_clock_event_update(event, event->ctx->time.time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -12072,8 +12364,8 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; + u64 delta = now - event->ctx->time.stamp; + u64 time = event->ctx->time.time + delta; task_clock_event_update(event, time); } @@ -13155,6 +13447,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) return ERR_PTR(err); + err = mediated_pmu_account_event(event); + if (err) + return ERR_PTR(err); + /* symmetric to unaccount_event() in _free_event() */ account_event(event); @@ -14294,8 +14590,11 @@ void perf_event_exit_task(struct task_struct *task) /* * Detach the perf_ctx_data for the system-wide event. + * + * Done without holding global_ctx_data_rwsem; typically + * attach_global_ctx_data() will skip over this task, but otherwise + * attach_task_ctx_data() will observe PF_EXITING. */ - guard(percpu_read)(&global_ctx_data_rwsem); detach_task_ctx_data(task); } @@ -14798,7 +15097,8 @@ static void perf_event_exit_cpu_context(int cpu) ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + if (ctx->nr_events) + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d546d32390a8..424ef2235b07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -179,16 +179,16 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, @@ -323,7 +323,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) return ret == 0 ? 
-EBUSY : ret; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { @@ -336,7 +336,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) *ptr += d; ret = 0; out: - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); return ret; } @@ -1138,7 +1138,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + list_for_each_entry(uc, &uprobe->consumers, cons_node) { ret = consumer_filter(uc, mm); if (ret) break; @@ -1694,6 +1694,12 @@ static const struct vm_special_mapping xol_mapping = { .mremap = xol_mremap, }; +unsigned long __weak arch_uprobe_get_xol_area(void) +{ + /* Try to map as high as possible, this is only a hint. */ + return get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); +} + /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { @@ -1709,9 +1715,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } if (!area->vaddr) { - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, - PAGE_SIZE, 0, 0); + area->vaddr = arch_uprobe_get_xol_area(); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 39e270789444..90ab3c1a205e 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -31,6 +31,7 @@ static int unwind_user_next_common(struct unwind_user_state *state, { unsigned long cfa, fp, ra; + /* Get the Canonical Frame Address (CFA) */ if (frame->use_fp) { if (state->fp < state->sp) return -EINVAL; @@ -38,11 +39,9 @@ static int unwind_user_next_common(struct unwind_user_state *state, } else { cfa = state->sp; } - - /* Get the Canonical Frame Address (CFA) */ cfa += frame->cfa_off; - /* stack going in wrong direction? */ + /* Make sure that stack is not going in wrong direction */ if (cfa <= state->sp) return -EINVAL; @@ -50,10 +49,11 @@ static int unwind_user_next_common(struct unwind_user_state *state, if (cfa & (state->ws - 1)) return -EINVAL; - /* Find the Return Address (RA) */ + /* Get the Return Address (RA) */ if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; + /* Get the Frame Pointer (FP) */ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; @@ -67,7 +67,6 @@ static int unwind_user_next_common(struct unwind_user_state *state, static int unwind_user_next_fp(struct unwind_user_state *state) { -#ifdef CONFIG_HAVE_UNWIND_USER_FP struct pt_regs *regs = task_pt_regs(current); if (state->topmost && unwind_user_at_function_start(regs)) { @@ -81,9 +80,6 @@ static int unwind_user_next_fp(struct unwind_user_state *state) ARCH_INIT_USER_FP_FRAME(state->ws) }; return unwind_user_next_common(state, &fp_frame); -#else - return -EINVAL; -#endif } static int unwind_user_next(struct unwind_user_state *state) |
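The new perf_load_guest_context()/perf_put_guest_context() pair in the core.c diff is meant to be called by the hypervisor, with interrupts disabled, around the window in which the guest owns the core PMU. A hypothetical caller (the function name and the VM-entry placeholder are illustrative; only the bracketing reflects what the patch requires):

```c
#include <linux/irqflags.h>
#include <linux/perf_event.h>	/* assumed to declare the new API */

static void mediated_vpmu_run_guest_sketch(void)
{
	unsigned long flags;

	/* Both helpers assert that interrupts are disabled. */
	local_irq_save(flags);

	/* Schedules out all exclude_guest events on mediated-vPMU PMUs. */
	perf_load_guest_context();

	/* ... load guest PMU state and enter the guest here ... */

	/* Schedules the host's exclude_guest events back in. */
	perf_put_guest_context();

	local_irq_restore(flags);
}
```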
