Diffstat (limited to 'kernel/events')
-rw-r--r--  kernel/events/Makefile              |   1
-rw-r--r--  kernel/events/callchain.c           |   4
-rw-r--r--  kernel/events/core.c                | 650
-rw-r--r--  kernel/events/hw_breakpoint.c       | 648
-rw-r--r--  kernel/events/hw_breakpoint_test.c  | 333
-rw-r--r--  kernel/events/internal.h            |   5
-rw-r--r--  kernel/events/ring_buffer.c         |  12
-rw-r--r--  kernel/events/uprobes.c             |  57
8 files changed, 1250 insertions(+), 460 deletions(-)
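Among the user-visible changes in this series, core.c extends the read_format ABI with PERF_FORMAT_LOST so a reader can retrieve the number of lost samples alongside the counter value. The following is a minimal userspace sketch, not part of the patch below: the struct layout follows perf_read_one() in the core.c hunk for a single (non-group) event with PERF_FORMAT_ID | PERF_FORMAT_LOST, and PERF_FORMAT_LOST is defined locally in case the installed uapi headers predate this series.

/* Hedged example: read the lost-sample count via PERF_FORMAT_LOST. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

#ifndef PERF_FORMAT_LOST
#define PERF_FORMAT_LOST (1U << 4)	/* assumption if headers are older */
#endif

int main(void)
{
	struct perf_event_attr attr;
	struct {
		uint64_t value;	/* counter value */
		uint64_t id;	/* PERF_FORMAT_ID */
		uint64_t lost;	/* PERF_FORMAT_LOST: lost samples */
	} rf;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.read_format = PERF_FORMAT_ID | PERF_FORMAT_LOST;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	if (read(fd, &rf, sizeof(rf)) == (ssize_t)sizeof(rf))
		printf("count=%llu id=%llu lost=%llu\n",
		       (unsigned long long)rf.value,
		       (unsigned long long)rf.id,
		       (unsigned long long)rf.lost);
	close(fd);
	return 0;
}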
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 8591c180b52b..91a62f566743 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 58cbe357fb2b..1273be84392c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { - mm_segment_t fs; - if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 76c754e45d01..7f04f995c975 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -54,6 +54,7 @@ #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> +#include <linux/task_work.h> #include "internal.h" @@ -574,8 +575,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -781,7 +781,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; - struct perf_cgroup *cgrp; /* * ensure we access cgroup data only when needed and @@ -790,21 +789,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current, event->ctx); + info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { - info = this_cpu_ptr(event->cgrp->info); + if (info->active) __update_cgrp_time(info, perf_clock(), true); - } } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; @@ -813,10 +810,10 @@ perf_cgroup_set_timestamp(struct task_struct *task, * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ - if (!task || !ctx->nr_cgroups) + if (!cgrp) return; - cgrp = perf_cgroup_from_task(task, ctx); + WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); @@ -828,18 +825,13 @@ perf_cgroup_set_timestamp(struct task_struct *task, static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - /* * reschedule events based on the cgroup constraint of task. 
- * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next */ -static void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task) { - struct perf_cpu_context *cpuctx; + struct perf_cgroup *cgrp; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -849,35 +841,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) */ local_irq_save(flags); + cgrp = perf_cgroup_from_task(task, NULL); + list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + continue; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + cpu_ctx_sched_in(cpuctx, EVENT_ALL); - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, - &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -885,58 +873,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(next, NULL); - - /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); - - rcu_read_unlock(); -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(prev, NULL); - - /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. 
- */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); - - rcu_read_unlock(); -} - static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { @@ -1032,22 +968,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); - /* - * Since setting cpuctx->cgrp is conditional on the current @cgrp - * matching the event's cgroup, we must do this for every new event, - * because if the first would mismatch, the second would not try again - * and we would leave cpuctx->cgrp unset. - */ - if (ctx->is_active && !cpuctx->cgrp) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - cpuctx->cgrp = cgrp; - } - if (ctx->nr_cgroups++) return; + cpuctx->cgrp = perf_cgroup_from_task(current, ctx); list_add(&cpuctx->cgrp_cpuctx_entry, per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } @@ -1069,9 +993,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (--ctx->nr_cgroups) return; - if (ctx->is_active && cpuctx->cgrp) - cpuctx->cgrp = NULL; - + cpuctx->cgrp = NULL; list_del(&cpuctx->cgrp_cpuctx_entry); } @@ -1100,16 +1022,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, { } -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ -} - static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) @@ -1118,13 +1030,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -static inline void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } @@ -1147,6 +1053,10 @@ static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } + +static void perf_cgroup_switch(struct task_struct *task) +{ +} #endif /* @@ -1559,6 +1469,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; @@ -1910,6 +1822,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings) if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); @@ -2312,16 +2227,22 @@ static inline int __pmu_filter_match(struct perf_event *event) static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; + unsigned long flags; + int ret = 1; if (!__pmu_filter_match(event)) return 0; + local_irq_save(flags); for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; + if (!__pmu_filter_match(sibling)) { + ret = 0; + break; + } } + local_irq_restore(flags); - return 1; + return ret; } static inline int @@ -2356,11 +2277,27 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (READ_ONCE(event->pending_disable) >= 0) { - 
WRITE_ONCE(event->pending_disable, -1); + if (event->pending_disable) { + event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } + + if (event->pending_sigtrap) { + bool dec = true; + + event->pending_sigtrap = 0; + if (state != PERF_EVENT_STATE_OFF && + !event->pending_work) { + event->pending_work = 1; + dec = false; + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); + task_work_add(current, &event->pending_task, TWA_RESUME); + } + if (dec) + local_dec(&event->ctx->nr_pending); + } + perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2400,6 +2337,7 @@ group_sched_out(struct perf_event *group_event, #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL +#define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event @@ -2420,12 +2358,20 @@ __perf_remove_from_context(struct perf_event *event, update_cgrp_time_from_cpuctx(cpuctx, false); } + /* + * Ensure event_sched_out() switches to OFF, at the very least + * this avoids raising perf_pending_task() at this time. + */ + if (flags & DETACH_DEAD) + event->pending_disable = 1; event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); + if (flags & DETACH_DEAD) + event->state = PERF_EVENT_STATE_DEAD; if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) @@ -2512,7 +2458,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
*/ @@ -2551,9 +2497,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - WRITE_ONCE(event->pending_disable, smp_processor_id()); - /* can fail, see perf_pending_event_disable() */ - irq_work_queue(&event->pending); + event->pending_disable = 1; + irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -2713,8 +2658,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, @@ -2730,15 +2674,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - struct task_struct *task) + struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); } /* @@ -2788,7 +2731,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (ctx_event_type & EVENT_PINNED) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -3011,7 +2954,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3020,7 +2963,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3238,6 +3181,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3260,10 +3212,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; @@ -3494,11 +3453,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + perf_pmu_disable(pmu); + + /* PMIs are disabled; ctx->nr_pending is stable. 
*/ + if (local_read(&ctx->nr_pending) || + local_read(&next_ctx->nr_pending)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_pmu_disable(pmu); - if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); @@ -3539,6 +3510,7 @@ unlock: raw_spin_lock(&ctx->lock); perf_pmu_disable(pmu); +inside_switch: if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); @@ -3652,7 +3624,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_out(task, next); + perf_cgroup_switch(next); } /* @@ -3849,8 +3821,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { int is_active = ctx->is_active; @@ -3862,7 +3833,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (is_active ^ EVENT_TIME) { /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(task, ctx); + perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -3893,12 +3864,11 @@ ctx_sched_in(struct perf_event_context *ctx, } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); + ctx_sched_in(ctx, cpuctx, event_type); } static void perf_event_context_sched_in(struct perf_event_context *ctx, @@ -3940,7 +3910,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + perf_event_sched_in(cpuctx, ctx); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(cpuctx->task_ctx, true); @@ -3968,16 +3938,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; - /* - * If cgroup events exist on this CPU, then we need to check if we have - * to switch in PMU state; cgroup event are system-wide mode only. - * - * Since cgroup events are CPU events, we must schedule these in before - * we schedule in the task events. 
- */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); - for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -4251,7 +4211,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4323,7 +4283,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4346,7 +4306,6 @@ static void perf_event_remove_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *event, *next; - LIST_HEAD(free_list); unsigned long flags; bool modified = false; @@ -4544,7 +4503,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 __enabled, __running, __now;; + u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) @@ -5018,7 +4977,7 @@ static void perf_addr_filters_splice(struct perf_event *event, static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); unaccount_event(event); @@ -5172,9 +5131,7 @@ int perf_event_release_kernel(struct perf_event *event) ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); - raw_spin_lock_irq(&ctx->lock); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. @@ -5186,8 +5143,7 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. 
*/ - event->state = PERF_EVENT_STATE_DEAD; - raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); @@ -5350,11 +5306,15 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -5411,7 +5371,7 @@ static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); @@ -5421,6 +5381,8 @@ static int perf_read_one(struct perf_event *event, values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -6336,17 +6298,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close() through - * perf_event_set_output(). Try again, hope for better - * luck. + * Raced against perf_mmap_close(); remove the + * event and try again. */ + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } @@ -6512,32 +6474,43 @@ static void perf_sigtrap(struct perf_event *event) return; /* - * perf_pending_event() can race with the task exiting. + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ if (current->flags & PF_EXITING) return; - force_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + send_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } -static void perf_pending_event_disable(struct perf_event *event) +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_irq(struct perf_event *event) { - int cpu = READ_ONCE(event->pending_disable); + int cpu = READ_ONCE(event->oncpu); + /* + * If the event isn't running; we done. event_sched_out() will have + * taken care of things. + */ if (cpu < 0) return; + /* + * Yay, we hit home and are in the context of the event. + */ if (cpu == smp_processor_id()) { - WRITE_ONCE(event->pending_disable, -1); - - if (event->attr.sigtrap) { + if (event->pending_sigtrap) { + event->pending_sigtrap = 0; perf_sigtrap(event); - atomic_set_release(&event->event_limit, 1); /* rearm event */ - return; + local_dec(&event->ctx->nr_pending); + } + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); } - - perf_event_disable_local(event); return; } @@ -6557,35 +6530,64 @@ static void perf_pending_event_disable(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_event() + * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. 
*/ - irq_work_queue_on(&event->pending, cpu); + irq_work_queue_on(&event->pending_irq, cpu); } -static void perf_pending_event(struct irq_work *entry) +static void perf_pending_irq(struct irq_work *entry) { - struct perf_event *event = container_of(entry, struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ + rctx = perf_swevent_get_recursion_context(); - perf_pending_event_disable(event); - + /* + * The wakeup isn't bound to the context of the event -- it can happen + * irrespective of where the event is. + */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } + __perf_pending_irq(event); + if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } +static void perf_pending_task(struct callback_head *head) +{ + struct perf_event *event = container_of(head, struct perf_event, pending_task); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_pending); + } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); + + put_event(event); +} + #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; @@ -6730,7 +6732,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; - mm_segment_t fs; /* * We dump: @@ -6748,9 +6749,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. 
*/ sp = perf_user_stack_pointer(regs); - fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -6878,11 +6877,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6911,7 +6909,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -6951,7 +6949,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = perf_event_count(event); @@ -6965,6 +6963,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } @@ -6975,9 +6975,16 @@ static void perf_output_read_group(struct perf_output_handle *handle, { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; - u64 values[5]; + unsigned long flags; + u64 values[6]; int n = 0; + /* + * Disabling interrupts avoids all counter scheduling + * (context switches, timer based rotation and IPIs). 
+ */ + local_irq_save(flags); + values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -6993,6 +7000,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); @@ -7006,9 +7015,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } + + local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ @@ -7045,11 +7058,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7131,14 +7139,14 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7381,6 +7389,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7388,7 +7397,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. 
+ */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); @@ -7396,7 +7410,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; @@ -7408,7 +7422,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7424,6 +7438,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; @@ -7431,8 +7446,8 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { - if (perf_sample_save_hw_index(event)) + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr @@ -7481,6 +7496,20 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7496,7 +7525,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF @@ -9009,7 +9039,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, - prog->aux->ksym.name); + subprog->aux->ksym.name); } } } @@ -9252,13 +9282,26 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. 
*/ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -9281,24 +9324,59 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_addr = data->addr; - perf_event_disable_inatomic(event); } + if (event->attr.sigtrap) { + /* + * The desired behaviour of sigtrap vs invalid samples is a bit + * tricky; on the one hand, one should not loose the SIGTRAP if + * it is the first event, on the other hand, we should also not + * trigger the WARN or override the data address. + */ + bool valid_sample = sample_is_allowed(event, regs); + unsigned int pending_id = 1; + + if (regs) + pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; + if (!event->pending_sigtrap) { + event->pending_sigtrap = pending_id; + local_inc(&event->ctx->nr_pending); + } else if (event->attr.exclude_kernel && valid_sample) { + /* + * Should not be able to return to user space without + * consuming pending_sigtrap; with exceptions: + * + * 1. Where !exclude_kernel, events can overflow again + * in the kernel without returning to user space. + * + * 2. Events that can overflow again before the IRQ- + * work without user space progress (e.g. hrtimer). + * To approximate progress (with false negatives), + * check 32-bit hash of the current IP. + */ + WARN_ON_ONCE(event->pending_sigtrap != pending_id); + } + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + irq_work_queue(&event->pending_irq); + } + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -9813,6 +9891,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_sample_data_init(&data, 0, 0); data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; perf_trace_buf_update(record, event_type); @@ -10067,8 +10146,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10094,7 +10181,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* @@ -10161,26 +10248,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event) int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { - bool is_kprobe, is_tracepoint, 
is_syscall_tp; + bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; + is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); - if (!is_kprobe && !is_tracepoint && !is_syscall_tp) + if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) + /* only uprobe programs are allowed to be sleepable */ + return -EINVAL; + /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { @@ -10303,8 +10394,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; @@ -10515,8 +10607,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region @@ -10558,8 +10648,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } @@ -11006,7 +11099,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -11017,7 +11110,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -11028,7 +11121,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11601,8 +11694,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); - event->pending_disable = -1; - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11621,8 +11714,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; - if 
(event->attr.sigtrap) - atomic_set(&event->event_limit, 1); + if (parent_event) + event->event_caps = parent_event->event_caps; if (task) { event->attach_state = PERF_ATTACH_TASK; @@ -11779,11 +11872,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } @@ -11914,14 +12005,25 @@ err_size: goto out; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; - if (!output_event) + if (!output_event) { + mutex_lock(&event->mmap_mutex); goto set; + } /* don't allow circular references */ if (event == output_event) @@ -11959,8 +12061,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) event->pmu != output_event->pmu) goto out; + /* + * Hold both mmap_mutex to serialize against perf_mmap_close(). Since + * output_event is already on rb->event_list, and the list iteration + * restarts after every removal, it is guaranteed this new event is + * observed *OR* if output_event is already removed, it's guaranteed we + * observe !rb->mmap_count. + */ + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: - mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; @@ -11970,6 +12079,12 @@ set: rb = ring_buffer_get(output_event); if (!rb) goto unlock; + + /* did we race against perf_mmap_close() */ + if (!atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); + goto unlock; + } } ring_buffer_attach(event, rb); @@ -11977,20 +12092,13 @@ set: ret = 0; unlock: mutex_unlock(&event->mmap_mutex); + if (output_event) + mutex_unlock(&output_event->mmap_mutex); out: return ret; } -static void mutex_lock_double(struct mutex *a, struct mutex *b) -{ - if (b < a) - swap(a, b); - - mutex_lock(a); - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); -} - static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; @@ -12305,6 +12413,9 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. + * + * Racy, not holding group_leader->ctx->mutex, see comment with + * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; @@ -12370,6 +12481,7 @@ SYSCALL_DEFINE5(perf_event_open, } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; + goto not_move_group; } } @@ -12386,7 +12498,17 @@ SYSCALL_DEFINE5(perf_event_open, } } else { mutex_lock(&ctx->mutex); + + /* + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, + * see the group_leader && !move_group test earlier. 
+ */ + if (group_leader && group_leader->ctx != ctx) { + err = -EINVAL; + goto err_locked; + } } +not_move_group: if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; @@ -13548,7 +13670,7 @@ static int __perf_cgroup_move(void *info) { struct task_struct *task = info; rcu_read_lock(); - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + perf_cgroup_switch(task); rcu_read_unlock(); return 0; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..c3797701339c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -17,61 +17,276 @@ * This file contains the arch-independent routines. */ +#include <linux/hw_breakpoint.h> + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/cpu.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/irqflags.h> -#include <linux/kallsyms.h> -#include <linux/notifier.h> -#include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/percpu-rwsem.h> #include <linux/percpu.h> +#include <linux/rhashtable.h> #include <linux/sched.h> -#include <linux/init.h> #include <linux/slab.h> -#include <linux/list.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/bug.h> -#include <linux/hw_breakpoint.h> /* - * Constraints data + * Datastructure to track the total uses of N slots across tasks or CPUs; + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. + */ +struct bp_slots_histogram { +#ifdef hw_breakpoint_slots + atomic_t count[hw_breakpoint_slots(0)]; +#else + atomic_t *count; +#endif +}; + +/* + * Per-CPU constraints data. */ struct bp_cpuinfo { - /* Number of pinned cpu breakpoints in a cpu */ - unsigned int cpu_pinned; - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ - unsigned int *tsk_pinned; - /* Number of non-pinned cpu/task breakpoints in a cpu */ - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ + /* Number of pinned CPU breakpoints in a CPU. */ + unsigned int cpu_pinned; + /* Histogram of pinned task breakpoints in a CPU. */ + struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX]; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } +/* Number of pinned CPU breakpoints globally. */ +static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; +/* Number of pinned CPU-independent task breakpoints. */ +static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; + /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, +}; -static int constraints_initialized; +static bool constraints_initialized __ro_after_init; -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; +/* + * Synchronizes accesses to the per-CPU constraints; the locking rules are: + * + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock + * (due to bp_slots_histogram::count being atomic, no update are lost). + * + * 2. 
Holding a write-lock is required for computations that require a + * stable snapshot of all bp_cpuinfo::tsk_pinned. + * + * 3. In all other cases, non-atomic accesses require the appropriately held + * lock (read-lock for read-only accesses; write-lock for reads/writes). + */ +DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); +/* + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since + * rhltable synchronizes concurrent insertions/deletions, independent tasks may + * insert/delete concurrently; therefore, a mutex per task is sufficient. + * + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is + * that hw_breakpoint may contend with per-task perf event list management. The + * assumption is that perf usecases involving hw_breakpoints are very unlikely + * to result in unnecessary contention. + */ +static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; -__weak int hw_breakpoint_weight(struct perf_event *bp) + return tsk ? &tsk->perf_event_mutex : NULL; +} + +static struct mutex *bp_constraints_lock(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) { + /* + * Fully analogous to the perf_try_init_event() nesting + * argument in the comment near perf_event_ctx_lock_nested(); + * this child->perf_event_mutex cannot ever deadlock against + * the parent->perf_event_mutex usage from + * perf_event_task_{en,dis}able(). + * + * Specifically, inherited events will never occur on + * ->perf_event_list. + */ + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); + percpu_down_read(&bp_cpuinfo_sem); + } else { + percpu_down_write(&bp_cpuinfo_sem); + } + + return tsk_mtx; +} + +static void bp_constraints_unlock(struct mutex *tsk_mtx) +{ + if (tsk_mtx) { + percpu_up_read(&bp_cpuinfo_sem); + mutex_unlock(tsk_mtx); + } else { + percpu_up_write(&bp_cpuinfo_sem); + } +} + +static bool bp_constraints_is_locked(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + return percpu_is_write_locked(&bp_cpuinfo_sem) || + (tsk_mtx ? mutex_is_locked(tsk_mtx) : + percpu_is_read_locked(&bp_cpuinfo_sem)); +} + +static inline void assert_bp_constraints_lock_held(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) + lockdep_assert_held(tsk_mtx); + lockdep_assert_held(&bp_cpuinfo_sem); +} + +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. 
+ */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init bool +bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); + return hist->count; +} + +static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) +{ + kfree(hist->count); +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) + goto err; + } + } + for (i = 0; i < TYPE_MAX; i++) { + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) + goto err; + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) + goto err; + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + for (i = 0; i < TYPE_MAX; i++) { + bp_slots_histogram_free(&cpu_pinned[i]); + bp_slots_histogram_free(&tsk_pinned_all[i]); + } + + return -ENOMEM; +} +#endif + +static inline void +bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) +{ + const int old_idx = old - 1; + const int new_idx = old_idx + val; + + if (old_idx >= 0) + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); + if (new_idx >= 0) + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); +} + +static int +bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count = atomic_read(&hist->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); + if (count > 0) + return i + 1; + WARN(count < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +static int +bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, + enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count1 = atomic_read(&hist1->count[i]); + const int count2 = atomic_read(&hist2->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); + if (count1 + count2 > 0) + return i + 1; + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { @@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) } /* - * Report the maximum number of pinned breakpoints a task - * have in this cpu + * Return the maximum number of pinned breakpoints a task has in this CPU. 
*/ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int i; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; + /* + * At this point we want to have acquired the bp_cpuinfo_sem as a + * writer to ensure that there are no concurrent writers in + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. + */ + lockdep_assert_held_write(&bp_cpuinfo_sem); + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. + * + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, + * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && - (iter->cpu < 0 || cpu == iter->cpu)) - count += hw_breakpoint_weight(iter); + /* + * We need a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) != type) + continue; + + if (iter->cpu >= 0) { + if (cpu == -1) { + count = -1; + goto out; + } else if (cpu != iter->cpu) + continue; + } + + count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp) } /* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). + * Returns the max pinned breakpoint slots in a given + * CPU (cpu > -1) or across all of them (cpu = -1). */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) +static int +max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); + int pinned_slots = 0; int cpu; + if (bp->hw.target && bp->cpu < 0) { + int max_pinned = task_bp_pinned(-1, bp, type); + + if (max_pinned >= 0) { + /* + * Fast path: task_bp_pinned() is CPU-independent and + * returns the same value for any CPU. + */ + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); + return max_pinned; + } + } + for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; @@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, else nr += task_bp_pinned(cpu, bp, type); - if (nr > slots->pinned) - slots->pinned = nr; - - nr = info->flexible; - if (nr > slots->flexible) - slots->flexible = nr; + pinned_slots = max(nr, pinned_slots); } -} -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. 
- */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int old_idx, new_idx; - - old_idx = task_bp_pinned(cpu, bp, type) - 1; - new_idx = old_idx + weight; - - if (old_idx >= 0) - tsk_pinned[old_idx]--; - if (new_idx >= 0) - tsk_pinned[new_idx]++; + return pinned_slots; } /* * Add/remove the given breakpoint in our constraint table */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) +static int +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - const struct cpumask *cpumask = cpumask_of_bp(bp); - int cpu; + int cpu, next_tsk_pinned; if (!enable) weight = -weight; - /* Pinned counter cpu profiling */ if (!bp->hw.target) { - get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + /* + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the + * global histogram. + */ + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + + lockdep_assert_held_write(&bp_cpuinfo_sem); + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; + return 0; + } + + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + + /* + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global + * histogram. We need to take care of 4 cases: + * + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only + * exist other task breakpoints targeting all CPUs. In this case we + * can simply update the global slots histogram. + * + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may + * only exist other task breakpoints targeting all CPUs. + * + * a. On enable: remove the existing breakpoints from the global + * slots histogram and use the per-CPU histogram. + * + * b. On disable: re-insert the existing breakpoints into the global + * slots histogram and remove from per-CPU histogram. + * + * 3. Some other existing task breakpoints target specific CPUs. Only + * update the per-CPU slots histogram. + */ + + if (!enable) { + /* + * Remove before updating histograms so we can determine if this + * was the last task breakpoint for a specific CPU. + */ + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + if (ret) + return ret; + } + /* + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. + */ + next_tsk_pinned = task_bp_pinned(-1, bp, type); + + if (next_tsk_pinned >= 0) { + if (bp->cpu < 0) { /* Case 1: fast path */ + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); + } else if (enable) { /* Case 2.a: slow path */ + /* Add existing to per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + 0, next_tsk_pinned); + } + /* Add this first CPU-pinned task breakpoint. 
*/ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, + -next_tsk_pinned); + } else { /* Case 2.b: slow path */ + /* Remove this last CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned + hw_breakpoint_weight(bp), weight); + /* Remove all from per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, -next_tsk_pinned); + } + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); + } + } else { /* Case 3: slow path */ + const struct cpumask *cpumask = cpumask_of_bp(bp); + + for_each_cpu(cpu, cpumask) { + next_tsk_pinned = task_bp_pinned(cpu, bp, type); + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + } } - /* Pinned counter task profiling */ - for_each_cpu(cpu, cpumask) - toggle_bp_task_slot(bp, cpu, type, weight); + /* + * Readers want a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); - else - list_del(&bp->hw.bp_list); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + return 0; } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Constraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter. + * + * Note: Flexible breakpoints are currently unimplemented, but outlined in the + * below algorithm for completeness. The implementation treats flexible as + * pinned due to no guarantee that we currently always schedule flexible events + * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * @@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { - struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int max_pinned_slots; int weight; int ret; @@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. - */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + /* Check if this new breakpoint can be satisfied across all CPUs. 
*/ + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); if (ret) return ret; - toggle_bp_slot(bp, true, type, weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) { - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); + bp_constraints_unlock(mtx); return ret; } @@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) { - mutex_lock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) @@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { - int ret; + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __modify_bp_slot(bp, old_type, new_type); - mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type, new_type); - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); return ret; } @@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) */ int dbg_reserve_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + int ret; + + if (bp_constraints_is_locked(bp)) return -1; - return __reserve_bp_slot(bp, bp->attr.bp_type); + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); + + return ret; } int dbg_release_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + if (bp_constraints_is_locked(bp)) return -1; + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); return 0; } @@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. + */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + if (atomic_read(&info->tsk_pinned.count[slot])) + return true; + } + } + } + + for (int type = 0; type < TYPE_MAX; ++type) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + /* + * Warn, because if there are CPU pinned counters, + * should never get here; bp_cpuinfo::cpu_pinned should + * be consistent with the global cpu_pinned histogram. 
+ */ + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) + return true; + + if (atomic_read(&tsk_pinned_all[type].count[slot])) + return true; + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ @@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); + int ret; - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + return ret; - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; - } - } + ret = init_breakpoint_slots(); + if (ret) + return ret; - constraints_initialized = 1; + constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return -ENOMEM; } - - diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c new file mode 100644 index 000000000000..c57610f52bb4 --- /dev/null +++ b/kernel/events/hw_breakpoint_test.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for hw_breakpoint constraints accounting logic. + * + * Copyright (C) 2022, Google LLC. + */ + +#include <kunit/test.h> +#include <linux/cpumask.h> +#include <linux/hw_breakpoint.h> +#include <linux/kthread.h> +#include <linux/perf_event.h> +#include <asm/hw_breakpoint.h> + +#define TEST_REQUIRES_BP_SLOTS(test, slots) \ + do { \ + if ((slots) > get_test_bp_slots()) { \ + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ + get_test_bp_slots()); \ + } \ + } while (0) + +#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) + +#define MAX_TEST_BREAKPOINTS 512 + +static char break_vars[MAX_TEST_BREAKPOINTS]; +static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; +static struct task_struct *__other_task; + +static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) +{ + struct perf_event_attr attr = {}; + + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) + return NULL; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)&break_vars[idx]; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_RW; + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); +} + +static void unregister_test_bp(struct perf_event **bp) +{ + if (WARN_ON(IS_ERR(*bp))) + return; + if (WARN_ON(!*bp)) + return; + unregister_hw_breakpoint(*bp); + *bp = NULL; +} + +static int get_test_bp_slots(void) +{ + static int slots; + + if (!slots) + slots = hw_breakpoint_slots(TYPE_DATA); + + return slots; +} + +static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) +{ + struct perf_event *bp = register_test_bp(cpu, tsk, *id); + + KUNIT_ASSERT_NOT_NULL(test, bp); + KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); + KUNIT_ASSERT_NULL(test, test_bps[*id]); + test_bps[(*id)++] = bp; +} + +/* + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. + * + * Returns true if this can be called again, continuing at @id. 
+ */ +static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) +{ + for (int i = 0; i < get_test_bp_slots() - skip; ++i) + fill_one_bp_slot(test, id, cpu, tsk); + + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; +} + +static int dummy_kthread(void *arg) +{ + return 0; +} + +static struct task_struct *get_other_task(struct kunit *test) +{ + struct task_struct *tsk; + + if (__other_task) + return __other_task; + + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); + __other_task = tsk; + return __other_task; +} + +static int get_test_cpu(int num) +{ + int cpu; + + WARN_ON(num < 0); + + for_each_online_cpu(cpu) { + if (num-- <= 0) + break; + } + + return cpu; +} + +/* ===== Test cases ===== */ + +static void test_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_many_cpus(struct kunit *test) +{ + int idx = 0; + int cpu; + + /* Test that CPUs are independent. */ + for_each_online_cpu(cpu) { + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); + + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); + if (!do_continue) + break; + } +} + +static void test_one_task_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, -1, current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one and adding back CPU-target should work. */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_two_tasks_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + /* Test that tasks are independent. */ + fill_bp_slots(test, &idx, -1, current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one from first task and adding back CPU-target should not work. */ + unregister_test_bp(&test_bps[0]); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_one_task_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* + * Remove one and adding back CPU-target should work; this case is + * special vs. above because the task's constraints are CPU-dependent. 
+ */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_one_task_mixed(struct kunit *test) +{ + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_bp_slots(test, &idx, -1, current, 1); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* Transition from CPU-dependent pinned count to CPU-independent. */ + unregister_test_bp(&test_bps[0]); + unregister_test_bp(&test_bps[1]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_two_tasks_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Can still create breakpoints on some other CPU. */ + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); +} + +static void test_two_tasks_on_one_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Cannot create breakpoints on some other CPU either. */ + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static void test_task_on_all_and_one_cpu(struct kunit *test) +{ + int tsk_on_cpu_idx, cpu_idx; + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_bp_slots(test, &idx, -1, current, 2); + /* Transitioning from only all CPU breakpoints to mixed. */ + tsk_on_cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* We should still be able to use up another CPU's slots. */ + cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); + + /* Transitioning back to task target on all CPUs. */ + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); + /* Still have a CPU target breakpoint in get_test_cpu(1). */ + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + /* Remove it and try again. 
*/ + unregister_test_bp(&test_bps[cpu_idx]); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static struct kunit_case hw_breakpoint_test_cases[] = { + KUNIT_CASE(test_one_cpu), + KUNIT_CASE(test_many_cpus), + KUNIT_CASE(test_one_task_on_all_cpus), + KUNIT_CASE(test_two_tasks_on_all_cpus), + KUNIT_CASE(test_one_task_on_one_cpu), + KUNIT_CASE(test_one_task_mixed), + KUNIT_CASE(test_two_tasks_on_one_cpu), + KUNIT_CASE(test_two_tasks_on_one_all_cpus), + KUNIT_CASE(test_task_on_all_and_one_cpu), + {}, +}; + +static int test_init(struct kunit *test) +{ + /* Most test cases want 2 distinct CPUs. */ + if (num_online_cpus() < 2) + kunit_skip(test, "not enough cpus"); + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + kunit_skip(test, "hw breakpoint already in use"); + + return 0; +} + +static void test_exit(struct kunit *test) +{ + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { + if (test_bps[i]) + unregister_test_bp(&test_bps[i]); + } + + if (__other_task) { + kthread_stop(__other_task); + __other_task = NULL; + } + + /* Verify that internal state agrees that no breakpoints are in use. */ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); +} + +static struct kunit_suite hw_breakpoint_test_suite = { + .name = "hw_breakpoint", + .test_cases = hw_breakpoint_test_cases, + .init = test_init, + .exit = test_exit, +}; + +kunit_test_suites(&hw_breakpoint_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 082832738c8f..5150d5f84c03 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 52868716ec35..273a0fe7910a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); + irq_work_queue(&handle->event->pending_irq); } /* @@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle, goto out; if (unlikely(rb->paused)) { - if (rb->nr_pages) + if (rb->nr_pages) { local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); + } goto out; } @@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle, fail: local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -859,11 +862,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..d9e357b7e17c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include 
<linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ -#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,12 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = compound_head(old_page), - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -167,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -183,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_get(new_folio); + page_add_new_anon_rmap(new_page, vma, addr); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -201,19 +199,16 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + page_remove_rmap(old_page, vma, false); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } @@ -356,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -559,7 +555,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } @@ -794,10 +790,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, struct page *page; /* * Ensure that the page that has the original instruction is populated - * and in page-cache. 
If ->readpage == NULL it must be shmem_mapping(), + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->readpage) + if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -1150,7 +1146,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ - if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) + if (!inode->i_mapping->a_ops->read_folio && + !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) @@ -1237,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1989,9 +1987,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* |

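A note on the accounting model used by the hw_breakpoint.c hunks above: the old per-CPU tsk_pinned arrays become bp_slots_histogram objects, updated through bp_slots_histogram_add() and queried through bp_slots_histogram_max() and bp_slots_histogram_max_merge(), whose definitions sit in an earlier part of this diff and are not reproduced here. As a rough mental model, bucket i of a histogram counts how many tasks currently occupy exactly i + 1 breakpoint slots, so the highest non-empty bucket gives the worst-case slot usage of any single task. The plain C analogue below is only an illustration under that assumption (simple ints instead of atomic counters, a fixed MAX_SLOTS instead of hw_breakpoint_slots_cached(), made-up names); it is not the kernel's implementation.

/*
 * Illustrative userspace analogue of the slots histogram; not kernel code.
 * Buckets hold plain ints here; the kernel uses atomic counters.
 */
#include <assert.h>
#include <stdio.h>

#define MAX_SLOTS 4	/* stand-in for hw_breakpoint_slots_cached(type) */

struct slots_histogram {
	/* count[i]: number of tasks currently using exactly i + 1 slots */
	int count[MAX_SLOTS];
};

/* Move one task from bucket (old_pinned - 1) to bucket (old_pinned - 1 + val). */
static void histogram_add(struct slots_histogram *h, int old_pinned, int val)
{
	int old_idx = old_pinned - 1;
	int new_idx = old_idx + val;

	if (old_idx >= 0)
		h->count[old_idx]--;
	if (new_idx >= 0)
		h->count[new_idx]++;
}

/* Highest occupied bucket: the most slots any single task is using. */
static int histogram_max(const struct slots_histogram *h)
{
	for (int i = MAX_SLOTS - 1; i >= 0; i--) {
		if (h->count[i] > 0)
			return i + 1;
	}
	return 0;
}

/* Same, but over two histograms merged bucket-wise. */
static int histogram_max_merge(const struct slots_histogram *a,
			       const struct slots_histogram *b)
{
	for (int i = MAX_SLOTS - 1; i >= 0; i--) {
		if (a->count[i] + b->count[i] > 0)
			return i + 1;
	}
	return 0;
}

int main(void)
{
	struct slots_histogram cpu0 = {{ 0 }}, all_cpus = {{ 0 }};

	histogram_add(&all_cpus, 0, 1);	/* task A: 1st breakpoint on all CPUs */
	histogram_add(&all_cpus, 1, 1);	/* task A: 2nd breakpoint on all CPUs */
	histogram_add(&cpu0, 0, 1);	/* task B: one breakpoint pinned to CPU 0 */

	printf("max slots used on CPU 0: %d\n",
	       histogram_max_merge(&cpu0, &all_cpus));	/* prints 2 (task A) */
	assert(histogram_max(&all_cpus) == 2);
	return 0;
}

The merge variant mirrors how max_task_bp_pinned() above combines a CPU's own tsk_pinned histogram with tsk_pinned_all[] to bound the usage on that CPU.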
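task_bp_pinned() and toggle_bp_slot() above also switch the per-task breakpoint bookkeeping from one global list to task_bps_ht, an rhltable (a resizable hash table whose buckets are lists) keyed by the breakpoint's target task, so counting only ever walks breakpoints of the same task. The snippet below sketches that pattern with a hypothetical struct bp_like; the type, field names and parameters are illustrative, not the actual task_bps_ht_params defined earlier in this diff.

/*
 * Sketch: a per-key list in an rhltable, mirroring how the patch keys
 * breakpoints by bp->hw.target. "struct bp_like" and its parameters are
 * made up for illustration.
 */
#include <linux/rhashtable.h>
#include <linux/sched.h>

struct bp_like {
	struct task_struct	*target;	/* hash key */
	struct rhlist_head	node;		/* chains entries with the same key */
};

static struct rhltable bp_like_ht;

static const struct rhashtable_params bp_like_ht_params = {
	.head_offset		= offsetof(struct bp_like, node),
	.key_offset		= offsetof(struct bp_like, target),
	.key_len		= sizeof_field(struct bp_like, target),
	.automatic_shrinking	= true,
};

static int bp_like_ht_init(void)
{
	return rhltable_init(&bp_like_ht, &bp_like_ht_params);
}

static int bp_like_add(struct bp_like *bp)
{
	return rhltable_insert(&bp_like_ht, &bp->node, bp_like_ht_params);
}

/* Count entries that share @bp's target task. */
static int bp_like_count_same_task(struct bp_like *bp)
{
	struct rhlist_head *head, *pos;
	struct bp_like *iter;
	int count = 0;

	rcu_read_lock();
	head = rhltable_lookup(&bp_like_ht, &bp->target, bp_like_ht_params);
	if (head) {
		rhl_for_each_entry_rcu(iter, pos, head, node)
			count++;
	}
	rcu_read_unlock();
	return count;
}

rhltable_lookup() returns the rhlist_head chain for the key, and rhl_for_each_entry_rcu() visits every object that hashed to the same task pointer, which is the shape of the lookup in task_bp_pinned() above.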
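The bp_constraints_lock()/bp_constraints_unlock() helpers and bp_cpuinfo_sem referenced above are defined earlier in the hw_breakpoint.c diff, outside this excerpt. The lockdep assertions in the hunks spell out the intended rule: task-target breakpoints only add to tsk_pinned histograms and may proceed concurrently under a shared hold of bp_cpuinfo_sem plus per-task serialization, while CPU-target breakpoints, and any path that needs a stable histogram snapshot, must hold it exclusively. The sketch below is one plausible shape for such helpers and is an assumption, not the file's actual code; the in-tree version may differ, for example by using a per-CPU rwsem and a genuinely per-task mutex.

/*
 * Hedged sketch of a constraints-locking pair consistent with the lockdep
 * assertions above; the names and the plain rwsem are assumptions.
 */
#include <linux/mutex.h>
#include <linux/perf_event.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(bp_constraints_sem);	/* stand-in for bp_cpuinfo_sem */
/* A mutex per target task would preserve concurrency; one global mutex keeps the sketch short. */
static DEFINE_MUTEX(bp_task_mtx);

static struct mutex *sketch_bp_constraints_lock(struct perf_event *bp)
{
	if (bp->hw.target) {
		/*
		 * Task-target breakpoint: histogram updates are additive, so a
		 * shared hold suffices; the mutex serializes updates to this
		 * task's breakpoint list.
		 */
		mutex_lock(&bp_task_mtx);
		down_read(&bp_constraints_sem);
		return &bp_task_mtx;
	}

	/*
	 * CPU-target breakpoint: needs a stable snapshot of every task
	 * histogram, so take the semaphore exclusively.
	 */
	down_write(&bp_constraints_sem);
	return NULL;
}

static void sketch_bp_constraints_unlock(struct mutex *mtx)
{
	if (mtx) {
		up_read(&bp_constraints_sem);
		mutex_unlock(mtx);
	} else {
		up_write(&bp_constraints_sem);
	}
}

The calling convention matches the hunks above: reserve_bp_slot()-style callers keep whatever mutex the lock helper returns and hand it back to the unlock helper.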
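In the uprobes.c hunks, walks of the old mm->mmap linked list are converted to the VMA iterator: declare a VMA_ITERATOR() and loop with for_each_vma() while holding mmap_lock, as unapply_uprobe() and mmf_recalc_uprobes() now do. A minimal standalone sketch of the same pattern follows; the VM_EXEC filter is only an example body.

/* Sketch: walking a process's VMAs with the VMA iterator API. */
#include <linux/mm.h>
#include <linux/mmap_lock.h>

static unsigned long count_executable_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);	/* start the walk at address 0 */
	struct vm_area_struct *vma;
	unsigned long nr = 0;

	mmap_read_lock(mm);		/* the iterator requires mmap_lock held */
	for_each_vma(vmi, vma) {
		if (vma->vm_flags & VM_EXEC)
			nr++;
	}
	mmap_read_unlock(mm);

	return nr;
}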