From e33a5b611c98566dd733a7ad0735eeb23e2739d4 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 7 Feb 2022 16:41:30 +0100
Subject: [PATCH] Revert "perf: Fix perf_event_read_local() time"

This reverts commit 91b04e83c71057927380d7597efe1e93e0bf3462 which is
commit 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 upstream.

It breaks the kernel abi and is not needed for Android systems, so
revert it.

Bug: 161946584
Fixes: 91b04e83c710 ("perf: Fix perf_event_read_local() time")
Signed-off-by: Greg Kroah-Hartman
Change-Id: Ic93864b39858bea47cc735e83c294fc4063b9dcd
---
 include/linux/perf_event.h |  15 ++-
 kernel/events/core.c       | 246 +++++++++++++++----------------------
 2 files changed, 112 insertions(+), 149 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 67a50c78232f..c94551091dad 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -678,6 +678,18 @@ struct perf_event {
 	u64				total_time_running;
 	u64				tstamp;
 
+	/*
+	 * timestamp shadows the actual context timing but it can
+	 * be safely used in NMI interrupt context. It reflects the
+	 * context time as it was when the event was last scheduled in,
+	 * or when ctx_sched_in failed to schedule the event because we
+	 * run out of PMC.
+	 *
+	 * ctx_time already accounts for ctx->timestamp. Therefore to
+	 * compute ctx_time for a sample, simply add perf_clock().
+	 */
+	u64				shadow_ctx_time;
+
 	struct perf_event_attr		attr;
 	u16				header_size;
 	u16				id_header_size;
@@ -822,7 +834,6 @@ struct perf_event_context {
 	 */
 	u64				time;
 	u64				timestamp;
-	u64				timeoffset;
 
 	/*
 	 * These fields let us detect when two contexts have both
@@ -905,8 +916,6 @@ struct bpf_perf_event_data_kern {
 struct perf_cgroup_info {
 	u64				time;
 	u64				timestamp;
-	u64				timeoffset;
-	int				active;
 };
 
 struct perf_cgroup {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eba6eacce261..b4a15978ce7d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -673,23 +673,6 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
 	WRITE_ONCE(event->state, state);
 }
 
-/*
- * UP store-release, load-acquire
- */
-
-#define __store_release(ptr, val)					\
-do {									\
-	barrier();							\
-	WRITE_ONCE(*(ptr), (val));					\
-} while (0)
-
-#define __load_acquire(ptr)						\
-({									\
-	__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));	\
-	barrier();							\
-	___p;								\
-})
-
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
@@ -735,51 +718,34 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
 	return t->time;
 }
 
-static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 {
-	struct perf_cgroup_info *t;
+	struct perf_cgroup_info *info;
+	u64 now;
 
-	t = per_cpu_ptr(event->cgrp->info, event->cpu);
-	if (!__load_acquire(&t->active))
-		return t->time;
-	now += READ_ONCE(t->timeoffset);
-	return now;
-}
+	now = perf_clock();
 
-static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
-{
-	if (adv)
-		info->time += now - info->timestamp;
+	info = this_cpu_ptr(cgrp->info);
+
+	info->time += now - info->timestamp;
 	info->timestamp = now;
-	/*
-	 * see update_context_time()
-	 */
-	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
 }
 
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 {
 	struct perf_cgroup *cgrp = cpuctx->cgrp;
 	struct cgroup_subsys_state *css;
-	struct perf_cgroup_info *info;
 
 	if (cgrp) {
-		u64 now = perf_clock();
-
 		for (css = &cgrp->css; css; css = css->parent) {
 			cgrp = container_of(css, struct perf_cgroup, css);
-			info = this_cpu_ptr(cgrp->info);
-
-			__update_cgrp_time(info, now, true);
-			if (final)
-				__store_release(&info->active, 0);
+			__update_cgrp_time(cgrp);
 		}
 	}
 }
 
 static inline void update_cgrp_time_from_event(struct perf_event *event)
 {
-	struct perf_cgroup_info *info;
 	struct perf_cgroup *cgrp;
 
 	/*
@@ -793,10 +759,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
-		info = this_cpu_ptr(event->cgrp->info);
-		__update_cgrp_time(info, perf_clock(), true);
-	}
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+		__update_cgrp_time(event->cgrp);
 }
 
 static inline void
@@ -820,8 +784,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	for (css = &cgrp->css; css; css = css->parent) {
 		cgrp = container_of(css, struct perf_cgroup, css);
 		info = this_cpu_ptr(cgrp->info);
-		__update_cgrp_time(info, ctx->timestamp, false);
-		__store_release(&info->active, 1);
+		info->timestamp = ctx->timestamp;
 	}
 }
 
@@ -1017,6 +980,14 @@ out:
 	return ret;
 }
 
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_cgroup_info *t;
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
 static inline void
 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -1094,8 +1065,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 {
 }
 
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
-						bool final)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 {
 }
 
@@ -1127,12 +1097,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
 {
 }
 
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 {
-	return 0;
 }
 
-static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	return 0;
 }
@@ -1554,59 +1524,22 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 /*
  * Update the record of the current time in a context.
  */
-static void __update_context_time(struct perf_event_context *ctx, bool adv)
+static void update_context_time(struct perf_event_context *ctx)
 {
 	u64 now = perf_clock();
 
-	if (adv)
-		ctx->time += now - ctx->timestamp;
+	ctx->time += now - ctx->timestamp;
 	ctx->timestamp = now;
-
-	/*
-	 * The above: time' = time + (now - timestamp), can be re-arranged
-	 * into: time` = now + (time - timestamp), which gives a single value
-	 * offset to compute future time without locks on.
-	 *
-	 * See perf_event_time_now(), which can be used from NMI context where
-	 * it's (obviously) not possible to acquire ctx->lock in order to read
-	 * both the above values in a consistent manner.
-	 */
-	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
-}
-
-static void update_context_time(struct perf_event_context *ctx)
-{
-	__update_context_time(ctx, true);
 }
 
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
-	if (unlikely(!ctx))
-		return 0;
-
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time(event);
 
-	return ctx->time;
-}
-
-static u64 perf_event_time_now(struct perf_event *event, u64 now)
-{
-	struct perf_event_context *ctx = event->ctx;
-
-	if (unlikely(!ctx))
-		return 0;
-
-	if (is_cgroup_event(event))
-		return perf_cgroup_event_time_now(event, now);
-
-	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
-		return ctx->time;
-
-	now += READ_ONCE(ctx->timeoffset);
-	return now;
+	return ctx ? ctx->time : 0;
 }
 
 static enum event_type_t get_event_type(struct perf_event *event)
@@ -2400,7 +2333,7 @@ __perf_remove_from_context(struct perf_event *event,
 
 	if (ctx->is_active & EVENT_TIME) {
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, false);
+		update_cgrp_time_from_cpuctx(cpuctx);
 	}
 
 	event_sched_out(event, cpuctx, ctx);
@@ -2409,9 +2342,6 @@ __perf_remove_from_context(struct perf_event *event,
 	list_del_event(event, ctx);
 
 	if (!ctx->nr_events && ctx->is_active) {
-		if (ctx == &cpuctx->ctx)
-			update_cgrp_time_from_cpuctx(cpuctx, true);
-
 		ctx->is_active = 0;
 		ctx->rotate_necessary = 0;
 		if (ctx->task) {
@@ -2537,6 +2467,40 @@ void perf_event_disable_inatomic(struct perf_event *event)
 	irq_work_queue(&event->pending);
 }
 
+static void perf_set_shadow_time(struct perf_event *event,
+				 struct perf_event_context *ctx)
+{
+	/*
+	 * use the correct time source for the time snapshot
+	 *
+	 * We could get by without this by leveraging the
+	 * fact that to get to this function, the caller
+	 * has most likely already called update_context_time()
+	 * and update_cgrp_time_xx() and thus both timestamp
+	 * are identical (or very close). Given that tstamp is,
+	 * already adjusted for cgroup, we could say that:
+	 *         tstamp - ctx->timestamp
+	 * is equivalent to
+	 *         tstamp - cgrp->timestamp.
+	 *
+	 * Then, in perf_output_read(), the calculation would
+	 * work with no changes because:
+	 * - event is guaranteed scheduled in
+	 * - no scheduled out in between
+	 * - thus the timestamp would be the same
+	 *
+	 * But this is a bit hairy.
+	 *
+	 * So instead, we have an explicit cgroup call to remain
+	 * within the time source all along. We believe it
+	 * is cleaner and simpler to understand.
+	 */
+	if (is_cgroup_event(event))
+		perf_cgroup_set_shadow_time(event, event->tstamp);
+	else
+		event->shadow_ctx_time = event->tstamp - ctx->timestamp;
+}
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2577,6 +2541,8 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	perf_set_shadow_time(event, ctx);
+
 	perf_log_itrace_start(event);
 
 	if (event->pmu->add(event, PERF_EF_START)) {
@@ -3250,6 +3216,16 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 		return;
 	}
 
+	ctx->is_active &= ~event_type;
+	if (!(ctx->is_active & EVENT_ALL))
+		ctx->is_active = 0;
+
+	if (ctx->task) {
+		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+		if (!ctx->is_active)
+			cpuctx->task_ctx = NULL;
+	}
+
 	/*
 	 * Always update time if it was set; not only when it changes.
 	 * Otherwise we can 'forget' to update time for any but the last
@@ -3263,22 +3239,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (is_active & EVENT_TIME) {
 		/* update (and stop) ctx time */
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
-		/*
-		 * CPU-release for the below ->is_active store,
-		 * see __load_acquire() in perf_event_time_now()
-		 */
-		barrier();
-	}
-
-	ctx->is_active &= ~event_type;
-	if (!(ctx->is_active & EVENT_ALL))
-		ctx->is_active = 0;
-
-	if (ctx->task) {
-		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
-		if (!ctx->is_active)
-			cpuctx->task_ctx = NULL;
+		update_cgrp_time_from_cpuctx(cpuctx);
 	}
 
 	is_active ^= ctx->is_active; /* changed bits */
@@ -3715,19 +3676,13 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 	return 0;
 }
 
-/*
- * Because the userpage is strictly per-event (there is no concept of context,
- * so there cannot be a context indirection), every userpage must be updated
- * when context time starts :-(
- *
- * IOW, we must not miss EVENT_TIME edges.
- */
 static inline bool event_update_userpage(struct perf_event *event)
 {
 	if (likely(!atomic_read(&event->mmap_count)))
 		return false;
 
 	perf_event_update_time(event);
+	perf_set_shadow_time(event, event->ctx);
 	perf_event_update_userpage(event);
 
 	return true;
@@ -3811,23 +3766,13 @@ ctx_sched_in(struct perf_event_context *ctx,
 	     struct task_struct *task)
 {
 	int is_active = ctx->is_active;
+	u64 now;
 
 	lockdep_assert_held(&ctx->lock);
 
 	if (likely(!ctx->nr_events))
 		return;
 
-	if (is_active ^ EVENT_TIME) {
-		/* start ctx time */
-		__update_context_time(ctx, false);
-		perf_cgroup_set_timestamp(task, ctx);
-		/*
-		 * CPU-release for the below ->is_active store,
-		 * see __load_acquire() in perf_event_time_now()
-		 */
-		barrier();
-	}
-
 	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
 		if (!is_active)
@@ -3838,6 +3783,13 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	is_active ^= ctx->is_active; /* changed bits */
 
+	if (is_active & EVENT_TIME) {
+		/* start ctx time */
+		now = perf_clock();
+		ctx->timestamp = now;
+		perf_cgroup_set_timestamp(task, ctx);
+	}
+
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -4373,18 +4325,6 @@ static inline u64 perf_event_count(struct perf_event *event)
 	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
-static void calc_timer_values(struct perf_event *event,
-				u64 *now,
-				u64 *enabled,
-				u64 *running)
-{
-	u64 ctx_time;
-
-	*now = perf_clock();
-	ctx_time = perf_event_time_now(event, *now);
-	__perf_update_times(event, ctx_time, enabled, running);
-}
-
 /*
  * NMI-safe method to read a local event, that is an event that
  * is:
@@ -4444,9 +4384,10 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 
 	*value = local64_read(&event->count);
 	if (enabled || running) {
-		u64 __enabled, __running, __now;;
+		u64 now = event->shadow_ctx_time + perf_clock();
+		u64 __enabled, __running;
 
-		calc_timer_values(event, &__now, &__enabled, &__running);
+		__perf_update_times(event, now, &__enabled, &__running);
 		if (enabled)
 			*enabled = __enabled;
 		if (running)
@@ -5754,6 +5695,18 @@ static int perf_event_index(struct perf_event *event)
 	return event->pmu->event_idx(event);
 }
 
+static void calc_timer_values(struct perf_event *event,
+				u64 *now,
+				u64 *enabled,
+				u64 *running)
+{
+	u64 ctx_time;
+
+	*now = perf_clock();
+	ctx_time = event->shadow_ctx_time + *now;
+	__perf_update_times(event, ctx_time, enabled, running);
+}
+
 static void perf_event_init_userpage(struct perf_event *event)
 {
 	struct perf_event_mmap_page *userpg;
@@ -6293,6 +6246,7 @@ accounting:
 		ring_buffer_attach(event, rb);
 
 		perf_event_update_time(event);
+		perf_set_shadow_time(event, event->ctx);
 		perf_event_init_userpage(event);
 		perf_event_update_userpage(event);
 	} else {