perf: Store active software events in a hashlist
Each time a software event triggers, we need to walk through the entire
list of events from the current cpu and task contexts to retrieve a
running perf event that matches. We also need to check that a matching
perf event is actually counting. This walk is wasteful and makes the
event fast path scale poorly as the number of events running on the
same contexts grows.

To solve this, we store the running perf events in a hashlist so that,
when they trigger, they can be found immediately by their type:event_id.

v2:
- Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along
  the way)
- Only allocate the hlist for online cpus, but keep track of the refcount
  on offline possible cpus too, so that we allocate it if needed when a
  cpu becomes online.
- Drop the kref use as it's not adapted to our tricks anymore.

v3:
- Fix bad refcount check (address instead of value). Thanks to Eric
  Dumazet who spotted this.
- While exiting a cpu, move the hlist release out of the IPI path so the
  hlist mutex can be locked sanely.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
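As an illustration of the scheme (not part of the patch itself), here is a minimal user-space sketch: events are hashed by their type:event_id pair into a fixed-size array of buckets, so a triggering software event is located by probing a single bucket instead of walking every event in the context. The SWEVENT_HLIST_BITS/SWEVENT_HLIST_SIZE constants mirror the patch; the multiplicative hash constant and the sw_event node type are illustrative stand-ins for the kernel's hash_64() and struct perf_event.

#include <stdint.h>
#include <stdio.h>

#define SWEVENT_HLIST_BITS	8
#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)

/* Illustrative stand-in for the kernel's hash_64(): a multiplicative hash
 * folded down to SWEVENT_HLIST_BITS bits. */
static unsigned int swevent_hash(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);

	return (unsigned int)((val * 0x9E3779B97F4A7C15ULL) >> (64 - SWEVENT_HLIST_BITS));
}

/* Hypothetical event node; the kernel chains struct perf_event itself via
 * a struct hlist_node member. */
struct sw_event {
	uint64_t type;
	uint32_t event_id;
	struct sw_event *next;
};

static struct sw_event *buckets[SWEVENT_HLIST_SIZE];

/* Adding an event touches only its bucket (the kernel uses
 * hlist_add_head_rcu() so readers need no lock). */
static void sw_event_add(struct sw_event *e)
{
	unsigned int h = swevent_hash(e->type, e->event_id);

	e->next = buckets[h];
	buckets[h] = e;
}

/* Lookup probes one bucket instead of walking every event in the context. */
static struct sw_event *sw_event_find(uint64_t type, uint32_t event_id)
{
	struct sw_event *e;

	for (e = buckets[swevent_hash(type, event_id)]; e; e = e->next)
		if (e->type == type && e->event_id == event_id)
			return e;
	return NULL;
}

int main(void)
{
	struct sw_event ev = { .type = 1, .event_id = 3 };

	sw_event_add(&ev);
	printf("found: %s\n", sw_event_find(1, 3) == &ev ? "yes" : "no");
	return 0;
}

With 2^8 buckets and a reasonably uniform hash, the expected chain length stays close to the number of distinct software events divided by 256, so the per-trigger cost is effectively constant.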
@@ -589,6 +589,14 @@ enum perf_group_flag {
 	PERF_GROUP_SOFTWARE		= 0x1,
 };
 
+#define SWEVENT_HLIST_BITS		8
+#define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
+
+struct swevent_hlist {
+	struct hlist_head		heads[SWEVENT_HLIST_SIZE];
+	struct rcu_head			rcu_head;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -597,6 +605,7 @@ struct perf_event {
 	struct list_head		group_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	struct hlist_node		hlist_entry;
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
@@ -744,6 +753,9 @@ struct perf_cpu_context {
 	int				active_oncpu;
 	int				max_pertask;
 	int				exclusive;
+	struct swevent_hlist		*swevent_hlist;
+	struct mutex			hlist_mutex;
+	int				hlist_refcount;
 
 	/*
 	 * Recursion avoidance:
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 		perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-	/*
-	 * The event is active, we're good!
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		return 1;
-
-	/*
-	 * The event is off/error, not counting.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	/*
-	 * The event is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (event->ctx->is_active)
-		return 0;
-
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
-}
-
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
-		return 0;
-
-	if (!perf_swevent_is_counting(event))
-		return 0;
-
 	if (event->attr.type != type)
 		return 0;
 
@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event,
 	return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-				    enum perf_type_id type,
-				    u32 event_id, u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
 {
-	struct perf_event *event;
+	u64 val = event_id | (type << 32);
 
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	u64 hash;
+	struct swevent_hlist *hlist;
+
+	hash = swevent_hash(type, event_id);
+
+	hlist = rcu_dereference(ctx->swevent_hlist);
+	if (!hlist)
+		return NULL;
+
+	return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event *event;
+	struct hlist_node *node;
+	struct hlist_head *head;
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+
+	head = find_swevent_head(cpuctx, type, event_id);
+
+	if (!head)
+		goto end;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
+end:
+	rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-			     u64 nr, int nmi,
-			     struct perf_sample_data *data,
-			     struct pt_regs *regs)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	rcu_read_lock();
-	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-				 nr, nmi, data, regs);
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-	rcu_read_unlock();
-}
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct perf_cpu_context *cpuctx;
+	struct hlist_head *head;
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
+
+	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	if (WARN_ON_ONCE(!head))
+		return -EINVAL;
+
+	hlist_add_head_rcu(&event->hlist_entry, head);
+
 	return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+	hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 0;
 }
 
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+	struct swevent_hlist *hlist;
+
+	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+	kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+	struct swevent_hlist *hlist;
+
+	if (!cpuctx->swevent_hlist)
+		return;
+
+	hlist = cpuctx->swevent_hlist;
+	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!--cpuctx->hlist_refcount)
+		swevent_hlist_release(cpuctx);
+
+	mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+	int cpu;
+
+	if (event->cpu != -1) {
+		swevent_hlist_put_cpu(event, event->cpu);
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	int err = 0;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		if (!hlist) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	cpuctx->hlist_refcount++;
+exit:
+	mutex_unlock(&cpuctx->hlist_mutex);
+
+	return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+	int err;
+	int cpu, failed_cpu;
+
+	if (event->cpu != -1)
+		return swevent_hlist_get_cpu(event, event->cpu);
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu) {
+		err = swevent_hlist_get_cpu(event, cpu);
+		if (err) {
+			failed_cpu = cpu;
+			goto fail;
+		}
+	}
+	put_online_cpus();
+
+	return 0;
+fail:
+	for_each_possible_cpu(cpu) {
+		if (cpu == failed_cpu)
+			break;
+		swevent_hlist_put_cpu(event, cpu);
+	}
+
+	put_online_cpus();
+	return err;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+	int err;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
+	err = swevent_hlist_get(event);
+	if (err) {
+		perf_trace_disable(event->attr.config);
+		return ERR_PTR(err);
+	}
 
 	return &perf_ops_generic;
 }
@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	WARN_ON(event->parent);
 
 	atomic_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
 	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
+			int err;
+
+			err = swevent_hlist_get(event);
+			if (err)
+				return ERR_PTR(err);
+
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
 		}
@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void)
 
 	for_each_possible_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		mutex_init(&cpuctx->hlist_mutex);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
 	}
 }
@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+	if (cpuctx->hlist_refcount > 0) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		WARN_ON_ONCE(!hlist);
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu)
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
+	mutex_lock(&cpuctx->hlist_mutex);
+	swevent_hlist_release(cpuctx);
+	mutex_unlock(&cpuctx->hlist_mutex);
+
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
 	mutex_unlock(&ctx->mutex);