Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf updates from Ingo Molnar:
 "The main kernel side changes in this cycle were:

   - Various Intel-PT updates and optimizations (Alexander Shishkin)

   - Prohibit kprobes on Xen/KVM emulate prefixes (Masami Hiramatsu)

   - Add support for LSM and SELinux checks to control access to the
     perf syscall (Joel Fernandes)

   - Misc other changes, optimizations, fixes and cleanups - see the
     shortlog for details.

  There were numerous tooling changes as well - 254 non-merge commits.
  Here are the main changes - too many to list in detail:

   - Enhancements to core tooling infrastructure, perf.data, libperf,
     libtraceevent, event parsing, vendor events, Intel PT, callchains,
     BPF support and instruction decoding.

   - There were updates to the following tools:

        perf annotate
        perf diff
        perf inject
        perf kvm
        perf list
        perf maps
        perf parse
        perf probe
        perf record
        perf report
        perf script
        perf stat
        perf test
        perf trace

   - And a lot of other changes: please see the shortlog and Git log for
     more details"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (279 commits)
  perf parse: Fix potential memory leak when handling tracepoint errors
  perf probe: Fix spelling mistake "addrees" -> "address"
  libtraceevent: Fix memory leakage in copy_filter_type
  libtraceevent: Fix header installation
  perf intel-bts: Does not support AUX area sampling
  perf intel-pt: Add support for decoding AUX area samples
  perf intel-pt: Add support for recording AUX area samples
  perf pmu: When using default config, record which bits of config were changed by the user
  perf auxtrace: Add support for queuing AUX area samples
  perf session: Add facility to peek at all events
  perf auxtrace: Add support for dumping AUX area samples
  perf inject: Cut AUX area samples
  perf record: Add aux-sample-size config term
  perf record: Add support for AUX area sampling
  perf auxtrace: Add support for AUX area sample recording
  perf auxtrace: Move perf_evsel__find_pmu()
  perf record: Add a function to test for kernel support for AUX area sampling
  perf tools: Add kernel AUX area sampling definitions
  perf/core: Make the mlock accounting simple again
  perf report: Jump to symbol source view from total cycles view
  ...
Cette révision appartient à :
Linus Torvalds
2019-11-26 15:04:47 -08:00
révision 3f59dbcace
297 fichiers modifiés avec 44924 ajouts et 35331 suppressions

Voir le fichier

@@ -1941,6 +1941,11 @@ static void perf_put_aux_event(struct perf_event *event)
}
}
/*
 * Report whether @event needs an AUX event attached to it: true when the
 * attribute requests AUX output (attr.aux_output) or specifies a non-zero
 * AUX sample size (attr.aux_sample_size).
 */
static bool perf_need_aux_event(struct perf_event *event)
{
	return event->attr.aux_output != 0 ||
	       event->attr.aux_sample_size != 0;
}
static int perf_get_aux_event(struct perf_event *event,
struct perf_event *group_leader)
{
@@ -1953,7 +1958,17 @@ static int perf_get_aux_event(struct perf_event *event,
if (!group_leader)
return 0;
if (!perf_aux_output_match(event, group_leader))
/*
* aux_output and aux_sample_size are mutually exclusive.
*/
if (event->attr.aux_output && event->attr.aux_sample_size)
return 0;
if (event->attr.aux_output &&
!perf_aux_output_match(event, group_leader))
return 0;
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@ -2666,6 +2681,25 @@ perf_install_in_context(struct perf_event_context *ctx,
*/
smp_store_release(&event->ctx, ctx);
/*
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
*/
if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
add_event_to_ctx(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
return;
}
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -3204,10 +3238,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
struct pmu *pmu = ctx->pmu;
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
/*
* PMU specific parts of task perf context can require
* additional synchronization. As an example of such
* synchronization see implementation details of Intel
* LBR call stack data profiling;
*/
if (pmu->swap_task_ctx)
pmu->swap_task_ctx(ctx, next_ctx);
else
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -4229,8 +4274,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
err = perf_allow_cpu(&event->attr);
if (err)
return ERR_PTR(err);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -4539,6 +4585,8 @@ static void _free_event(struct perf_event *event)
unaccount_event(event);
security_perf_event_free(event);
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -4992,6 +5040,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct perf_event_context *ctx;
int ret;
ret = security_perf_event_read(event);
if (ret)
return ret;
ctx = perf_event_ctx_lock(event);
ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
@@ -5288,6 +5340,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct perf_event_context *ctx;
long ret;
/* Treat ioctl like writes as it is likely a mutating operation. */
ret = security_perf_event_write(event);
if (ret)
return ret;
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5639,10 +5696,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
if (!rb->aux_mmap_locked)
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
else
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
rb_free_aux(rb);
@@ -5753,6 +5808,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
ret = security_perf_event_read(event);
if (ret)
return ret;
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -5859,13 +5918,7 @@ accounting:
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
if (user_locked <= user_lock_limit) {
/* charge all to locked_vm */
} else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
/* charge all to pinned_vm */
extra = user_extra;
user_extra = 0;
} else {
if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
@@ -5878,7 +5931,7 @@ accounting:
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
@@ -6208,6 +6261,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
/*
 * Decide how many bytes of AUX data will accompany a sample.
 *
 * @event: the event being sampled; event->aux_event supplies the AUX data
 * @data:  sample data, data->aux_size receives the result
 * @size:  upper bound on the AUX payload requested by the caller
 *
 * data->aux_size is always written. It ends up zero when there is no AUX
 * event, when the AUX event is not ACTIVE or not on this CPU (both of which
 * warn once), when no ring buffer can be obtained, or when a snapshot is
 * already in progress on that buffer. Otherwise it is @size clamped to
 * perf_aux_size() and aligned up to sizeof(u64).
 *
 * Returns the resulting data->aux_size.
 */
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
					  struct perf_sample_data *data,
					  size_t size)
{
	struct perf_event *aux_ev = event->aux_event;
	struct ring_buffer *rb;

	data->aux_size = 0;

	if (!aux_ev)
		return data->aux_size;

	if (WARN_ON_ONCE(READ_ONCE(aux_ev->state) != PERF_EVENT_STATE_ACTIVE))
		return data->aux_size;

	if (WARN_ON_ONCE(READ_ONCE(aux_ev->oncpu) != smp_processor_id()))
		return data->aux_size;

	/* An inherited event uses its parent's ring buffer. */
	rb = ring_buffer_get(aux_ev->parent ? aux_ev->parent : aux_ev);
	if (!rb)
		return data->aux_size;

	if (!READ_ONCE(rb->aux_in_sampling)) {
		size = min_t(size_t, size, perf_aux_size(rb));
		data->aux_size = ALIGN(size, sizeof(u64));
	} else {
		/*
		 * This is an NMI hit inside the sampling code itself, so
		 * do not take the sample. See also perf_aux_sample_output().
		 */
		data->aux_size = 0;
	}

	ring_buffer_put(rb);

	return data->aux_size;
}
/*
* Invoke the PMU's ->snapshot_aux() callback for @event with local
* interrupts disabled and rb->aux_in_sampling set for the duration.
*
* @rb:     ring buffer whose aux_in_sampling flag marks the critical section
* @event:  event whose pmu provides the ->snapshot_aux() callback
* @handle: output handle passed through to the callback
* @size:   size passed through to the callback
*
* Returns whatever ->snapshot_aux() returned.
*/
long perf_pmu_snapshot_aux(struct ring_buffer *rb,
struct perf_event *event,
struct perf_output_handle *handle,
unsigned long size)
{
unsigned long flags;
long ret;
/*
* Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
* paths. If we start calling them in NMI context, they may race with
* the IRQ ones, that is, for example, re-starting an event that's just
* been stopped, which is why we're using a separate callback that
* doesn't change the event state.
*
* IRQs need to be disabled to prevent IPIs from racing with us.
*/
local_irq_save(flags);
/*
* Guard against NMI hits inside the critical section;
* see also perf_prepare_sample_aux().
*/
WRITE_ONCE(rb->aux_in_sampling, 1);
/*
* NOTE(review): the barrier() pair presumably keeps the compiler from
* moving the callback outside the flagged region - confirm.
*/
barrier();
ret = event->pmu->snapshot_aux(event, handle, size);
barrier();
WRITE_ONCE(rb->aux_in_sampling, 0);
local_irq_restore(flags);
return ret;
}
/*
 * Write the AUX payload of a sample into @handle.
 *
 * @event:  the sampled event; event->aux_event supplies the AUX data
 * @handle: output handle of the sample being written
 * @data:   sample data; data->aux_size is the number of bytes to emit
 *
 * Snapshots up to data->aux_size bytes through perf_pmu_snapshot_aux() and
 * then zero-pads the output up to data->aux_size. Warns once and emits
 * nothing if there is no AUX event or data->aux_size is zero; returns
 * silently if no ring buffer can be obtained.
 */
static void perf_aux_sample_output(struct perf_event *event,
				   struct perf_output_handle *handle,
				   struct perf_sample_data *data)
{
	struct perf_event *aux_ev = event->aux_event;
	struct ring_buffer *rb;
	unsigned long pad;
	long copied;

	if (WARN_ON_ONCE(!aux_ev || !data->aux_size))
		return;

	/* An inherited event uses its parent's ring buffer. */
	rb = ring_buffer_get(aux_ev->parent ? aux_ev->parent : aux_ev);
	if (!rb)
		return;

	copied = perf_pmu_snapshot_aux(rb, aux_ev, handle, data->aux_size);

	/*
	 * A negative result means that perf_output_copy() failed (returned
	 * a non-zero surplus that it didn't copy), which its current
	 * implementation does not do. If that changes, we'd like to know.
	 */
	if (!WARN_ON_ONCE(copied < 0)) {
		/*
		 * The pad comes from ALIGN()ing data->aux_size up to u64 in
		 * perf_prepare_sample_aux(), so it should stay below
		 * sizeof(u64).
		 */
		pad = data->aux_size - copied;
		if (WARN_ON_ONCE(pad >= sizeof(u64)))
			pad = 8;

		if (pad) {
			u64 zero = 0;

			perf_output_copy(handle, &zero, pad);
		}
	}

	ring_buffer_put(rb);
}
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -6527,6 +6696,13 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
if (data->aux_size)
perf_aux_sample_output(event, handle, data);
}
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -6715,6 +6891,35 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
header->size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
* easily overflow it, what with all the preceding sample bits.
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
size = min_t(size_t, U16_MAX - header->size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
WARN_ON_ONCE(size + header->size > U16_MAX);
header->size += size;
}
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
* lowest 3 bits of size, which should be always zero at the moment.
* This raises a more important question, do we really need 512k sized
* samples and why, so good argumentation is in order for whatever you
* do here next.
*/
WARN_ON_ONCE(header->size & 7);
}
static __always_inline int
@@ -10066,7 +10271,7 @@ static struct lock_class_key cpuctx_lock;
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
int cpu, ret;
int cpu, ret, max = PERF_TYPE_MAX;
mutex_lock(&pmus_lock);
ret = -ENOMEM;
@@ -10079,12 +10284,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto skip_type;
pmu->name = name;
if (type < 0) {
type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
if (type < 0) {
ret = type;
if (type != PERF_TYPE_SOFTWARE) {
if (type >= 0)
max = type;
ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
if (ret < 0)
goto free_pdc;
}
WARN_ON(type >= 0 && ret != type);
type = ret;
}
pmu->type = type;
@@ -10161,7 +10371,16 @@ got_cpu_context:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
list_add_rcu(&pmu->entry, &pmus);
/*
* Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
* since these cannot be in the IDR. This way the linear search
* is fast, provided a valid software event is provided.
*/
if (type == PERF_TYPE_SOFTWARE || !name)
list_add_rcu(&pmu->entry, &pmus);
else
list_add_tail_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -10174,7 +10393,7 @@ free_dev:
put_device(pmu->dev);
free_idr:
if (pmu->type >= PERF_TYPE_MAX)
if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
free_pdc:
@@ -10196,7 +10415,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running) {
if (pmu->nr_addr_filters)
@@ -10266,9 +10485,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
int idx, type, ret;
struct pmu *pmu;
int idx;
int ret;
idx = srcu_read_lock(&pmus_srcu);
@@ -10280,13 +10498,28 @@ static struct pmu *perf_init_event(struct perf_event *event)
goto unlock;
}
/*
* PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
* are often aliases for PERF_TYPE_RAW.
*/
type = event->attr.type;
if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
type = PERF_TYPE_RAW;
again:
rcu_read_lock();
pmu = idr_find(&pmu_idr, event->attr.type);
pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
if (pmu) {
ret = perf_try_init_event(pmu, event);
if (ret == -ENOENT && event->attr.type != type) {
type = event->attr.type;
goto again;
}
if (ret)
pmu = ERR_PTR(ret);
goto unlock;
}
@@ -10618,11 +10851,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
err = security_perf_event_alloc(event);
if (err)
goto err_callchain_buffer;
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
err_callchain_buffer:
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
put_callchain_buffers();
}
err_addr_filters:
kfree(event->addr_filter_ranges);
@@ -10673,7 +10915,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->size = size;
if (attr->__reserved_1 || attr->__reserved_2)
if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -10711,9 +10953,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
&& perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
ret = perf_allow_kernel(attr);
if (ret)
return ret;
}
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10926,13 +11170,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
/* Do we allow access to perf_event_open(2) ? */
err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
if (err)
return err;
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
err = perf_allow_kernel(&attr);
if (err)
return err;
}
if (attr.namespaces) {
@@ -10949,9 +11199,11 @@ SYSCALL_DEFINE5(perf_event_open,
}
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
err = perf_allow_kernel(&attr);
if (err)
return err;
}
err = security_locked_down(LOCKDOWN_PERF);
if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
@@ -11213,7 +11465,7 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
goto err_locked;
/*

Voir le fichier

@@ -50,6 +50,7 @@ struct ring_buffer {
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
int aux_in_sampling;
void **aux_pages;
void *aux_priv;

Voir le fichier

@@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle)
}
EXPORT_SYMBOL_GPL(perf_get_aux);
/*
* Copy out AUX data from an AUX handle.
*
* @aux_handle: handle whose ring buffer (aux_handle->rb) holds the AUX pages
* @handle:     destination output handle, written via perf_output_copy()
* @from:       start offset into the AUX buffer
* @to:         end offset into the AUX buffer
*
* Both offsets are first masked with (aux_nr_pages << PAGE_SHIFT) - 1.
* NOTE(review): that mask only acts as a modulo if the AUX buffer size is a
* power of two - confirm against the AUX allocation code.
*
* Data is copied at most one page at a time, wrapping from the end of the
* AUX buffer back to its start, until @from reaches @to. If the two masked
* offsets are equal on entry the loop still runs, and it stops only once
* @from has wrapped all the way around to @to again.
*
* Returns the number of bytes copied, or -EFAULT if perf_output_copy()
* reports bytes it did not copy.
*/
long perf_output_copy_aux(struct perf_output_handle *aux_handle,
struct perf_output_handle *handle,
unsigned long from, unsigned long to)
{
unsigned long tocopy, remainder, len = 0;
struct ring_buffer *rb = aux_handle->rb;
void *addr;
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
do {
/* Never cross a page boundary within a single copy. */
tocopy = PAGE_SIZE - offset_in_page(from);
/* Only clamp to @to when it lies ahead of @from without wrapping. */
if (to > from)
tocopy = min(tocopy, to - from);
if (!tocopy)
break;
/* AUX pages are looked up individually by page index. */
addr = rb->aux_pages[from >> PAGE_SHIFT];
addr += offset_in_page(from);
remainder = perf_output_copy(handle, addr, tocopy);
if (remainder)
return -EFAULT;
len += tocopy;
from += tocopy;
/* Wrap back to the start of the AUX buffer if we ran off the end. */
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
} while (to != from);
return len;
}
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
static struct page *rb_alloc_aux_page(int node, int order)
@@ -754,6 +790,14 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
/*
 * Release one ring-buffer page given its kernel virtual address: the
 * page's ->mapping is cleared before it is handed to __free_page().
 */
static void perf_mmap_free_page(void *addr)
{
	struct page *pg;

	pg = virt_to_page(addr);
	pg->mapping = NULL;

	__free_page(pg);
}
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct ring_buffer *rb;
@@ -788,9 +832,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
fail_data_pages:
for (i--; i >= 0; i--)
free_page((unsigned long)rb->data_pages[i]);
perf_mmap_free_page(rb->data_pages[i]);
free_page((unsigned long)rb->user_page);
perf_mmap_free_page(rb->user_page);
fail_user_page:
kfree(rb);
@@ -799,21 +843,13 @@ fail:
return NULL;
}
/*
* Release one page given its address as an unsigned long: the address is
* cast to a pointer and converted with virt_to_page(), the page's ->mapping
* is cleared, and the page is then passed to __free_page().
*/
static void perf_mmap_free_page(unsigned long addr)
{
struct page *page = virt_to_page((void *)addr);
page->mapping = NULL;
__free_page(page);
}
void rb_free(struct ring_buffer *rb)
{
int i;
perf_mmap_free_page((unsigned long)rb->user_page);
perf_mmap_free_page(rb->user_page);
for (i = 0; i < rb->nr_pages; i++)
perf_mmap_free_page((unsigned long)rb->data_pages[i]);
perf_mmap_free_page(rb->data_pages[i]);
kfree(rb);
}

Voir le fichier

@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"
@@ -26,8 +27,10 @@ static int total_ref_count;
static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
int ret;
if (tp_event->perf_perm) {
int ret = tp_event->perf_perm(tp_event, p_event);
ret = tp_event->perf_perm(tp_event, p_event);
if (ret)
return ret;
}
@@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
ret = perf_allow_tracepoint(&p_event->attr);
if (ret)
return ret;
if (!is_sampling_event(p_event))
return 0;
@@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
ret = perf_allow_tracepoint(&p_event->attr);
if (ret)
return ret;
return 0;
}