Merge tag 'perf-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf event updates from Ingo Molnar:
 "HW support updates:

   - Add uncore support for Intel Comet Lake

   - Add RAPL support for Hygon Fam18h

   - Add Intel "IIO stack to PMON mapping" support on Skylake-SP CPUs,
     which enumerates per device performance counters via sysfs and
     enables the perf stat --iiostat functionality

   - Add support for Intel "Architectural LBRs", which generalized the
     model specific LBR hardware tracing feature into a
     model-independent, architected performance monitoring feature.

     Usage is mostly seamless to tooling, as the pre-existing LBR
     features are kept, but there's a couple of advantages under the
     hood, such as faster context-switching, faster LBR reads, cleaner
     exposure of LBR features to guest kernels, etc.

     ( Since architectural LBRs are supported via XSAVE, there's related
       changes to the x86 FPU code as well. )

  ftrace/perf updates:

   - Add support to add a text poke event to record changes to kernel
     text (i.e. self-modifying code) in order to support tracers like
     Intel PT decoding through jump labels, kprobes and ftrace
     trampolines.

  Misc cleanups, smaller fixes..."

* tag 'perf-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (47 commits)
  perf/x86/rapl: Add Hygon Fam18h RAPL support
  kprobes: Remove unnecessary module_mutex locking from kprobe_optimizer()
  x86/perf: Fix a typo
  perf: <linux/perf_event.h>: drop a duplicated word
  perf/x86/intel/lbr: Support XSAVES for arch LBR read
  perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch
  x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature
  x86/fpu/xstate: Support dynamic supervisor feature for LBR
  x86/fpu: Use proper mask to replace full instruction mask
  perf/x86: Remove task_ctx_size
  perf/x86/intel/lbr: Create kmem_cache for the LBR context data
  perf/core: Use kmem_cache to allocate the PMU specific data
  perf/core: Factor out functions to allocate/free the task_ctx_data
  perf/x86/intel/lbr: Support Architectural LBR
  perf/x86/intel/lbr: Factor out intel_pmu_store_lbr
  perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all()
  perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline
  perf/x86/intel/lbr: Unify the stored format of LBR information
  perf/x86/intel/lbr: Support LBR_CTL
  perf/x86: Expose CPUID enumeration bits for arch LBR
  ...
This commit is contained in:
Linus Torvalds
2020-08-03 14:51:09 -07:00
32 changed files with 1943 additions and 280 deletions

View File

@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
@@ -1001,6 +1002,7 @@ struct text_poke_loc {
s32 rel32;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
u8 old;
};
struct bp_patching_desc {
@@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
for (i = 0; i < nr_entries; i++) {
tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
}
text_poke_sync();
@@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
int len = text_opcode_size(tp[i].opcode);
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
(const char *)tp[i].text + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
do_sync++;
}
/*
* Emit a perf event to record the text poke, primarily to
* support Intel PT decoding which must walk the executable code
* to reconstruct the trace. The flow up to here is:
* - write INT3 byte
* - IPI-SYNC
* - write instruction tail
* At this point the actual control flow will be through the
* INT3 and handler and not hit the old or new instruction.
* Intel PT outputs FUP/TIP packets for the INT3, so the flow
* can still be decoded. Subsequently:
* - emit RECORD_TEXT_POKE with the new instruction
* - IPI-SYNC
* - write first byte
* - IPI-SYNC
* So before the text poke event timestamp, the decoder will see
* either the old instruction flow or FUP/TIP of INT3. After the
* text poke event timestamp, the decoder will see either the
* new instruction flow or FUP/TIP of INT3. Thus decoders can
* use the timestamp as the point at which to modify the
* executable code.
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
tp[i].text, len);
}
if (do_sync) {

View File

@@ -82,6 +82,45 @@ bool irq_fpu_usable(void)
}
EXPORT_SYMBOL(irq_fpu_usable);
/*
* These must be called with preempt disabled. Returns
* 'true' if the FPU state is still intact and we can
* keep registers active.
*
* The legacy FNSAVE instruction cleared all FPU state
* unconditionally, so registers are essentially destroyed.
* Modern FPU state can be kept in registers, if there are
* no pending FP exceptions.
*/
int copy_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
copy_xregs_to_kernel(&fpu->state.xsave);
/*
* AVX512 state is tracked here because its use is
* known to slow the max clock speed of the core.
*/
if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
fpu->avx512_timestamp = jiffies;
return 1;
}
if (likely(use_fxsr())) {
copy_fxregs_to_kernel(fpu);
return 1;
}
/*
* Legacy FPU register saving, FNSAVE always clears FPU registers,
* so we have to mark them inactive:
*/
asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
return 0;
}
EXPORT_SYMBOL(copy_fpregs_to_fpstate);
void kernel_fpu_begin(void)
{
preempt_disable();

View File

@@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void)
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES))
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
xfeatures_mask_dynamic());
}
}
static bool xfeature_enabled(enum xfeature xfeature)
@@ -486,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
return ebx;
}
static int xfeature_size(int xfeature_nr)
int xfeature_size(int xfeature_nr)
{
u32 eax, ebx, ecx, edx;
@@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr)
*/
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
}
@@ -847,8 +850,10 @@ void fpu__resume_cpu(void)
* Restore IA32_XSS. The same CPUID bit enumerates support
* of XSAVES and MSR_IA32_XSS.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES))
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
xfeatures_mask_dynamic());
}
}
/*
@@ -1356,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate)
}
}
/**
* copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
* an xsave area
* @xstate: A pointer to an xsave area
* @mask: Represent the dynamic supervisor features saved into the xsave area
*
* Only the dynamic supervisor states sets in the mask are saved into the xsave
* area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
* supervisor feature). Besides the dynamic supervisor states, the legacy
* region and XSAVE header are also saved into the xsave area. The supervisor
* features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
* XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
*
* The xsave area must be 64-bytes aligned.
*/
void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
{
u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
u32 lmask, hmask;
int err;
if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
return;
if (WARN_ON_FPU(!dynamic_mask))
return;
lmask = dynamic_mask;
hmask = dynamic_mask >> 32;
XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
/* Should never fault when copying to a kernel buffer */
WARN_ON_FPU(err);
}
/**
* copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
* an xsave area
* @xstate: A pointer to an xsave area
* @mask: Represent the dynamic supervisor features restored from the xsave area
*
* Only the dynamic supervisor states sets in the mask are restored from the
* xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
* dynamic supervisor feature). Besides the dynamic supervisor states, the
* legacy region and XSAVE header are also restored from the xsave area. The
* supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
* XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
*
* The xsave area must be 64-bytes aligned.
*/
void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
{
u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
u32 lmask, hmask;
int err;
if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
return;
if (WARN_ON_FPU(!dynamic_mask))
return;
lmask = dynamic_mask;
hmask = dynamic_mask >> 32;
XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
/* Should never fault when copying from a kernel buffer */
WARN_ON_FPU(err);
}
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512

View File

@@ -33,6 +33,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
@@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p)
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
p->ainsn.tp_len = len;
perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
/* OK, write back the instruction(s) into ROX insn buffer */
text_poke(p->ainsn.insn, buf, len);
@@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
u8 int3 = INT3_INSN_OPCODE;
text_poke(p->addr, &int3, 1);
text_poke_sync();
perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
void arch_disarm_kprobe(struct kprobe *p)
{
u8 int3 = INT3_INSN_OPCODE;
perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
text_poke_sync();
}
@@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p)
void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
/* Record the perf event before freeing the slot */
perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
p->ainsn.tp_len, NULL, 0);
free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
p->ainsn.insn = NULL;
}

View File

@@ -6,6 +6,7 @@
* Copyright (C) Hitachi Ltd., 2012
*/
#include <linux/kprobes.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op,
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
if (op->optinsn.insn) {
free_optinsn_slot(op->optinsn.insn, dirty);
u8 *slot = op->optinsn.insn;
if (slot) {
int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
/* Record the perf event before freeing the slot */
if (dirty)
perf_event_text_poke(slot, slot, len, NULL, 0);
free_optinsn_slot(slot, dirty);
op->optinsn.insn = NULL;
op->optinsn.size = 0;
}
@@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
(u8 *)op->kp.addr + op->optinsn.size);
len += JMP32_INSN_SIZE;
/*
* Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
* used in __arch_remove_optimized_kprobe().
*/
/* We have to use text_poke() for instruction buffer because it is RO */
perf_event_text_poke(slot, NULL, 0, buf, len);
text_poke(slot, buf, len);
ret = 0;
out:
kfree(buf);
@@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist)
*/
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
arch_arm_kprobe(&op->kp);
text_poke(op->kp.addr + INT3_INSN_SIZE,
op->optinsn.copied_insn, DISP32_SIZE);
u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
u8 old[JMP32_INSN_SIZE];
u8 *addr = op->kp.addr;
memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
memcpy(new + INT3_INSN_SIZE,
op->optinsn.copied_insn,
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke(addr, new, INT3_INSN_SIZE);
text_poke_sync();
text_poke(addr + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke_sync();
perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
/*