Merge branch 'perf/uprobes' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/uprobes

This commit is contained in:
Ingo Molnar
2012-05-14 14:43:40 +02:00
629 changed files with 8302 additions and 5149 deletions

View File

@@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event)
/* mark unused */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
/* mark not used */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
event->hw.branch_reg.idx = EXTRA_REG_NONE;
return x86_pmu.hw_config(event);
@@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
data.period = event->hw.last_period;
perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;

View File

@@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event)
static int amd_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
int ret;
/* pass precise event sampling to ibs: */
if (event->attr.precise_ip && get_ibs_caps())
return -ENOENT;
ret = x86_pmu_hw_config(event);
if (ret)
return ret;
@@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
if (nb->owners[i] == event) {
cmpxchg(nb->owners+i, event, NULL);
if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
}
}
}

View File

@@ -9,6 +9,7 @@
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
#include <asm/apic.h>
@@ -16,36 +17,591 @@ static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
static struct pmu perf_ibs;
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <asm/nmi.h>
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
enum ibs_states {
IBS_ENABLED = 0,
IBS_STARTED = 1,
IBS_STOPPING = 2,
IBS_MAX_STATES,
};
struct cpu_perf_ibs {
struct perf_event *event;
unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
};
struct perf_ibs {
struct pmu pmu;
unsigned int msr;
u64 config_mask;
u64 cnt_mask;
u64 enable_mask;
u64 valid_mask;
u64 max_period;
unsigned long offset_mask[1];
int offset_max;
struct cpu_perf_ibs __percpu *pcpu;
u64 (*get_count)(u64 config);
};
struct perf_ibs_data {
u32 size;
union {
u32 data[0]; /* data buffer starts here */
u32 caps;
};
u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
};
static int
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int overflow = 0;
/*
* If we are way outside a reasonable range then just skip forward:
*/
if (unlikely(left <= -period)) {
left = period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
if (unlikely(left < (s64)min)) {
left += period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
/*
* If the hw period that triggers the sw overflow is too short
* we might hit the irq handler. This biases the results.
* Thus we shorten the next-to-last period and set the last
* period to the max period.
*/
if (left > max) {
left -= max;
if (left > max)
left = max;
else if (left < min)
left = min;
}
*hw_period = (u64)left;
return overflow;
}
static int
perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
{
struct hw_perf_event *hwc = &event->hw;
int shift = 64 - width;
u64 prev_raw_count;
u64 delta;
/*
* Careful: an NMI might modify the previous event value.
*
* Our tactic to handle this is to first atomically read and
* exchange a new raw count - then add that new-prev delta
* count to the generic event atomically:
*/
prev_raw_count = local64_read(&hwc->prev_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
return 0;
/*
* Now we have the new raw value and have updated the prev
* timestamp already. We can now calculate the elapsed delta
* (event-)time and add that to the generic event.
*
* Careful, not all hw sign-extends above the physical width
* of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
local64_add(delta, &event->count);
local64_sub(delta, &hwc->period_left);
return 1;
}
static struct perf_ibs perf_ibs_fetch;
static struct perf_ibs perf_ibs_op;
static struct perf_ibs *get_ibs_pmu(int type)
{
if (perf_ibs_fetch.pmu.type == type)
return &perf_ibs_fetch;
if (perf_ibs_op.pmu.type == type)
return &perf_ibs_op;
return NULL;
}
/*
* Use IBS for precise event sampling:
*
* perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
* perf record -a -e r076:p ... # same as -e cpu-cycles:p
* perf record -a -e r0C1:p ... # use ibs op counting micro-ops
*
* IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
* MSRC001_1033) is used to select either cycle or micro-ops counting
* mode.
*
* The rip of IBS samples has skid 0. Thus, IBS supports precise
* levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
* rip is invalid when IBS was not able to record the rip correctly.
* We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
*
*/
static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
{
switch (event->attr.precise_ip) {
case 0:
return -ENOENT;
case 1:
case 2:
break;
default:
return -EOPNOTSUPP;
}
switch (event->attr.type) {
case PERF_TYPE_HARDWARE:
switch (event->attr.config) {
case PERF_COUNT_HW_CPU_CYCLES:
*config = 0;
return 0;
}
break;
case PERF_TYPE_RAW:
switch (event->attr.config) {
case 0x0076:
*config = 0;
return 0;
case 0x00C1:
*config = IBS_OP_CNT_CTL;
return 0;
}
break;
default:
return -ENOENT;
}
return -EOPNOTSUPP;
}
static int perf_ibs_init(struct perf_event *event)
{
if (perf_ibs.type != event->attr.type)
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
u64 max_cnt, config;
int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
if (perf_ibs) {
config = event->attr.config;
} else {
perf_ibs = &perf_ibs_op;
ret = perf_ibs_precise_event(event, &config);
if (ret)
return ret;
}
if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
if (config & ~perf_ibs->config_mask)
return -EINVAL;
if (hwc->sample_period) {
if (config & perf_ibs->cnt_mask)
/* raw max_cnt may not be set */
return -EINVAL;
if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
/*
* lower 4 bits can not be set in ibs max cnt,
* but allowing it in case we adjust the
* sample period to set a frequency.
*/
return -EINVAL;
hwc->sample_period &= ~0x0FULL;
if (!hwc->sample_period)
hwc->sample_period = 0x10;
} else {
max_cnt = config & perf_ibs->cnt_mask;
config &= ~perf_ibs->cnt_mask;
event->attr.sample_period = max_cnt << 4;
hwc->sample_period = event->attr.sample_period;
}
if (!hwc->sample_period)
return -EINVAL;
/*
* If we modify hwc->sample_period, we also need to update
* hwc->last_period and hwc->period_left.
*/
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
hwc->config_base = perf_ibs->msr;
hwc->config = config;
return 0;
}
static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 *period)
{
int overflow;
/* ignore lower 4 bits in min count: */
overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
local64_set(&hwc->prev_count, 0);
return overflow;
}
static u64 get_ibs_fetch_count(u64 config)
{
return (config & IBS_FETCH_CNT) >> 12;
}
static u64 get_ibs_op_count(u64 config)
{
u64 count = 0;
if (config & IBS_OP_VAL)
count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
if (ibs_caps & IBS_CAPS_RDWROPCNT)
count += (config & IBS_OP_CUR_CNT) >> 32;
return count;
}
static void
perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
u64 *config)
{
u64 count = perf_ibs->get_count(*config);
/*
* Set width to 64 since we do not overflow on max width but
* instead on max count. In perf_ibs_set_period() we clear
* prev count manually on overflow.
*/
while (!perf_event_try_update(event, count, 64)) {
rdmsrl(event->hw.config_base, *config);
count = perf_ibs->get_count(*config);
}
}
static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
}
/*
* Erratum #420 Instruction-Based Sampling Engine May Generate
* Interrupt that Cannot Be Cleared:
*
* Must clear counter mask first, then clear the enable bit. See
* Revision Guide for AMD Family 10h Processors, Publication #41322.
*/
static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
config &= ~perf_ibs->cnt_mask;
wrmsrl(hwc->config_base, config);
config &= ~perf_ibs->enable_mask;
wrmsrl(hwc->config_base, config);
}
/*
* We cannot restore the ibs pmu state, so we always needs to update
* the event while stopping it and then reset the state when starting
* again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
* perf_ibs_start()/perf_ibs_stop() and instead always do it.
*/
static void perf_ibs_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 period;
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
hwc->state = 0;
perf_ibs_set_period(perf_ibs, hwc, &period);
set_bit(IBS_STARTED, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
}
static void perf_ibs_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 config;
int stopping;
stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE))
return;
rdmsrl(hwc->config_base, config);
if (stopping) {
set_bit(IBS_STOPPING, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
if (hwc->state & PERF_HES_UPTODATE)
return;
/*
* Clear valid bit to not count rollovers on update, rollovers
* are only updated in the irq handler.
*/
config &= ~perf_ibs->valid_mask;
perf_ibs_event_update(perf_ibs, event, &config);
hwc->state |= PERF_HES_UPTODATE;
}
static int perf_ibs_add(struct perf_event *event, int flags)
{
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
if (test_and_set_bit(IBS_ENABLED, pcpu->state))
return -ENOSPC;
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
pcpu->event = event;
if (flags & PERF_EF_START)
perf_ibs_start(event, PERF_EF_RELOAD);
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
return;
perf_ibs_stop(event, PERF_EF_UPDATE);
pcpu->event = NULL;
perf_event_update_userpage(event);
}
static struct pmu perf_ibs = {
.event_init= perf_ibs_init,
.add= perf_ibs_add,
.del= perf_ibs_del,
static void perf_ibs_read(struct perf_event *event) { }
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
.cnt_mask = IBS_FETCH_MAX_CNT,
.enable_mask = IBS_FETCH_ENABLE,
.valid_mask = IBS_FETCH_VAL,
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
.get_count = get_ibs_fetch_count,
};
static struct perf_ibs perf_ibs_op = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
.del = perf_ibs_del,
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
.cnt_mask = IBS_OP_MAX_CNT,
.enable_mask = IBS_OP_ENABLE,
.valid_mask = IBS_OP_VAL,
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
.get_count = get_ibs_op_count,
};
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
struct perf_event *event = pcpu->event;
struct hw_perf_event *hwc = &event->hw;
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
struct perf_ibs_data ibs_data;
int offset, size, check_rip, offset_max, throttle = 0;
unsigned int msr;
u64 *buf, *config, period;
if (!test_bit(IBS_STARTED, pcpu->state)) {
/*
* Catch spurious interrupts after stopping IBS: After
* disabling IBS there could be still incomming NMIs
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
}
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);
if (!(*buf++ & perf_ibs->valid_mask))
return 0;
config = &ibs_data.regs[0];
perf_ibs_event_update(perf_ibs, event, config);
perf_sample_data_init(&data, 0, hwc->last_period);
if (!perf_ibs_set_period(perf_ibs, hwc, &period))
goto out; /* no sw counter overflow */
ibs_data.caps = ibs_caps;
size = 1;
offset = 1;
check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
if (event->attr.sample_type & PERF_SAMPLE_RAW)
offset_max = perf_ibs->offset_max;
else if (check_rip)
offset_max = 2;
else
offset_max = 1;
do {
rdmsrl(msr + offset, *buf++);
size++;
offset = find_next_bit(perf_ibs->offset_mask,
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
ibs_data.size = sizeof(u64) * size;
regs = *iregs;
if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
regs.flags &= ~PERF_EFLAGS_EXACT;
} else {
instruction_pointer_set(&regs, ibs_data.regs[1]);
regs.flags |= PERF_EFLAGS_EXACT;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.size = sizeof(u32) + ibs_data.size;
raw.data = ibs_data.data;
data.raw = &raw;
}
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle)
perf_ibs_disable_event(perf_ibs, hwc, *config);
else
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
return 1;
}
static int __kprobes
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
int handled = 0;
handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
if (handled)
inc_irq_stat(apic_perf_irqs);
return handled;
}
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
struct cpu_perf_ibs __percpu *pcpu;
int ret;
pcpu = alloc_percpu(struct cpu_perf_ibs);
if (!pcpu)
return -ENOMEM;
perf_ibs->pcpu = pcpu;
ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
if (ret) {
perf_ibs->pcpu = NULL;
free_percpu(pcpu);
}
return ret;
}
static __init int perf_event_ibs_init(void)
{
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
perf_pmu_register(&perf_ibs, "ibs", -1);
perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
if (ibs_caps & IBS_CAPS_OPCNT)
perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;

View File

@@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
int handled;
perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1082,7 +1080,7 @@ again:
if (!intel_pmu_save_and_restart(event))
continue;
data.period = event->hw.last_period;
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;

View File

@@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)
ds->bts_index = ds->bts_buffer_base;
perf_sample_data_init(&data, 0);
data.period = event->hw.last_period;
perf_sample_data_init(&data, 0, event->hw.last_period);
regs.ip = 0;
/*
@@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
if (!intel_pmu_save_and_restart(event))
return;
perf_sample_data_init(&data, 0);
data.period = event->hw.last_period;
perf_sample_data_init(&data, 0, event->hw.last_period);
/*
* We use the interrupt regs as a base because the PEBS record

View File

@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
handled += overflow;
/* event overflow for sure */
data.period = event->hw.last_period;
perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
continue;
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}

View File

@@ -24,40 +24,21 @@
#include <trace/syscall.h>
#include <asm/cacheflush.h>
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
#include <asm/nmi.h>
#ifdef CONFIG_DYNAMIC_FTRACE
/*
* modifying_code is set to notify NMIs that they need to use
* memory barriers when entering or exiting. But we don't want
* to burden NMIs with unnecessary memory barriers when code
* modification is not being done (which is most of the time).
*
* A mutex is already held when ftrace_arch_code_modify_prepare
* and post_process are called. No locks need to be taken here.
*
* Stop machine will make sure currently running NMIs are done
* and new NMIs will see the updated variable before we need
* to worry about NMIs doing memory barriers.
*/
static int modifying_code __read_mostly;
static DEFINE_PER_CPU(int, save_modifying_code);
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
set_all_modules_text_rw();
modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
modifying_code = 0;
set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
/*
* Modifying code must take extra care. On an SMP machine, if
* the code being modified is also being executed on another CPU
* that CPU will have undefined results and possibly take a GPF.
* We use kstop_machine to stop other CPUS from exectuing code.
* But this does not stop NMIs from happening. We still need
* to protect against that. We separate out the modification of
* the code to take care of this.
*
* Two buffers are added: An IP buffer and a "code" buffer.
*
* 1) Put the instruction pointer into the IP buffer
* and the new code into the "code" buffer.
* 2) Wait for any running NMIs to finish and set a flag that says
* we are modifying code, it is done in an atomic operation.
* 3) Write the code
* 4) clear the flag.
* 5) Wait for any running NMIs to finish.
*
* If an NMI is executed, the first thing it does is to call
* "ftrace_nmi_enter". This will check if the flag is set to write
* and if it is, it will write what is in the IP and "code" buffers.
*
* The trick is, it does not matter if everyone is writing the same
* content to the code location. Also, if a CPU is executing code
* it is OK to write to that code location if the contents being written
* are the same as what exists.
*/
#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
static atomic_t nmi_running = ATOMIC_INIT(0);
static int mod_code_status; /* holds return value of text write */
static void *mod_code_ip; /* holds the IP to write to */
static const void *mod_code_newcode; /* holds the text to write to the IP */
static unsigned nmi_wait_count;
static atomic_t nmi_update_count = ATOMIC_INIT(0);
int ftrace_arch_read_dyn_info(char *buf, int size)
{
int r;
r = snprintf(buf, size, "%u %u",
nmi_wait_count,
atomic_read(&nmi_update_count));
return r;
}
static void clear_mod_flag(void)
{
int old = atomic_read(&nmi_running);
for (;;) {
int new = old & ~MOD_CODE_WRITE_FLAG;
if (old == new)
break;
old = atomic_cmpxchg(&nmi_running, old, new);
}
}
static void ftrace_mod_code(void)
{
/*
* Yes, more than one CPU process can be writing to mod_code_status.
* (and the code itself)
* But if one were to fail, then they all should, and if one were
* to succeed, then they all should.
*/
mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
MCOUNT_INSN_SIZE);
/* if we fail, then kill any new writers */
if (mod_code_status)
clear_mod_flag();
}
void ftrace_nmi_enter(void)
{
__this_cpu_write(save_modifying_code, modifying_code);
if (!__this_cpu_read(save_modifying_code))
return;
if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
smp_rmb();
ftrace_mod_code();
atomic_inc(&nmi_update_count);
}
/* Must have previous changes seen before executions */
smp_mb();
}
void ftrace_nmi_exit(void)
{
if (!__this_cpu_read(save_modifying_code))
return;
/* Finish all executions before clearing nmi_running */
smp_mb();
atomic_dec(&nmi_running);
}
static void wait_for_nmi_and_set_mod_flag(void)
{
if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
return;
do {
cpu_relax();
} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
nmi_wait_count++;
}
static void wait_for_nmi(void)
{
if (!atomic_read(&nmi_running))
return;
do {
cpu_relax();
} while (atomic_read(&nmi_running));
nmi_wait_count++;
}
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
mod_code_ip = (void *)ip;
mod_code_newcode = new_code;
/* The buffers need to be visible before we let NMIs write them */
smp_mb();
wait_for_nmi_and_set_mod_flag();
/* Make sure all running NMIs have finished before we write the code */
smp_mb();
ftrace_mod_code();
/* Make sure the write happens before clearing the bit */
smp_mb();
clear_mod_flag();
wait_for_nmi();
return mod_code_status;
return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
static const unsigned char *ftrace_nop_replace(void)
@@ -334,6 +168,347 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ret;
}
int modifying_ftrace_code __read_mostly;
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
* We are either changing a nop into a trace call, or a trace
* call to a nop. While the change is taking place, we treat
* it just like it was a nop.
*/
int ftrace_int3_handler(struct pt_regs *regs)
{
if (WARN_ON_ONCE(!regs))
return 0;
if (!ftrace_location(regs->ip - 1))
return 0;
regs->ip += MCOUNT_INSN_SIZE - 1;
return 1;
}
static int ftrace_write(unsigned long ip, const char *val, int size)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
* CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
* of the kernel text mapping to modify the kernel text.
*
* For 32bit kernels, these mappings are same and we can use
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
return probe_kernel_write((void *)ip, val, size);
}
static int add_break(unsigned long ip, const char *old)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
unsigned char brk = BREAKPOINT_INSTRUCTION;
if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
/* Make sure it is what we expect it to be */
if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
if (ftrace_write(ip, &brk, 1))
return -EPERM;
return 0;
}
static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned const char *old;
unsigned long ip = rec->ip;
old = ftrace_call_replace(ip, addr);
return add_break(rec->ip, old);
}
static int add_brk_on_nop(struct dyn_ftrace *rec)
{
unsigned const char *old;
old = ftrace_nop_replace();
return add_break(rec->ip, old);
}
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
ret = ftrace_test_record(rec, enable);
ftrace_addr = (unsigned long)FTRACE_ADDR;
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return add_brk_on_nop(rec);
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
}
return 0;
}
/*
* On error, we need to remove breakpoints. This needs to
* be done caefully. If the address does not currently have a
* breakpoint, we know we are done. Otherwise, we look at the
* remaining 4 bytes of the instruction. If it matches a nop
* we replace the breakpoint with the nop. Otherwise we replace
* it with the call instruction.
*/
static int remove_breakpoint(struct dyn_ftrace *rec)
{
unsigned char ins[MCOUNT_INSN_SIZE];
unsigned char brk = BREAKPOINT_INSTRUCTION;
const unsigned char *nop;
unsigned long ftrace_addr;
unsigned long ip = rec->ip;
/* If we fail the read, just give up */
if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
/* If this does not have a breakpoint, we are done */
if (ins[0] != brk)
return -1;
nop = ftrace_nop_replace();
/*
* If the last 4 bytes of the instruction do not match
* a nop, then we assume that this is a call to ftrace_addr.
*/
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
/*
* For extra paranoidism, we check if the breakpoint is on
* a call that would actually jump to the ftrace_addr.
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
ftrace_addr = (unsigned long)FTRACE_ADDR;
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
return probe_kernel_write((void *)ip, &nop[0], 1);
}
static int add_update_code(unsigned long ip, unsigned const char *new)
{
/* skip breakpoint */
ip++;
new++;
if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
return -EPERM;
return 0;
}
static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_call_replace(ip, addr);
return add_update_code(ip, new);
}
static int add_update_nop(struct dyn_ftrace *rec)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_nop_replace();
return add_update_code(ip, new);
}
static int add_update(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
ret = ftrace_test_record(rec, enable);
ftrace_addr = (unsigned long)FTRACE_ADDR;
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return add_update_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_update_nop(rec);
}
return 0;
}
static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_call_replace(ip, addr);
if (ftrace_write(ip, new, 1))
return -EPERM;
return 0;
}
static int finish_update_nop(struct dyn_ftrace *rec)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_nop_replace();
if (ftrace_write(ip, new, 1))
return -EPERM;
return 0;
}
static int finish_update(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
ret = ftrace_update_record(rec, enable);
ftrace_addr = (unsigned long)FTRACE_ADDR;
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return finish_update_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return finish_update_nop(rec);
}
return 0;
}
static void do_sync_core(void *data)
{
sync_core();
}
static void run_sync(void)
{
int enable_irqs = irqs_disabled();
/* We may be called with interrupts disbled (on bootup). */
if (enable_irqs)
local_irq_enable();
on_each_cpu(do_sync_core, NULL, 1);
if (enable_irqs)
local_irq_disable();
}
static void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
const char *report = "adding breakpoints";
int count = 0;
int ret;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
ret = add_breakpoints(rec, enable);
if (ret)
goto remove_breakpoints;
count++;
}
run_sync();
report = "updating code";
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
ret = add_update(rec, enable);
if (ret)
goto remove_breakpoints;
}
run_sync();
report = "removing breakpoints";
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
ret = finish_update(rec, enable);
if (ret)
goto remove_breakpoints;
}
run_sync();
return;
remove_breakpoints:
ftrace_bug(ret, rec ? rec->ip : 0);
printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
remove_breakpoint(rec);
}
}
void arch_ftrace_update_code(int command)
{
modifying_ftrace_code++;
if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
if (command & FTRACE_UPDATE_TRACE_FUNC)
ftrace_update_ftrace_func(ftrace_trace_function);
if (command & FTRACE_START_FUNC_RET)
ftrace_enable_ftrace_graph_caller();
else if (command & FTRACE_STOP_FUNC_RET)
ftrace_disable_ftrace_graph_caller();
modifying_ftrace_code--;
}
int __init ftrace_dyn_arch_init(void *data)
{
/* The return code is retured via data */

View File

@@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -209,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
static notrace __kprobes void
static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -236,7 +236,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
@@ -263,7 +263,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
static __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;

View File

@@ -50,6 +50,7 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
@@ -303,8 +304,13 @@ gp_in_kernel:
}
/* May run on IST stack. */
dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
/* ftrace must be first, everything else may cause a recursive crash */
if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
return;
#endif
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)

View File

@@ -216,9 +216,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
current_thread_info()->sig_on_uaccess_error = 1;
/*
* 0 is a valid user pointer (in the access_ok sense) on 32-bit and
* NULL is a valid user pointer (in the access_ok sense) on 32-bit and
* 64-bit, so we don't need to special-case it here. For all the
* vsyscalls, 0 means "don't write anything" not "write it at
* vsyscalls, NULL means "don't write anything" not "write it at
* address 0".
*/
ret = -EFAULT;
@@ -247,7 +247,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
0);
NULL);
break;
}