Merge branch 'tip/perf/core-4' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace into perf/core

This commit is contained in:
Ingo Molnar
2012-05-07 11:03:52 +02:00
12 changed files with 815 additions and 388 deletions

View File

@@ -24,40 +24,21 @@
#include <trace/syscall.h>
#include <asm/cacheflush.h>
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
#include <asm/nmi.h>
#ifdef CONFIG_DYNAMIC_FTRACE
/*
* modifying_code is set to notify NMIs that they need to use
* memory barriers when entering or exiting. But we don't want
* to burden NMIs with unnecessary memory barriers when code
* modification is not being done (which is most of the time).
*
* A mutex is already held when ftrace_arch_code_modify_prepare
* and post_process are called. No locks need to be taken here.
*
* Stop machine will make sure currently running NMIs are done
* and new NMIs will see the updated variable before we need
* to worry about NMIs doing memory barriers.
*/
static int modifying_code __read_mostly;
static DEFINE_PER_CPU(int, save_modifying_code);
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
set_all_modules_text_rw();
modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
modifying_code = 0;
set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
/*
* Modifying code must take extra care. On an SMP machine, if
* the code being modified is also being executed on another CPU
* that CPU will have undefined results and possibly take a GPF.
* We use kstop_machine to stop other CPUS from exectuing code.
* But this does not stop NMIs from happening. We still need
* to protect against that. We separate out the modification of
* the code to take care of this.
*
* Two buffers are added: An IP buffer and a "code" buffer.
*
* 1) Put the instruction pointer into the IP buffer
* and the new code into the "code" buffer.
* 2) Wait for any running NMIs to finish and set a flag that says
* we are modifying code, it is done in an atomic operation.
* 3) Write the code
* 4) clear the flag.
* 5) Wait for any running NMIs to finish.
*
* If an NMI is executed, the first thing it does is to call
* "ftrace_nmi_enter". This will check if the flag is set to write
* and if it is, it will write what is in the IP and "code" buffers.
*
* The trick is, it does not matter if everyone is writing the same
* content to the code location. Also, if a CPU is executing code
* it is OK to write to that code location if the contents being written
* are the same as what exists.
*/
#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
static atomic_t nmi_running = ATOMIC_INIT(0);
static int mod_code_status; /* holds return value of text write */
static void *mod_code_ip; /* holds the IP to write to */
static const void *mod_code_newcode; /* holds the text to write to the IP */
static unsigned nmi_wait_count;
static atomic_t nmi_update_count = ATOMIC_INIT(0);
int ftrace_arch_read_dyn_info(char *buf, int size)
{
int r;
r = snprintf(buf, size, "%u %u",
nmi_wait_count,
atomic_read(&nmi_update_count));
return r;
}
static void clear_mod_flag(void)
{
int old = atomic_read(&nmi_running);
for (;;) {
int new = old & ~MOD_CODE_WRITE_FLAG;
if (old == new)
break;
old = atomic_cmpxchg(&nmi_running, old, new);
}
}
static void ftrace_mod_code(void)
{
/*
* Yes, more than one CPU process can be writing to mod_code_status.
* (and the code itself)
* But if one were to fail, then they all should, and if one were
* to succeed, then they all should.
*/
mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
MCOUNT_INSN_SIZE);
/* if we fail, then kill any new writers */
if (mod_code_status)
clear_mod_flag();
}
void ftrace_nmi_enter(void)
{
__this_cpu_write(save_modifying_code, modifying_code);
if (!__this_cpu_read(save_modifying_code))
return;
if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
smp_rmb();
ftrace_mod_code();
atomic_inc(&nmi_update_count);
}
/* Must have previous changes seen before executions */
smp_mb();
}
void ftrace_nmi_exit(void)
{
if (!__this_cpu_read(save_modifying_code))
return;
/* Finish all executions before clearing nmi_running */
smp_mb();
atomic_dec(&nmi_running);
}
static void wait_for_nmi_and_set_mod_flag(void)
{
if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
return;
do {
cpu_relax();
} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
nmi_wait_count++;
}
static void wait_for_nmi(void)
{
if (!atomic_read(&nmi_running))
return;
do {
cpu_relax();
} while (atomic_read(&nmi_running));
nmi_wait_count++;
}
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
mod_code_ip = (void *)ip;
mod_code_newcode = new_code;
/* The buffers need to be visible before we let NMIs write them */
smp_mb();
wait_for_nmi_and_set_mod_flag();
/* Make sure all running NMIs have finished before we write the code */
smp_mb();
ftrace_mod_code();
/* Make sure the write happens before clearing the bit */
smp_mb();
clear_mod_flag();
wait_for_nmi();
return mod_code_status;
return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
static const unsigned char *ftrace_nop_replace(void)
@@ -334,6 +168,347 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ret;
}
int modifying_ftrace_code __read_mostly;
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
* We are either changing a nop into a trace call, or a trace
* call to a nop. While the change is taking place, we treat
* it just like it was a nop.
*/
int ftrace_int3_handler(struct pt_regs *regs)
{
if (WARN_ON_ONCE(!regs))
return 0;
if (!ftrace_location(regs->ip - 1))
return 0;
regs->ip += MCOUNT_INSN_SIZE - 1;
return 1;
}
static int ftrace_write(unsigned long ip, const char *val, int size)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
* CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
* of the kernel text mapping to modify the kernel text.
*
* For 32bit kernels, these mappings are same and we can use
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
return probe_kernel_write((void *)ip, val, size);
}
static int add_break(unsigned long ip, const char *old)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
unsigned char brk = BREAKPOINT_INSTRUCTION;
if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
/* Make sure it is what we expect it to be */
if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
if (ftrace_write(ip, &brk, 1))
return -EPERM;
return 0;
}
static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned const char *old;
unsigned long ip = rec->ip;
old = ftrace_call_replace(ip, addr);
return add_break(rec->ip, old);
}
static int add_brk_on_nop(struct dyn_ftrace *rec)
{
unsigned const char *old;
old = ftrace_nop_replace();
return add_break(rec->ip, old);
}
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
ret = ftrace_test_record(rec, enable);
ftrace_addr = (unsigned long)FTRACE_ADDR;
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return add_brk_on_nop(rec);
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
}
return 0;
}
/*
* On error, we need to remove breakpoints. This needs to
* be done caefully. If the address does not currently have a
* breakpoint, we know we are done. Otherwise, we look at the
* remaining 4 bytes of the instruction. If it matches a nop
* we replace the breakpoint with the nop. Otherwise we replace
* it with the call instruction.
*/
static int remove_breakpoint(struct dyn_ftrace *rec)
{
unsigned char ins[MCOUNT_INSN_SIZE];
unsigned char brk = BREAKPOINT_INSTRUCTION;
const unsigned char *nop;
unsigned long ftrace_addr;
unsigned long ip = rec->ip;
/* If we fail the read, just give up */
if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
/* If this does not have a breakpoint, we are done */
if (ins[0] != brk)
return -1;
nop = ftrace_nop_replace();
/*
* If the last 4 bytes of the instruction do not match
* a nop, then we assume that this is a call to ftrace_addr.
*/
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
/*
* For extra paranoidism, we check if the breakpoint is on
* a call that would actually jump to the ftrace_addr.
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
ftrace_addr = (unsigned long)FTRACE_ADDR;
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
return probe_kernel_write((void *)ip, &nop[0], 1);
}
static int add_update_code(unsigned long ip, unsigned const char *new)
{
/* skip breakpoint */
ip++;
new++;
if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
return -EPERM;
return 0;
}
static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_call_replace(ip, addr);
return add_update_code(ip, new);
}
static int add_update_nop(struct dyn_ftrace *rec)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_nop_replace();
return add_update_code(ip, new);
}
static int add_update(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
ret = ftrace_test_record(rec, enable);
ftrace_addr = (unsigned long)FTRACE_ADDR;
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return add_update_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_update_nop(rec);
}
return 0;
}
static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_call_replace(ip, addr);
if (ftrace_write(ip, new, 1))
return -EPERM;
return 0;
}
static int finish_update_nop(struct dyn_ftrace *rec)
{
unsigned long ip = rec->ip;
unsigned const char *new;
new = ftrace_nop_replace();
if (ftrace_write(ip, new, 1))
return -EPERM;
return 0;
}
static int finish_update(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
ret = ftrace_update_record(rec, enable);
ftrace_addr = (unsigned long)FTRACE_ADDR;
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return finish_update_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return finish_update_nop(rec);
}
return 0;
}
static void do_sync_core(void *data)
{
sync_core();
}
static void run_sync(void)
{
int enable_irqs = irqs_disabled();
/* We may be called with interrupts disbled (on bootup). */
if (enable_irqs)
local_irq_enable();
on_each_cpu(do_sync_core, NULL, 1);
if (enable_irqs)
local_irq_disable();
}
static void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
const char *report = "adding breakpoints";
int count = 0;
int ret;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
ret = add_breakpoints(rec, enable);
if (ret)
goto remove_breakpoints;
count++;
}
run_sync();
report = "updating code";
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
ret = add_update(rec, enable);
if (ret)
goto remove_breakpoints;
}
run_sync();
report = "removing breakpoints";
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
ret = finish_update(rec, enable);
if (ret)
goto remove_breakpoints;
}
run_sync();
return;
remove_breakpoints:
ftrace_bug(ret, rec ? rec->ip : 0);
printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
remove_breakpoint(rec);
}
}
void arch_ftrace_update_code(int command)
{
modifying_ftrace_code++;
if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
if (command & FTRACE_UPDATE_TRACE_FUNC)
ftrace_update_ftrace_func(ftrace_trace_function);
if (command & FTRACE_START_FUNC_RET)
ftrace_enable_ftrace_graph_caller();
else if (command & FTRACE_STOP_FUNC_RET)
ftrace_disable_ftrace_graph_caller();
modifying_ftrace_code--;
}
int __init ftrace_dyn_arch_init(void *data)
{
/* The return code is retured via data */

View File

@@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -209,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
static notrace __kprobes void
static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -236,7 +236,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
@@ -263,7 +263,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
static __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;

View File

@@ -50,6 +50,7 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
@@ -303,8 +304,13 @@ gp_in_kernel:
}
/* May run on IST stack. */
dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
/* ftrace must be first, everything else may cause a recursive crash */
if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
return;
#endif
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)