Merge commit '8700c95adb03' into timers/nohz

The full dynticks tree needs the latest RCU and sched
upstream updates in order to fix some dependencies.

Merge a common upstream merge point that has these
updates.

Conflicts:
	include/linux/perf_event.h
	kernel/rcutree.h
	kernel/rcutree_plugin.h

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Author: Frederic Weisbecker
Date: 2013-05-02 17:37:49 +02:00

3692 changed files with 150969 additions and 120861 deletions

kernel/.gitignore

@@ -4,3 +4,4 @@
config_data.h
config_data.gz
timeconst.h
hz.bc


@@ -24,6 +24,7 @@ endif
obj-y += sched/
obj-y += power/
obj-y += cpu/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o


@@ -73,7 +73,7 @@ struct async_entry {
struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
async_func_ptr *func;
async_func_t func;
void *data;
struct async_domain *domain;
};
@@ -84,24 +84,20 @@ static atomic_t entry_count;
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct async_entry *first = NULL;
struct list_head *pending;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
if (domain) {
if (!list_empty(&domain->pending))
first = list_first_entry(&domain->pending,
struct async_entry, domain_list);
} else {
if (!list_empty(&async_global_pending))
first = list_first_entry(&async_global_pending,
struct async_entry, global_list);
}
if (domain)
pending = &domain->pending;
else
pending = &async_global_pending;
if (first)
ret = first->cookie;
if (!list_empty(pending))
ret = list_first_entry(pending, struct async_entry,
domain_list)->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
ptr(data, newcookie);
func(data, newcookie);
return newcookie;
}
INIT_LIST_HEAD(&entry->domain_list);
INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = ptr;
entry->func = func;
entry->data = data;
entry->domain = domain;
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
/**
* async_schedule - schedule a function for asynchronous execution
* @ptr: function to execute asynchronously
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
async_cookie_t async_schedule(async_func_t func, void *data)
{
return __async_schedule(ptr, data, &async_dfl_domain);
return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
* @ptr: function to execute asynchronously
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @domain: the domain
*
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);
* synchronization domain is specified via @domain. Note: This function
* may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
async_cookie_t async_schedule_domain(async_func_t func, void *data,
struct async_domain *domain)
{
return __async_schedule(ptr, data, domain);
return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
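
The hunks above (kernel/async.c) replace the old async_func_ptr callback type with async_func_t throughout the async machinery; the callback signature itself is unchanged. A hedged sketch of a caller after the rename (my_init_fn and kick_off are made-up names for illustration, not part of this diff):

#include <linux/async.h>
#include <linux/kernel.h>

/* async_func_t is void (*)(void *data, async_cookie_t cookie) */
static void my_init_fn(void *data, async_cookie_t cookie)
{
	pr_info("async init for %p, cookie %llu\n",
		data, (unsigned long long)cookie);
}

static void kick_off(void *dev)
{
	/* async_schedule() now takes an async_func_t rather than async_func_ptr */
	async_cookie_t cookie = async_schedule(my_init_fn, dev);

	/* wait for all async work scheduled before this cookie to complete */
	async_synchronize_cookie(cookie);
}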


@@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
if (!kauditd_task)
if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
err = PTR_ERR(kauditd_task);
kauditd_task = NULL;
return err;
if (IS_ERR(kauditd_task)) {
err = PTR_ERR(kauditd_task);
kauditd_task = NULL;
return err;
}
}
loginuid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
security_task_getsecid(current, &sid);


@@ -59,10 +59,7 @@ struct audit_entry {
struct audit_krule rule;
};
#ifdef CONFIG_AUDIT
extern int audit_enabled;
extern int audit_ever_enabled;
#endif
extern int audit_pid;


@@ -617,9 +617,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);


@@ -594,6 +594,10 @@ exit_nofree:
return entry;
exit_free:
if (entry->rule.watch)
audit_put_watch(entry->rule.watch); /* matches initial get */
if (entry->rule.tree)
audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}


@@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
static inline void audit_zero_context(struct audit_context *context,
enum audit_state state)
{
memset(context, 0, sizeof(*context));
context->state = state;
context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
audit_zero_context(context, state);
context->state = state;
context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
return context;


@@ -392,6 +392,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
}
EXPORT_SYMBOL(ns_capable);
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if task that opened the file had a capability in effect
* when the file was opened.
*
* This does not set PF_SUPERPRIV because the caller may not
* actually be privileged.
*/
bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
{
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
if (security_capable(file->f_cred, ns, cap) == 0)
return true;
return false;
}
EXPORT_SYMBOL(file_ns_capable);
/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for

File diff suppressed because it is too large.

kernel/cpu/Makefile (new file)

@@ -0,0 +1 @@
obj-y = idle.o

kernel/cpu/idle.c (new file)

@@ -0,0 +1,107 @@
/*
* Generic entry point for the idle threads
*/
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <asm/tlb.h>
#include <trace/events/power.h>
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
{
if (enable) {
cpu_idle_force_poll++;
} else {
cpu_idle_force_poll--;
WARN_ON_ONCE(cpu_idle_force_poll < 0);
}
}
#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
static int __init cpu_idle_poll_setup(char *__unused)
{
cpu_idle_force_poll = 1;
return 1;
}
__setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
cpu_idle_force_poll = 0;
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
#endif
static inline int cpu_idle_poll(void)
{
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
return 1;
}
/* Weak implementations for optional arch specific functions */
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
}
/*
* Generic idle loop implementation
*/
static void cpu_idle_loop(void)
{
while (1) {
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
if (cpu_is_offline(smp_processor_id()))
arch_cpu_idle_dead();
local_irq_disable();
arch_cpu_idle_enter();
if (cpu_idle_force_poll) {
cpu_idle_poll();
} else {
current_clr_polling();
if (!need_resched()) {
stop_critical_timings();
rcu_idle_enter();
arch_cpu_idle();
WARN_ON_ONCE(irqs_disabled());
rcu_idle_exit();
start_critical_timings();
} else {
local_irq_enable();
}
current_set_polling();
}
arch_cpu_idle_exit();
}
tick_nohz_idle_exit();
schedule_preempt_disabled();
}
}
void cpu_startup_entry(enum cpuhp_state state)
{
current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
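
kernel/cpu/idle.c introduces a generic idle loop so that an architecture only supplies the arch_cpu_idle*() hooks and an entry call. A minimal sketch of the arch side under that model (hypothetical_secondary_start is a made-up name; only arch_cpu_idle() and cpu_startup_entry() come from the code above, and CPUHP_ONLINE is the state arch code typically passes here):

#include <linux/cpu.h>
#include <linux/irqflags.h>

/*
 * Override the weak hook. The generic loop calls it with IRQs disabled and
 * warns if they are still disabled on return, so enable them here before
 * the low-power wait.
 */
void arch_cpu_idle(void)
{
	local_irq_enable();
	/* e.g. a wait-for-interrupt instruction would go here */
}

/* A secondary CPU's bring-up path ends by entering the generic loop. */
static void hypothetical_secondary_start(void)
{
	cpu_startup_entry(CPUHP_ONLINE);
}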


@@ -264,17 +264,6 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
* cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
* buffers. They are statically allocated to prevent using excess stack
* when calling cpuset_print_task_mems_allowed().
*/
#define CPUSET_NAME_LEN (128)
#define CPUSET_NODELIST_LEN (256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
* CPU / memory hotplug is handled asynchronously.
*/
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
/*
* We have raced with CPU hotplug. Don't do anything to avoid
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
out:
put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
*domains = NULL;
return 1;
}
#endif /* CONFIG_SMP */
void rebuild_sched_domains(void)
@@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, cgrp, tset) {
/*
* Kthreads bound to specific cpus cannot be moved to a new
* cpuset; we cannot change their cpu affinity and
* isolating such threads by their set of allowed nodes is
* unnecessary. Thus, cpusets are not applicable for such
* threads. This prevents checking for success of
* set_cpus_allowed_ptr() on all attached tasks before
* cpus_allowed may be changed.
* Kthreads which disallow setaffinity shouldn't be moved
* to a new cpuset; we don't want to change their cpu
* affinity and isolating such threads by their set of
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_allowed may be changed.
*/
ret = -EINVAL;
if (task->flags & PF_THREAD_BOUND)
if (task->flags & PF_NO_SETAFFINITY)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
return 0;
}
/**
* cpuset_do_move_task - move a given task to another cpuset
* @tsk: pointer to task_struct the task to move
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
* Called by cgroup_scan_tasks() for each task in a cgroup.
* Return nonzero to stop the walk through the tasks.
*/
static void cpuset_do_move_task(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
struct cgroup *new_cgroup = scan->data;
cgroup_lock();
cgroup_attach_task(new_cgroup, tsk);
cgroup_unlock();
}
/**
* move_member_tasks_to_cpuset - move tasks from one cpuset to another
* @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved
*
* Called with cpuset_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it.
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
struct cgroup_scanner scan;
scan.cg = from->css.cgroup;
scan.test_task = NULL; /* select all tasks in cgroup */
scan.process_task = cpuset_do_move_task;
scan.heap = NULL;
scan.data = to->css.cgroup;
if (cgroup_scan_tasks(&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
nodes_empty(parent->mems_allowed))
parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
rcu_read_lock();
printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
cgroup_name(cs->css.cgroup));
rcu_read_unlock();
}
}
/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
flush_workqueue(cpuset_propagate_hotplug_wq);
/* rebuild sched domains if cpus_allowed has changed */
if (cpus_updated) {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
mutex_lock(&cpuset_mutex);
ndoms = generate_sched_domains(&doms, &attr);
mutex_unlock(&cpuset_mutex);
partition_sched_domains(ndoms, doms, attr);
}
if (cpus_updated)
rebuild_sched_domains();
}
void cpuset_update_active_cpus(bool cpu_online)
@@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)
schedule_work(&cpuset_hotplug_work);
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
#endif
static struct notifier_block cpuset_track_online_nodes_nb = {
.notifier_call = cpuset_track_online_nodes,
.priority = 10, /* ??! */
};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
**/
*/
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_MEMORY];
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
cpuset_propagate_hotplug_wq =
alloc_ordered_workqueue("cpuset_hotplug", 0);
@@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
#define CPUSET_NODELIST_LEN (256)
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @task: pointer to task_struct of some task.
@@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
struct dentry *dentry;
/* Statically allocated to prevent using excess stack. */
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
dentry = task_cs(tsk)->css.cgroup->dentry;
struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
rcu_read_lock();
spin_lock(&cpuset_buffer_lock);
if (!dentry) {
strcpy(cpuset_name, "/");
} else {
spin_lock(&dentry->d_lock);
strlcpy(cpuset_name, (const char *)dentry->d_name.name,
CPUSET_NAME_LEN);
spin_unlock(&dentry->d_lock);
}
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
tsk->comm, cpuset_name, cpuset_nodelist);
tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
spin_unlock(&cpuset_buffer_lock);
rcu_read_unlock();
}
/*


@@ -38,6 +38,7 @@
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/cgroup.h>
#include "internal.h"
@@ -234,6 +235,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
/*
* perf_cgroup_info keeps track of time_enabled for a cgroup.
* This is a per-cpu dynamically allocated data structure.
*/
struct perf_cgroup_info {
u64 time;
u64 timestamp;
};
struct perf_cgroup {
struct cgroup_subsys_state css;
struct perf_cgroup_info __percpu *info;
};
/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
@@ -252,7 +267,22 @@ perf_cgroup_match(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
return !event->cgrp || event->cgrp == cpuctx->cgrp;
/* @event doesn't care about cgroup */
if (!event->cgrp)
return true;
/* wants specific cgroup scope but @cpuctx isn't associated with any */
if (!cpuctx->cgrp)
return false;
/*
* Cgroup scoping is recursive. An event enabled for a cgroup is
* also enabled for all its descendant cgroups. If @cpuctx's
* cgroup is a descendant of @event's (the test covers identity
* case), it's a match.
*/
return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
event->cgrp->css.cgroup);
}
static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -966,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_WEIGHT)
size += sizeof(data->weight);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
event->header_size = size;
}
@@ -4193,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
if (sample_type & PERF_SAMPLE_WEIGHT)
perf_output_put(handle, data->weight);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4449,12 +4491,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
perf_event_task_ctx(ctx, task_event);
}
if (ctx)
perf_event_task_ctx(ctx, task_event);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
if (task_event->task_ctx)
perf_event_task_ctx(task_event->task_ctx, task_event);
rcu_read_unlock();
}
@@ -4608,6 +4653,7 @@ void perf_event_comm(struct task_struct *task)
struct perf_event_context *ctx;
int ctxn;
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
@@ -4615,6 +4661,7 @@ void perf_event_comm(struct task_struct *task)
perf_event_enable_on_exec(ctx);
}
rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -4749,7 +4796,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
} else {
if (arch_vma_name(mmap_event->vma)) {
name = strncpy(tmp, arch_vma_name(mmap_event->vma),
sizeof(tmp));
sizeof(tmp) - 1);
tmp[sizeof(tmp) - 1] = '\0';
goto got_name;
}
@@ -4776,6 +4824,9 @@ got_name:
mmap_event->file_name = name;
mmap_event->file_size = size;
if (!(vma->vm_flags & VM_EXEC))
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
rcu_read_lock();
@@ -5342,7 +5393,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
int event_id = event->attr.config;
u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
@@ -5662,6 +5713,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5997,6 +6049,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
@@ -7524,12 +7577,5 @@ struct cgroup_subsys perf_subsys = {
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
/*
* perf_event cgroup doesn't handle nesting correctly.
* ctx->nr_cgroups adjustments should be propagated through the
* cgroup hierarchy. Fix it and remove the following.
*/
.broken_hierarchy = true,
};
#endif /* CONFIG_CGROUP_PERF */


@@ -16,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
int writable; /* are we writable */
int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */


@@ -18,12 +18,24 @@
static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
unsigned long offset, unsigned long head)
{
unsigned long mask;
unsigned long sz = perf_data_size(rb);
unsigned long mask = sz - 1;
if (!rb->writable)
/*
* check if user-writable
* overwrite : over-write its own tail
* !overwrite: buffer possibly drops events.
*/
if (rb->overwrite)
return true;
mask = perf_data_size(rb) - 1;
/*
* verify that payload is not bigger than buffer
* otherwise masking logic may fail to detect
* the "not enough space" condition
*/
if ((head - offset) > sz)
return false;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
rb->writable = 1;
rb->overwrite = 0;
else
rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
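
Replacing rb->writable with rb->overwrite in kernel/events/ring_buffer.c inverts the flag's sense: a user-writable mapping means the kernel must drop events rather than overwrite unread data, while a read-only mapping lets the buffer overwrite its own tail. Which mode you get is chosen purely by the mmap protection; a hedged userspace-side sketch (the event type, period and page count are arbitrary illustration values):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *map_sample_buffer(int data_pages)	/* data_pages: power of two */
{
	struct perf_event_attr attr;
	long page = sysconf(_SC_PAGESIZE);
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 100000;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return NULL;

	/*
	 * PROT_READ|PROT_WRITE maps to RING_BUFFER_WRITABLE, i.e.
	 * rb->overwrite == 0: the kernel honours data_tail and drops events
	 * when the buffer is full. PROT_READ alone gives an overwriting buffer.
	 */
	return mmap(NULL, (data_pages + 1) * page, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
}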


@@ -75,6 +75,15 @@ struct uprobe {
struct arch_uprobe arch;
};
struct return_instance {
struct uprobe *uprobe;
unsigned long func;
unsigned long orig_ret_vaddr; /* original return address */
bool chained; /* true, if instance is nested */
struct return_instance *next; /* keep as stack */
};
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
return *insn == UPROBE_SWBP_INSN;
}
static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
/**
* is_trap_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_trap_insn
* Returns true if @insn is a breakpoint instruction.
*
* This function is needed for the case where an architecture has multiple
* trap instructions (like powerpc).
*/
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
return is_swbp_insn(insn);
}
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
kunmap_atomic(kaddr);
}
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
kunmap_atomic(kaddr);
}
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
uprobe_opcode_t old_opcode;
bool is_swbp;
copy_opcode(page, vaddr, &old_opcode);
/*
* Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
* We do not check if it is any other 'trap variant' which could
* be conditional trap instruction such as the one powerpc supports.
*
* The logic is that we do not care if the underlying instruction
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
* supported by that architecture then we need to modify is_swbp_at_addr and
* supported by that architecture then we need to modify is_trap_at_addr and
* write_opcode accordingly. This would never be a problem for archs that
* have fixed length instructions.
*/
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
void *vaddr_old, *vaddr_new;
struct vm_area_struct *vma;
int ret;
@@ -246,15 +284,8 @@ retry:
__SetPageUptodate(new_page);
/* copy the page now that we've got it stable */
vaddr_old = kmap_atomic(old_page);
vaddr_new = kmap_atomic(new_page);
memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
kunmap_atomic(vaddr_new);
kunmap_atomic(vaddr_old);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = anon_vma_prepare(vma);
if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
unsigned long nbytes, loff_t offset)
{
struct page *page;
void *vaddr;
unsigned long off;
pgoff_t idx;
if (!filp)
return -EINVAL;
if (!mapping->a_ops->readpage)
return -EIO;
idx = offset >> PAGE_CACHE_SHIFT;
off = offset & ~PAGE_MASK;
/*
* Ensure that the page that has the original instruction is
* populated and in page-cache.
*/
page = read_mapping_page(mapping, idx, filp);
page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
if (IS_ERR(page))
return PTR_ERR(page);
vaddr = kmap_atomic(page);
memcpy(insn, vaddr + off, nbytes);
kunmap_atomic(vaddr);
copy_from_page(page, offset, insn, nbytes);
page_cache_release(page);
return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
goto out;
ret = -ENOTSUPP;
if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
down_write(&mm->mmap_sem);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
vma->vm_file->f_mapping->host != uprobe->inode)
file_inode(vma->vm_file) != uprobe->inode)
goto unlock;
if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
loff_t offset;
if (!valid_vma(vma, false) ||
vma->vm_file->f_mapping->host != uprobe->inode)
file_inode(vma->vm_file) != uprobe->inode)
continue;
offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
inode = vma->vm_file->f_mapping->host;
inode = file_inode(vma->vm_file);
if (!inode)
return 0;
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
struct inode *inode;
struct rb_node *n;
inode = vma->vm_file->f_mapping->host;
inode = file_inode(vma->vm_file);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
{
struct mm_struct *mm = current->mm;
struct xol_area *area;
uprobe_opcode_t insn = UPROBE_SWBP_INSN;
area = mm->uprobes_state.xol_area;
if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
if (!area->page)
goto free_bitmap;
/* allocate first slot of task's xol_area for the return probes */
set_bit(0, area->bitmap);
copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
atomic_set(&area->slot_count, 1);
init_waitqueue_head(&area->wq);
if (!xol_add_vma(area))
return area;
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
unsigned long offset;
unsigned long xol_vaddr;
void *vaddr;
area = get_xol_area();
if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
return 0;
/* Initialize the slot */
offset = xol_vaddr & ~PAGE_MASK;
vaddr = kmap_atomic(area->page);
memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
kunmap_atomic(vaddr);
copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
struct return_instance *ri, *tmp;
if (!utask)
return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
if (utask->active_uprobe)
put_uprobe(utask->active_uprobe);
ri = utask->return_instances;
while (ri) {
tmp = ri;
ri = ri->next;
put_uprobe(tmp->uprobe);
kfree(tmp);
}
xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
/*
* Current area->vaddr notion assume the trampoline address is always
* equal area->vaddr.
*
* Returns -1 in case the xol_area is not allocated.
*/
static unsigned long get_trampoline_vaddr(void)
{
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
area = current->mm->uprobes_state.xol_area;
smp_read_barrier_depends();
if (area)
trampoline_vaddr = area->vaddr;
return trampoline_vaddr;
}
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
struct return_instance *ri;
struct uprobe_task *utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
bool chained = false;
if (!get_xol_area())
return;
utask = get_utask();
if (!utask)
return;
if (utask->depth >= MAX_URETPROBE_DEPTH) {
printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
" nestedness limit pid/tgid=%d/%d\n",
current->pid, current->tgid);
return;
}
ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
goto fail;
trampoline_vaddr = get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
goto fail;
/*
* We don't want to keep trampoline address in stack, rather keep the
* original return address of first caller thru all the consequent
* instances. This also makes breakpoint unwrapping easier.
*/
if (orig_ret_vaddr == trampoline_vaddr) {
if (!utask->return_instances) {
/*
* This situation is not possible. Likely we have an
* attack from user-space.
*/
pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
current->pid, current->tgid);
goto fail;
}
chained = true;
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
atomic_inc(&uprobe->ref);
ri->uprobe = uprobe;
ri->func = instruction_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
/* add instance to the stack */
ri->next = utask->return_instances;
utask->return_instances = ri;
return;
fail:
kfree(ri);
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
struct page *page;
uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
copy_opcode(page, vaddr, &opcode);
copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
return is_swbp_insn(&opcode);
/* This needs to return true for any variant of the trap insn */
return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
vma = find_vma(mm, bp_vaddr);
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
struct inode *inode = vma->vm_file->f_mapping->host;
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
}
if (!uprobe)
*is_swbp = is_swbp_at_addr(mm, bp_vaddr);
*is_swbp = is_trap_at_addr(mm, bp_vaddr);
} else {
*is_swbp = -EFAULT;
}
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
int rc = uc->handler(uc, regs);
int rc = 0;
if (uc->handler) {
rc = uc->handler(uc, regs);
WARN(rc & ~UPROBE_HANDLER_MASK,
"bad rc=0x%x from %pf()\n", rc, uc->handler);
}
if (uc->ret_handler)
need_prep = true;
WARN(rc & ~UPROBE_HANDLER_MASK,
"bad rc=0x%x from %pf()\n", rc, uc->handler);
remove &= rc;
}
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
if (remove && uprobe->consumers) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
up_read(&uprobe->register_rwsem);
}
static void
handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
up_read(&uprobe->register_rwsem);
}
static bool handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
struct return_instance *ri, *tmp;
bool chained;
utask = current->utask;
if (!utask)
return false;
ri = utask->return_instances;
if (!ri)
return false;
/*
* TODO: we should throw out return_instance's invalidated by
* longjmp(), currently we assume that the probed function always
* returns.
*/
instruction_pointer_set(regs, ri->orig_ret_vaddr);
for (;;) {
handle_uretprobe_chain(ri, regs);
chained = ri->chained;
put_uprobe(ri->uprobe);
tmp = ri;
ri = ri->next;
kfree(tmp);
if (!chained)
break;
utask->depth--;
BUG_ON(!ri);
}
utask->return_instances = ri;
return true;
}
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (bp_vaddr == get_trampoline_vaddr()) {
if (handle_trampoline(regs))
return;
pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
current->pid, current->tgid);
}
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
if (!current->mm)
return 0;
if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
(!current->utask || !current->utask->return_instances))
return 0;
set_thread_flag(TIF_UPROBE);
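
Most of the uprobes changes above (kernel/events/uprobes.c) wire up return probes: a consumer may now provide a ret_handler, the XOL area reserves slot 0 as the return trampoline, and uprobe_register() rejects consumers that set neither handler. A hedged sketch of a consumer using the new field (the handler names are made up; only the struct fields and the -EINVAL rule come from this diff):

#include <linux/uprobes.h>

static int my_entry_handler(struct uprobe_consumer *uc, struct pt_regs *regs)
{
	/* hit when the probed instruction executes; 0 keeps the probe installed */
	return 0;
}

static int my_ret_handler(struct uprobe_consumer *uc, unsigned long func,
			  struct pt_regs *regs)
{
	/* hit when the probed function returns; func is its entry address */
	return 0;
}

static struct uprobe_consumer my_consumer = {
	.handler	= my_entry_handler,
	.ret_handler	= my_ret_handler,
};

/*
 * uprobe_register(inode, offset, &my_consumer) accepts this consumer;
 * with both handlers NULL it now returns -EINVAL, per the check added above.
 */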


@@ -835,7 +835,7 @@ void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
debug_check_no_locks_held(tsk);
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done


@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* If unsharing a user namespace must also unshare the thread.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD;
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing a pid namespace must also unshare the thread.
*/


@@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
* Returns a negative error code or 0
* Return: a negative error code or 0
*
* The key words are stored in *key on success.
*
* For shared mappings, it's (page->index, file_inode(vma->vm_file),
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Returns:
* 0 - ready to wait
* 1 - acquired the lock
* Return:
* 0 - ready to wait;
* 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
* Returns:
* 0 - failed to acquire the lock atomicly
* 1 - acquired the lock
* Return:
* 0 - failed to acquire the lock atomically;
* 1 - acquired the lock;
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
* Returns:
* >=0 - on success, the number of tasks requeued or woken
* Return:
* >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
* Returns:
* 1 - if the futex_q was still queued (and we removed unqueued it)
* Return:
* 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
* Returns:
* 1 - success, lock taken
* 0 - success, lock not taken
* Return:
* 1 - success, lock taken;
* 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
* Returns:
* 0 - uaddr contains val and hb has been locked
* Return:
* 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
* Returns
* 0 - no early wakeup detected
* <0 - -ETIMEDOUT or -ERESTARTNOINTR
* Return:
* 0 = no early wakeup detected;
* <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
* @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
* via the following:
* via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
* Returns:
* 0 - On success
* Return:
* 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,


@@ -63,6 +63,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
raw_spin_lock_init(&cpu_base->lock);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);


@@ -55,7 +55,7 @@ struct resource crashk_res = {
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource crashk_low_res = {
.name = "Crash kernel low",
.name = "Crash kernel",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
{
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
free_page((unsigned long)__va(addr));
totalram_pages++;
}
for (addr = begin; addr < end; addr += PAGE_SIZE)
free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1368,35 +1364,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
return 0;
}
#define SUFFIX_HIGH 0
#define SUFFIX_LOW 1
#define SUFFIX_NULL 2
static __initdata char *suffix_tbl[] = {
[SUFFIX_HIGH] = ",high",
[SUFFIX_LOW] = ",low",
[SUFFIX_NULL] = NULL,
};
/*
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
* That function parses "suffix" crashkernel command lines like
*
* crashkernel=size,[high|low]
*
* It returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_suffix(char *cmdline,
unsigned long long *crash_size,
unsigned long long *crash_base,
const char *suffix)
{
char *cur = cmdline;
*crash_size = memparse(cmdline, &cur);
if (cmdline == cur) {
pr_warn("crashkernel: memory value expected\n");
return -EINVAL;
}
/* check with suffix */
if (strncmp(cur, suffix, strlen(suffix))) {
pr_warn("crashkernel: unrecognized char\n");
return -EINVAL;
}
cur += strlen(suffix);
if (*cur != ' ' && *cur != '\0') {
pr_warn("crashkernel: unrecognized char\n");
return -EINVAL;
}
return 0;
}
static __init char *get_last_crashkernel(char *cmdline,
const char *name,
const char *suffix)
{
char *p = cmdline, *ck_cmdline = NULL;
/* find crashkernel and use the last one if there are more */
p = strstr(p, name);
while (p) {
char *end_p = strchr(p, ' ');
char *q;
if (!end_p)
end_p = p + strlen(p);
if (!suffix) {
int i;
/* skip the one with any known suffix */
for (i = 0; suffix_tbl[i]; i++) {
q = end_p - strlen(suffix_tbl[i]);
if (!strncmp(q, suffix_tbl[i],
strlen(suffix_tbl[i])))
goto next;
}
ck_cmdline = p;
} else {
q = end_p - strlen(suffix);
if (!strncmp(q, suffix, strlen(suffix)))
ck_cmdline = p;
}
next:
p = strstr(p+1, name);
}
if (!ck_cmdline)
return NULL;
return ck_cmdline;
}
static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
const char *name)
const char *name,
const char *suffix)
{
char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
/* find crashkernel and use the last one if there are more */
p = strstr(p, name);
while (p) {
ck_cmdline = p;
p = strstr(p+1, name);
}
ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
ck_cmdline += strlen(name);
if (suffix)
return parse_crashkernel_suffix(ck_cmdline, crash_size,
crash_base, suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1413,13 +1488,26 @@ static int __init __parse_crashkernel(char *cmdline,
return 0;
}
/*
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
*/
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
"crashkernel=");
"crashkernel=", NULL);
}
int __init parse_crashkernel_high(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
}
int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1516,7 @@ int __init parse_crashkernel_low(char *cmdline,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
"crashkernel_low=");
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
static void update_vmcoreinfo_note(void)
@@ -1489,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmlist);
VMCOREINFO_SYMBOL(vmap_area_list);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
@@ -1527,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vm_struct, addr);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);


@@ -794,16 +794,16 @@ out:
}
#ifdef CONFIG_SYSCTL
/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
return;
goto out;
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
optimize_kprobe(p);
}
printk(KERN_INFO "Kprobes globally optimized\n");
out:
mutex_unlock(&kprobe_mutex);
}
/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
if (!kprobes_allow_optimization)
if (!kprobes_allow_optimization) {
mutex_unlock(&kprobe_mutex);
return;
}
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
unoptimize_kprobe(p, false);
}
}
mutex_unlock(&kprobe_mutex);
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
{
int ret;
mutex_lock(&kprobe_mutex);
mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
mutex_unlock(&kprobe_mutex);
mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}


@@ -52,8 +52,21 @@ enum KTHREAD_BITS {
KTHREAD_IS_PARKED,
};
#define to_kthread(tsk) \
container_of((tsk)->vfork_done, struct kthread, exited)
#define __to_kthread(vfork) \
container_of(vfork, struct kthread, exited)
static inline struct kthread *to_kthread(struct task_struct *k)
{
return __to_kthread(k->vfork_done);
}
static struct kthread *to_live_kthread(struct task_struct *k)
{
struct completion *vfork = ACCESS_ONCE(k->vfork_done);
if (likely(vfork))
return __to_kthread(vfork);
return NULL;
}
/**
* kthread_should_stop - should this kthread return now?
@@ -124,12 +137,12 @@ void *kthread_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
__set_current_state(TASK_INTERRUPTIBLE);
__set_current_state(TASK_PARKED);
while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
complete(&self->parked);
schedule();
__set_current_state(TASK_INTERRUPTIBLE);
__set_current_state(TASK_PARKED);
}
clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
@@ -256,11 +269,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind(struct task_struct *p, unsigned int cpu)
static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
/* Must have done schedule() in kthread() before we set_task_cpu */
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
/* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
p->flags |= PF_NO_SETAFFINITY;
}
/**
@@ -274,12 +292,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
/* Must have done schedule() in kthread() before we set_task_cpu */
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
WARN_ON(1);
return;
}
__kthread_bind(p, cpu);
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
@@ -311,17 +324,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
return p;
}
static struct kthread *task_get_live_kthread(struct task_struct *k)
static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
{
struct kthread *kthread;
get_task_struct(k);
kthread = to_kthread(k);
/* It might have exited */
barrier();
if (k->vfork_done != NULL)
return kthread;
return NULL;
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* We clear the IS_PARKED bit here as we don't wait
* until the task has left the park code. So if we'd
* park before that happens we'd see the IS_PARKED bit
* which might be about to be cleared.
*/
if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
wake_up_state(k, TASK_PARKED);
}
}
/**
@@ -334,23 +350,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
*/
void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = task_get_live_kthread(k);
struct kthread *kthread = to_live_kthread(k);
if (kthread) {
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* We clear the IS_PARKED bit here as we don't wait
* until the task has left the park code. So if we'd
* park before that happens we'd see the IS_PARKED bit
* which might be about to be cleared.
*/
if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu);
wake_up_process(k);
}
}
put_task_struct(k);
if (kthread)
__kthread_unpark(k, kthread);
}
/**
@@ -367,7 +370,7 @@ void kthread_unpark(struct task_struct *k)
*/
int kthread_park(struct task_struct *k)
{
struct kthread *kthread = task_get_live_kthread(k);
struct kthread *kthread = to_live_kthread(k);
int ret = -ENOSYS;
if (kthread) {
@@ -380,7 +383,6 @@ int kthread_park(struct task_struct *k)
}
ret = 0;
}
put_task_struct(k);
return ret;
}
@@ -401,21 +403,23 @@ int kthread_park(struct task_struct *k)
*/
int kthread_stop(struct task_struct *k)
{
struct kthread *kthread = task_get_live_kthread(k);
struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
get_task_struct(k);
kthread = to_live_kthread(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
__kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
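
The kernel/kthread.c changes replace task_get_live_kthread() with to_live_kthread(), park threads in TASK_PARKED, rebind per-cpu threads while still parked, and make kthread_stop() unpark before stopping. A hedged sketch of the caller-facing park API these paths serve (my_worker and its loop are illustrative, not from this diff):

#include <linux/err.h>
#include <linux/kthread.h>

static int my_worker(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park())
			kthread_parkme();	/* sleeps in TASK_PARKED until unparked */
		/* ... per-iteration work ... */
	}
	return 0;
}

static void example(void)
{
	struct task_struct *t = kthread_run(my_worker, NULL, "my_worker");

	if (!IS_ERR(t)) {
		kthread_park(t);	/* returns once the thread has parked */
		kthread_unpark(t);	/* rebinds per-cpu threads, then wakes it */
		kthread_stop(t);	/* now unparks first if still parked */
	}
}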


@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
printk(KERN_DEBUG "turning off the locking correctness validator.\n");
printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
}
static int save_trace(struct stack_trace *trace)
{
trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
if (!debug_locks_off_graph_unlock())
return 0;
printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
printk("turning off the locking correctness validator.\n");
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
}
raw_local_irq_restore(flags);
printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
printk("turning off the locking correctness validator.\n");
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
return NULL;
}
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
printk("turning off the locking correctness validator.\n");
print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
return NULL;
}
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr, *hlock_next;
struct held_lock *hlock_curr;
int i, j;
/*
@@ -2048,8 +2052,7 @@ cache_hit:
if (!debug_locks_off_graph_unlock())
return 0;
printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
printk("turning off the locking correctness validator.\n");
print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
return 0;
}
@@ -2057,12 +2060,10 @@ cache_hit:
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
hlock_next = hlock;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
if (hlock_curr->irq_context != hlock_next->irq_context)
if (hlock_curr->irq_context != hlock->irq_context)
break;
hlock_next = hlock;
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
printk("turning off the locking correctness validator.\n");
lockdep_print_held_locks(current);
debug_show_all_locks();
@@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
static void print_held_locks_bug(void)
static void print_held_locks_bug(struct task_struct *curr)
{
if (!debug_locks_off())
return;
@@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void)
printk("\n");
printk("=====================================\n");
printk("[ BUG: %s/%d still has locks held! ]\n",
current->comm, task_pid_nr(current));
printk("[ BUG: lock held at task exit time! ]\n");
print_kernel_ident();
printk("-------------------------------------\n");
lockdep_print_held_locks(current);
printk("%s/%d is exiting with locks still held!\n",
curr->comm, task_pid_nr(curr));
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
dump_stack();
}
void debug_check_no_locks_held(void)
void debug_check_no_locks_held(struct task_struct *task)
{
if (unlikely(current->lockdep_depth > 0))
print_held_locks_bug();
if (unlikely(task->lockdep_depth > 0))
print_held_locks_bug(task);
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
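Hedged usage sketch (not from this patch): with the new task argument, a teardown path would presumably pass the task being destroyed explicitly instead of relying on current; the caller below is hypothetical.
#include <linux/sched.h>
#include <linux/debug_locks.h>
/* Hypothetical exit-style path for task 'tsk'. */
static void example_task_teardown(struct task_struct *tsk)
{
	/* Under lockdep, warn if 'tsk' still holds any locks. */
	debug_check_no_locks_held(tsk);
}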
void debug_show_all_locks(void)
{

View File

@@ -37,6 +37,12 @@
# include <asm/mutex.h>
#endif
/*
* A negative mutex count indicates that waiters are sleeping waiting for the
* mutex.
*/
#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
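For reference, the count convention this macro tests (a restatement of the existing mutex design, not new behavior introduced here):
/*
 *   lock->count == 1   ->  unlocked
 *   lock->count == 0   ->  locked, no waiters known
 *   lock->count  < 0   ->  locked, at least one waiter sleeping
 *
 * So MUTEX_SHOW_NO_WAITER() is true exactly when count >= 0, i.e. no
 * task is known to be sleeping on the wait list.
 */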
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
lock->spin_mlock = NULL;
#endif
debug_mutex_init(lock, name, key);
}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
* In order to avoid a stampede of mutex spinners from acquiring the mutex
* more or less simultaneously, the spinners need to acquire an MCS lock
* before spinning on the owner field.
*
* We don't inline mspin_lock() so that perf can correctly account for the
* time spent in this lock function.
*/
struct mspin_node {
struct mspin_node *next;
int locked; /* 1 if lock acquired */
};
#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
static noinline
void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
{
struct mspin_node *prev;
/* Init node */
node->locked = 0;
node->next = NULL;
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
node->locked = 1;
return;
}
ACCESS_ONCE(prev->next) = node;
smp_wmb();
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
}
static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
{
struct mspin_node *next = ACCESS_ONCE(node->next);
if (likely(!next)) {
/*
* Release the lock by setting it to NULL
*/
if (cmpxchg(lock, node, NULL) == node)
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
}
ACCESS_ONCE(next->locked) = 1;
smp_wmb();
}
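A rough userspace analogue of this MCS-style queue lock (a sketch using C11 atomics, not the kernel code above), to make the enqueue and hand-off protocol explicit:
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;
};
static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *prev;
	atomic_store(&node->next, NULL);
	atomic_store(&node->locked, false);
	prev = atomic_exchange(lock, node);	/* enqueue ourselves at the tail */
	if (!prev)
		return;				/* lock was free: we own it */
	atomic_store(&prev->next, node);	/* link behind our predecessor */
	while (!atomic_load(&node->locked))	/* wait for the hand-off */
		;
}
static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load(&node->next);
	if (!next) {
		struct mcs_node *expected = node;
		/* No known successor: try to mark the lock free. */
		if (atomic_compare_exchange_strong(lock, &expected, NULL))
			return;
		/* A successor is enqueueing; wait for its link to appear. */
		while (!(next = atomic_load(&node->next)))
			;
	}
	atomic_store(&next->locked, true);	/* hand the lock off */
}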
/*
* Mutex spinning code migrated from kernel/sched/core.c
*/
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
if (lock->owner != owner)
return false;
/*
* Ensure we emit the owner->on_cpu dereference _after_ checking that
* lock->owner still matches owner. If that fails, owner might point
* to free()d memory; if it still matches, the rcu_read_lock()
* ensures the memory stays valid.
*/
barrier();
return owner->on_cpu;
}
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
static noinline
int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
break;
arch_mutex_cpu_relax();
}
rcu_read_unlock();
/*
* We break out the loop above on need_resched() and when the
* owner changed, which is a sign for heavy contention. Return
* success only when lock->owner is NULL.
*/
return lock->owner == NULL;
}
/*
* Initial check for entering the mutex spinning loop
*/
static inline int mutex_can_spin_on_owner(struct mutex *lock)
{
int retval = 1;
rcu_read_lock();
if (lock->owner)
retval = lock->owner->on_cpu;
rcu_read_unlock();
/*
* If lock->owner is not set, the owner may have just acquired the
* mutex and not yet set the owner field, or the mutex has been released.
*/
return retval;
}
#endif
static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
*
* The mutex spinners are queued up using an MCS lock so that only one
* spinner can compete for the mutex. However, if mutex spinning isn't
* going to happen, there is no point in going through the lock/unlock
* overhead.
*/
if (!mutex_can_spin_on_owner(lock))
goto slowpath;
for (;;) {
struct task_struct *owner;
struct mspin_node node;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner))
if (owner && !mutex_spin_on_owner(lock, owner)) {
mspin_unlock(MLOCK(lock), &node);
break;
}
if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
if ((atomic_read(&lock->count) == 1) &&
(atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
mspin_unlock(MLOCK(lock), &node);
preempt_enable();
return 0;
}
mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
arch_mutex_cpu_relax();
}
slowpath:
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
if (atomic_xchg(&lock->count, -1) == 1)
if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
goto done;
lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
if (atomic_xchg(&lock->count, -1) == 1)
if (MUTEX_SHOW_NO_WAITER(lock) &&
(atomic_xchg(&lock->count, -1) == 1))
break;
/*

View File

@@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int nr;
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (pid_ns->nr_hashed == 1)
if (pid_ns->nr_hashed == init_pids)
break;
schedule();
}

View File

@@ -49,13 +49,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
/*
* Architectures can override it:
*/
void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
{
}
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
@@ -63,8 +56,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
DECLARE_WAIT_QUEUE_HEAD(log_wait);
int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -224,6 +215,7 @@ struct log {
static DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
@@ -609,7 +601,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
ret = POLLIN|POLLRDNORM;
else
ret = POLLIN|POLLRDNORM;
}
raw_spin_unlock_irq(&logbuf_lock);
@@ -1266,7 +1259,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
{
struct console *con;
trace_console(text, 0, len, len);
trace_console(text, len);
if (level >= console_loglevel && !ignore_loglevel)
return;
@@ -1724,6 +1717,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;
void early_vprintk(const char *fmt, va_list ap)
{
if (early_console) {
char buf[512];
int n = vscnprintf(buf, sizeof(buf), fmt, ap);
early_console->write(early_console, buf, n);
}
}
asmlinkage void early_printk(const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
early_vprintk(fmt, ap);
va_end(ap);
}
#endif
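A hedged sketch of how an architecture typically hooks into this (not part of the patch; the UART routine and console name are hypothetical, and it assumes early_console is visible to arch code):
#include <linux/console.h>
#include <linux/init.h>
/* Hypothetical board-specific polled output routine. */
static void my_uart_write(struct console *con, const char *s, unsigned int n)
{
	/* push n bytes of s to the hardware FIFO here */
}
static struct console my_early_console = {
	.name	= "earlymy",
	.write	= my_uart_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,
	.index	= -1,
};
static void __init my_setup_early_console(void)
{
	early_console = &my_early_console;
	register_console(&my_early_console);
}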
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
{
@@ -1957,45 +1973,6 @@ int is_console_locked(void)
return console_locked;
}
/*
* Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_BUF_SIZE 512
#define PRINTK_PENDING_WAKEUP 0x01
#define PRINTK_PENDING_SCHED 0x02
static DEFINE_PER_CPU(int, printk_pending);
static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = __this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_SCHED) {
char *buf = __get_cpu_var(printk_sched_buf);
printk(KERN_WARNING "[sched_delayed] %s", buf);
}
if (pending & PRINTK_PENDING_WAKEUP)
wake_up_interruptible(&log_wait);
}
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
.func = wake_up_klogd_work_func,
.flags = IRQ_WORK_LAZY,
};
void wake_up_klogd(void)
{
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
}
preempt_enable();
}
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2458,6 +2435,44 @@ static int __init printk_late_init(void)
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
/*
* Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_BUF_SIZE 512
#define PRINTK_PENDING_WAKEUP 0x01
#define PRINTK_PENDING_SCHED 0x02
static DEFINE_PER_CPU(int, printk_pending);
static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = __this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_SCHED) {
char *buf = __get_cpu_var(printk_sched_buf);
printk(KERN_WARNING "[sched_delayed] %s", buf);
}
if (pending & PRINTK_PENDING_WAKEUP)
wake_up_interruptible(&log_wait);
}
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
.func = wake_up_klogd_work_func,
.flags = IRQ_WORK_LAZY,
};
void wake_up_klogd(void)
{
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
}
preempt_enable();
}
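For reference, the underlying irq_work pattern in a minimal hedged sketch (names hypothetical): queue a small callback from a context where waking waiters directly is unsafe, and let it run later from the irq_work path.
#include <linux/irq_work.h>
static void my_deferred_fn(struct irq_work *work)
{
	/* runs later, outside the restricted context that queued it */
}
static struct irq_work my_deferred = {
	.func = my_deferred_fn,
};
static void my_poke(void)
{
	/* Safe from hard/soft irq and scheduler context; requeues coalesce. */
	irq_work_queue(&my_deferred);
}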
int printk_sched(const char *fmt, ...)
{

View File

@@ -64,7 +64,7 @@
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
#define RCU_STATE_INITIALIZER(sname, cr) { \
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
.level = { &sname##_state.node[0] }, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
.abbr = sabbr, \
}
struct rcu_state rcu_sched_state =
RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_gp_in_progress(rsp))
return 0; /* No, a grace period is already in progress. */
if (rcu_nocb_needs_gp(rsp))
return 1; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
return 0; /* No, this is a no-CBs (or offline) CPU. */
if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -1045,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp)
{
int i;
if (init_nocb_callback_list(rdp))
return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
init_nocb_callback_list(rdp);
}
/*
@@ -1080,6 +1086,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
return rnp->completed + 2;
}
/*
* Trace-event helper function for rcu_start_future_gp() and
* rcu_nocb_wait_gp().
*/
static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long c, char *s)
{
trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
rnp->completed, c, rnp->level,
rnp->grplo, rnp->grphi, s);
}
/*
* Start some future grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
* rcu_node structure's ->need_future_gp field.
*
* The caller must hold the specified rcu_node structure's ->lock.
*/
static unsigned long __maybe_unused
rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
{
unsigned long c;
int i;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
/*
* Pick up grace-period number for new callbacks. If this
* grace period is already marked as needed, return to the caller.
*/
c = rcu_cbs_completed(rdp->rsp, rnp);
trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
if (rnp->need_future_gp[c & 0x1]) {
trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
return c;
}
/*
* If either this rcu_node structure or the root rcu_node structure
* believe that a grace period is in progress, then we must wait
* for the one following, which is in "c". Because our request
* will be noticed at the end of the current grace period, we don't
* need to explicitly start one.
*/
if (rnp->gpnum != rnp->completed ||
ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
return c;
}
/*
* There might be no grace period in progress. If we don't already
* hold it, acquire the root rcu_node structure's lock in order to
* start one (if needed).
*/
if (rnp != rnp_root)
raw_spin_lock(&rnp_root->lock);
/*
* Get a new grace-period number. If there really is no grace
* period in progress, it will be smaller than the one we obtained
* earlier. Adjust callbacks as needed. Note that even no-CBs
* CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
*/
c = rcu_cbs_completed(rdp->rsp, rnp_root);
for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
rdp->nxtcompleted[i] = c;
/*
* If the need for the required grace period is already
* recorded, trace and leave.
*/
if (rnp_root->need_future_gp[c & 0x1]) {
trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
goto unlock_out;
}
/* Record the need for the future grace period. */
rnp_root->need_future_gp[c & 0x1]++;
/* If a grace period is not already in progress, start one. */
if (rnp_root->gpnum != rnp_root->completed) {
trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
} else {
trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
}
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock(&rnp_root->lock);
return c;
}
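A toy model (illustrative only) of the two-slot ->need_future_gp[] ring used above: requests are keyed by the low bit of the target grace-period number, which suffices because only the grace period after the current one can be requested.
static int need_future_gp[2];
static void toy_request_future_gp(unsigned long c)
{
	need_future_gp[c & 0x1]++;		/* record a request for GP "c" */
}
static int toy_future_gp_cleanup(unsigned long completed)
{
	need_future_gp[completed & 0x1] = 0;		/* GP "completed" is done */
	return need_future_gp[(completed + 1) & 0x1];	/* anything still wanted? */
}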
/*
* Clean up any old requests for the just-ended grace period. Also return
* whether any additional grace periods have been requested. Also invoke
* rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
* waiting for this grace period to complete.
*/
static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
int c = rnp->completed;
int needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1];
trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
return needmore;
}
/*
* If there is room, assign a ->completed number to any callbacks on
* this CPU that have not already been assigned. Also accelerate any
@@ -1139,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
rdp->nxtcompleted[i] = c;
}
/* Record any needed additional grace periods. */
rcu_start_future_gp(rnp, rdp);
/* Trace depending on how much we were able to accelerate. */
if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1318,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
WARN_ON_ONCE(rnp->completed != rsp->completed);
rnp->completed = rsp->completed;
ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1329,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
#ifdef CONFIG_PROVE_RCU_DELAY
if ((random32() % (rcu_num_nodes * 8)) == 0)
if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
system_state == SYSTEM_RUNNING)
schedule_timeout_uninterruptible(2);
#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
@@ -1371,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1400,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq(&rnp->lock);
rnp->completed = rsp->gpnum;
ACCESS_ONCE(rnp->completed) = rsp->gpnum;
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
__rcu_process_gp_end(rsp, rnp, rdp);
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched();
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
rcu_nocb_gp_set(rnp, nocb);
rsp->completed = rsp->gpnum; /* Declare grace period done. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
if (cpu_needs_another_gp(rsp, rdp))
rsp->gp_flags = 1;
raw_spin_unlock_irq(&rnp->lock);
@@ -1486,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
* the root node's ->lock, which is released before return. Hard irqs must
* be disabled.
* the root node's ->lock and hard irqs must be disabled.
*
* Note that it is legal for a dying CPU (which is marked as offline) to
* invoke this function. This can happen when the dying CPU reports its
* quiescent state.
*/
static void
rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
if (!rsp->gp_kthread ||
!cpu_needs_another_gp(rsp, rdp)) {
if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
* task, this CPU does not need another grace period,
* or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
/*
* Because there is no grace period in progress right now,
* any callbacks we have up to this point will be satisfied
* by the next grace period. So this is a good place to
* assign a grace period number to recently posted callbacks.
*/
rcu_accelerate_cbs(rsp, rnp, rdp);
rsp->gp_flags = RCU_GP_FLAG_INIT;
raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
/* Ensure that CPU is aware of completion of last grace period. */
rcu_process_gp_end(rsp, rdp);
local_irq_restore(flags);
/* Wake up rcu_gp_kthread() to start the grace period. */
wake_up(&rsp->gp_wq);
}
/*
* Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
* callbacks. Note that rcu_start_gp_advanced() cannot do this because it
* is invoked indirectly from rcu_advance_cbs(), which would result in
* endless recursion -- or would do so if it wasn't for the self-deadlock
* that is encountered beforehand.
*/
static void
rcu_start_gp(struct rcu_state *rsp)
{
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
/*
* If there is no grace period in progress right now, any
* callbacks we have up to this point will be satisfied by the
* next grace period. Also, advancing the callbacks reduces the
* probability of false positives from cpu_needs_another_gp()
* resulting in pointless grace periods. So, advance callbacks
* then start the grace period!
*/
rcu_advance_cbs(rsp, rnp, rdp);
rcu_start_gp_advanced(rsp, rnp, rdp);
}
/*
* Report a full set of quiescent states to the specified rcu_state
* data structure. This involves cleaning up after the prior grace
* period and letting rcu_start_gp() start up the next grace period
* if one is needed. Note that the caller must hold rnp->lock, as
* required by rcu_start_gp(), which will release it.
* if one is needed. Note that the caller must hold rnp->lock, which
* is released before return.
*/
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
@@ -2134,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
rcu_start_gp(rsp, flags); /* releases above lock */
rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
} else {
local_irq_restore(flags);
}
@@ -2179,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
static void invoke_rcu_core(void)
{
raise_softirq(RCU_SOFTIRQ);
if (cpu_online(smp_processor_id()))
raise_softirq(RCU_SOFTIRQ);
}
/*
@@ -2214,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
unsigned long nestflag;
struct rcu_node *rnp_root = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
raw_spin_lock(&rnp_root->lock);
rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
@@ -2638,19 +2775,27 @@ static int rcu_pending(int cpu)
}
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so.
* Return true if the specified CPU has any callback. If all_lazy is
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
static int rcu_cpu_has_callbacks(int cpu)
static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
{
bool al = true;
bool hc = false;
struct rcu_data *rdp;
struct rcu_state *rsp;
/* RCU callbacks either ready or pending? */
for_each_rcu_flavor(rsp)
if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
return 1;
return 0;
for_each_rcu_flavor(rsp) {
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rdp->qlen != rdp->qlen_lazy)
al = false;
if (rdp->nxtlist)
hc = true;
}
if (all_lazy)
*all_lazy = al;
return hc;
}
/*
@@ -2869,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
/* Add CPU to rcu_node bitmasks. */
@@ -2919,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
int ret = NOTIFY_OK;
trace_rcu_utilization("Start CPU hotplug");
switch (action) {
@@ -2933,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
if (nocb_cpu_expendable(cpu))
rcu_boost_kthread_setaffinity(rnp, cpu);
else
ret = NOTIFY_BAD;
rcu_boost_kthread_setaffinity(rnp, cpu);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
* The whole machine is "stopped" except this CPU, so we can
* touch any data without introducing corruption. We send the
* dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
rcu_cleanup_after_idle(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -2960,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
break;
}
trace_rcu_utilization("End CPU hotplug");
return ret;
return NOTIFY_OK;
}
/*
@@ -3095,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
}
}
@@ -3180,8 +3315,7 @@ void __init rcu_init(void)
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
rcu_init_nocb();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because

View File

@@ -88,18 +88,13 @@ struct rcu_dynticks {
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
#ifdef CONFIG_RCU_FAST_NO_HZ
int dyntick_drain; /* Prepare-for-idle state variable. */
unsigned long dyntick_holdoff;
/* No retries for the jiffy of failure. */
struct timer_list idle_gp_timer;
/* Wake up CPU sleeping with callbacks. */
unsigned long idle_gp_timer_expires;
/* When to wake up CPU (for repost). */
bool idle_first_pass; /* First pass of attempt to go idle? */
bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
/* # times non-lazy CBs posted to CPU. */
unsigned long nonlazy_posted_snap;
/* idle-period nonlazy_posted snapshot. */
unsigned long last_accelerate;
/* Last jiffy CBs were accelerated. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
@@ -134,9 +129,6 @@ struct rcu_node {
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
/* Since this has meaning only for leaf */
/* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
@@ -328,6 +326,11 @@ struct rcu_data {
struct task_struct *nocb_kthread;
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned int softirq_snap; /* Snapshot of softirq activity. */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
int cpu;
struct rcu_state *rsp;
};
@@ -375,12 +378,6 @@ struct rcu_state {
struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
#ifdef CONFIG_RCU_NOCB_CPU
void (*call_remote)(struct rcu_head *head,
void (*func)(struct rcu_head *head));
/* call_rcu() flavor, but for */
/* placing on remote CPU. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* The following fields are guarded by the root rcu_node's lock. */
@@ -443,6 +440,7 @@ struct rcu_state {
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
};
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static int rcu_nocb_needs_gp(struct rcu_state *rsp);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy);
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp);
static bool nocb_cpu_expendable(int cpu);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
static void init_nocb_callback_list(struct rcu_data *rdp);
static void __init rcu_init_nocb(void);
static void rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
#endif /* #ifndef RCU_TREE_NONCORE */

View File

@@ -86,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void)
if (nr_cpu_ids != NR_CPUS)
printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_NOCB_CPU
#ifndef CONFIG_RCU_NOCB_CPU_NONE
if (!have_rcu_nocb_mask) {
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
have_rcu_nocb_mask = true;
}
#ifdef CONFIG_RCU_NOCB_CPU_ZERO
pr_info("\tExperimental no-CBs CPU 0\n");
cpumask_set_cpu(0, rcu_nocb_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
#ifdef CONFIG_RCU_NOCB_CPU_ALL
pr_info("\tExperimental no-CBs for all CPUs\n");
cpumask_setall(rcu_nocb_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
if (have_rcu_nocb_mask) {
if (cpumask_test_cpu(0, rcu_nocb_mask)) {
cpumask_clear_cpu(0, rcu_nocb_mask);
pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
}
cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
if (rcu_nocb_poll)
@@ -102,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_state rcu_preempt_state =
RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
@@ -1534,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
return rcu_cpu_has_callbacks(cpu);
}
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
*/
static void rcu_prepare_for_idle_init(int cpu)
{
return rcu_cpu_has_callbacks(cpu, NULL);
}
/*
@@ -1578,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void)
*
* The following three preprocessor symbols control this state machine:
*
* RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
* to satisfy RCU. Beyond this point, it is better to incur a periodic
* scheduling-clock interrupt than to loop through the state machine
* at full power.
* RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
* optional if RCU does not need anything immediately from this
* CPU, even if this CPU still has RCU callbacks queued. The first
* times through the state machine are mandatory: we need to give
* the state machine a chance to communicate a quiescent state
* to the RCU core.
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
* is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1603,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void)
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
extern int tick_nohz_enabled;
/*
* Does the specified flavor of RCU have non-lazy callbacks pending on
* the specified CPU? Both RCU flavor and CPU are specified by the
* rcu_data structure.
* Try to advance callbacks for all flavors of RCU on the current CPU.
* Afterwards, if there are any callbacks ready for immediate invocation,
* return true.
*/
static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
static bool rcu_try_advance_all_cbs(void)
{
return rdp->qlen != rdp->qlen_lazy;
}
bool cbs_ready = false;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp;
#ifdef CONFIG_TREE_PREEMPT_RCU
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
rnp = rdp->mynode;
/*
* Are there non-lazy RCU-preempt callbacks? (There cannot be if there
* is no RCU-preempt in the kernel.)
*/
static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
/*
* Don't bother checking unless a grace period has
* completed since we last checked and there are
* callbacks not yet ready to invoke.
*/
if (rdp->completed != rnp->completed &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
rcu_process_gp_end(rsp, rdp);
return __rcu_cpu_has_nonlazy_callbacks(rdp);
}
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
{
return 0;
}
#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
/*
* Does any flavor of RCU have non-lazy callbacks on the specified CPU?
*/
static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
{
return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
__rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
if (cpu_has_callbacks_ready_to_invoke(rdp))
cbs_ready = true;
}
return cbs_ready;
}
/*
* Allow the CPU to enter dyntick-idle mode if either: (1) There are no
* callbacks on this CPU, (2) this CPU has not yet attempted to enter
* dyntick-idle mode, or (3) this CPU is in the process of attempting to
* enter dyntick-idle mode. Otherwise, if we have recently tried and failed
* to enter dyntick-idle mode, we refuse to try to enter it. After all,
* it is better to incur scheduling-clock interrupts than to spin
* continuously for the same time duration!
* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
* to invoke. If the CPU has callbacks, try to advance them. Tell the
* caller to set the timeout based on whether or not there are non-lazy
* callbacks.
*
* The delta_jiffies argument is used to store the time when RCU is
* going to need the CPU again if it still has callbacks. The reason
* for this is that rcu_prepare_for_idle() might need to post a timer,
* but if so, it will do so after tick_nohz_stop_sched_tick() has set
* the wakeup time for this CPU. This means that RCU's timer can be
* delayed until the wakeup time, which defeats the purpose of posting
* a timer.
* The caller must have disabled interrupts.
*/
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
int rcu_needs_cpu(int cpu, unsigned long *dj)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
/* Flag a new idle sojourn to the idle-entry state machine. */
rdtp->idle_first_pass = 1;
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(cpu)) {
*delta_jiffies = ULONG_MAX;
if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
*dj = ULONG_MAX;
return 0;
}
if (rdtp->dyntick_holdoff == jiffies) {
/* RCU recently tried and failed, so don't try again. */
*delta_jiffies = 1;
/* Attempt to advance callbacks. */
if (rcu_try_advance_all_cbs()) {
/* Some ready to invoke, so initiate later invocation. */
invoke_rcu_core();
return 1;
}
/* Set up for the possibility that RCU will post a timer. */
if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
*delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
RCU_IDLE_GP_DELAY) - jiffies;
rdtp->last_accelerate = jiffies;
/* Request timer delay depending on laziness, and round. */
if (!rdtp->all_lazy) {
*dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
} else {
*delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
*delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
return 0;
}
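A hedged worked example of the rounding above, for the non-lazy case with rcu_idle_gp_delay == 4 (values illustrative):
/*
 *   jiffies = 1000:  round_up(1004, 4) - 1000 = 4 jiffies
 *   jiffies = 1001:  round_up(1005, 4) - 1001 = 7 jiffies
 *   jiffies = 1003:  round_up(1007, 4) - 1003 = 5 jiffies
 *
 * The wakeup always lands on a jiffy that is a multiple of the delay,
 * so idle CPUs with pending callbacks tend to wake on the same tick
 * instead of each forcing its own scheduling-clock interrupt.
 */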
/*
* Handler for smp_call_function_single(). The only point of this
* handler is to wake the CPU up, so the handler does only tracing.
*/
void rcu_idle_demigrate(void *unused)
{
trace_rcu_prep_idle("Demigrate");
}
/*
* Timer handler used to force CPU to start pushing its remaining RCU
* callbacks in the case where it entered dyntick-idle mode with callbacks
* pending. The hander doesn't really need to do anything because the
* real work is done upon re-entry to idle, or by the next scheduling-clock
* interrupt should idle not be re-entered.
*
* One special case: the timer gets migrated without awakening the CPU
* on which the timer was scheduled on. In this case, we must wake up
* that CPU. We do so with smp_call_function_single().
*/
static void rcu_idle_gp_timer_func(unsigned long cpu_in)
{
int cpu = (int)cpu_in;
trace_rcu_prep_idle("Timer");
if (cpu != smp_processor_id())
smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
else
WARN_ON_ONCE(1); /* Getting here can hang the system... */
}
/*
* Initialize the timer used to pull CPUs out of dyntick-idle mode.
*/
static void rcu_prepare_for_idle_init(int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
rdtp->dyntick_holdoff = jiffies - 1;
setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
rdtp->idle_gp_timer_expires = jiffies - 1;
rdtp->idle_first_pass = 1;
}
/*
* Clean up for exit from idle. Because we are exiting from idle, there
* is no longer any point to ->idle_gp_timer, so cancel it. This will
* do nothing if this timer is not active, so just cancel it unconditionally.
*/
static void rcu_cleanup_after_idle(int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
del_timer(&rdtp->idle_gp_timer);
trace_rcu_prep_idle("Cleanup after idle");
rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
}
/*
* Check to see if any RCU-related work can be done by the current CPU,
* and if so, schedule a softirq to get it done. This function is part
* of the RCU implementation; it is -not- an exported member of the RCU API.
*
* The idea is for the current CPU to clear out all work required by the
* RCU core for the current grace period, so that this CPU can be permitted
* to enter dyntick-idle mode. In some cases, it will need to be awakened
* at the end of the grace period by whatever CPU ends the grace period.
* This allows CPUs to go dyntick-idle more quickly, and to reduce the
* number of wakeups by a modest integer factor.
*
* Because it is not legal to invoke rcu_process_callbacks() with irqs
* disabled, we do one pass of force_quiescent_state(), then do an
* invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
* later. The ->dyntick_drain field controls the sequencing.
* Prepare a CPU for idle from an RCU perspective. The first major task
* is to sense whether nohz mode has been enabled or disabled via sysfs.
* The second major task is to check to see if a non-lazy callback has
* arrived at a CPU that previously had only lazy callbacks. The third
* major task is to accelerate (that is, assign grace-period numbers to)
* any recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
static void rcu_prepare_for_idle(int cpu)
{
struct timer_list *tp;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct rcu_node *rnp;
struct rcu_state *rsp;
int tne;
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_enabled);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(cpu))
if (rcu_cpu_has_callbacks(cpu, NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1790,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu)
if (!tne)
return;
/* Adaptive-tick mode, where usermode execution is idle to RCU. */
if (!is_idle_task(current)) {
rdtp->dyntick_holdoff = jiffies - 1;
if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
trace_rcu_prep_idle("User dyntick with callbacks");
rdtp->idle_gp_timer_expires =
round_up(jiffies + RCU_IDLE_GP_DELAY,
RCU_IDLE_GP_DELAY);
} else if (rcu_cpu_has_callbacks(cpu)) {
rdtp->idle_gp_timer_expires =
round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
trace_rcu_prep_idle("User dyntick with lazy callbacks");
} else {
return;
}
tp = &rdtp->idle_gp_timer;
mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
/* If this is a no-CBs CPU, no callbacks, just return. */
if (is_nocb_cpu(cpu))
return;
}
/*
* If this is an idle re-entry, for example, due to use of
* RCU_NONIDLE() or the new idle-loop tracing API within the idle
* loop, then don't take any state-machine actions, unless the
* momentary exit from idle queued additional non-lazy callbacks.
* Instead, repost the ->idle_gp_timer if this CPU has callbacks
* pending.
* If a non-lazy callback arrived at a CPU having only lazy
* callbacks, invoke RCU core for the side-effect of recalculating
* idle duration on re-entry to idle.
*/
if (!rdtp->idle_first_pass &&
(rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
if (rcu_cpu_has_callbacks(cpu)) {
tp = &rdtp->idle_gp_timer;
mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
}
return;
}
rdtp->idle_first_pass = 0;
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
/*
* If there are no callbacks on this CPU, enter dyntick-idle mode.
* Also reset state to avoid prejudicing later attempts.
*/
if (!rcu_cpu_has_callbacks(cpu)) {
rdtp->dyntick_holdoff = jiffies - 1;
rdtp->dyntick_drain = 0;
trace_rcu_prep_idle("No callbacks");
return;
}
/*
* If in holdoff mode, just return. We will presumably have
* refrained from disabling the scheduling-clock tick.
*/
if (rdtp->dyntick_holdoff == jiffies) {
trace_rcu_prep_idle("In holdoff");
return;
}
/* Check and update the ->dyntick_drain sequencing. */
if (rdtp->dyntick_drain <= 0) {
/* First time through, initialize the counter. */
rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
} else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
!rcu_pending(cpu) &&
!local_softirq_pending()) {
/* Can we go dyntick-idle despite still having callbacks? */
rdtp->dyntick_drain = 0;
rdtp->dyntick_holdoff = jiffies;
if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
trace_rcu_prep_idle("Dyntick with callbacks");
rdtp->idle_gp_timer_expires =
round_up(jiffies + RCU_IDLE_GP_DELAY,
RCU_IDLE_GP_DELAY);
} else {
rdtp->idle_gp_timer_expires =
round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
trace_rcu_prep_idle("Dyntick with lazy callbacks");
}
tp = &rdtp->idle_gp_timer;
mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
return; /* Nothing more to do immediately. */
} else if (--(rdtp->dyntick_drain) <= 0) {
/* We have hit the limit, so time to give up. */
rdtp->dyntick_holdoff = jiffies;
trace_rcu_prep_idle("Begin holdoff");
invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
return;
}
/*
* Do one step of pushing the remaining RCU callbacks through
* the RCU core state machine.
*/
#ifdef CONFIG_TREE_PREEMPT_RCU
if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
rcu_preempt_qs(cpu);
force_quiescent_state(&rcu_preempt_state);
}
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
if (per_cpu(rcu_sched_data, cpu).nxtlist) {
rcu_sched_qs(cpu);
force_quiescent_state(&rcu_sched_state);
}
if (per_cpu(rcu_bh_data, cpu).nxtlist) {
rcu_bh_qs(cpu);
force_quiescent_state(&rcu_bh_state);
}
/*
* If RCU callbacks are still pending, RCU still needs this CPU.
* So try forcing the callbacks through the grace period.
*/
if (rcu_cpu_has_callbacks(cpu)) {
trace_rcu_prep_idle("More callbacks");
if (rdtp->all_lazy &&
rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
invoke_rcu_core();
} else {
trace_rcu_prep_idle("Callbacks drained");
return;
}
/*
* If we have not yet accelerated this jiffy, accelerate all
* callbacks on this CPU.
*/
if (rdtp->last_accelerate == jiffies)
return;
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = per_cpu_ptr(rsp->rda, cpu);
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
}
/*
* Clean up for exit from idle. Attempt to advance callbacks based on
* any grace periods that elapsed while the CPU was idle, and if any
* callbacks are now ready to invoke, initiate invocation.
*/
static void rcu_cleanup_after_idle(int cpu)
{
struct rcu_data *rdp;
struct rcu_state *rsp;
if (is_nocb_cpu(cpu))
return;
rcu_try_advance_all_cbs();
for_each_rcu_flavor(rsp) {
rdp = per_cpu_ptr(rsp->rda, cpu);
if (cpu_has_callbacks_ready_to_invoke(rdp))
invoke_rcu_core();
}
}
@@ -2016,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier);
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct timer_list *tltp = &rdtp->idle_gp_timer;
char c;
unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
if (timer_pending(tltp))
sprintf(cp, "drain=%d %c timer=%lu",
rdtp->dyntick_drain, c, tltp->expires - jiffies);
else
sprintf(cp, "drain=%d %c timer not pending",
rdtp->dyntick_drain, c);
sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
ulong2long(nlpd),
rdtp->all_lazy ? 'L' : '.',
rdtp->tick_nohz_enabled_snap ? '.' : 'D');
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2071,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
fast_no_hz);
}
@@ -2088,6 +1932,7 @@ static void print_cpu_stall_info_end(void)
static void zero_cpu_stall_ticks(struct rcu_data *rdp)
{
rdp->ticks_this_gp = 0;
rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
}
/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2166,6 +2011,47 @@ static int __init parse_rcu_nocb_poll(char *arg)
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
* Do any no-CBs CPUs need another grace period?
*
* Interrupts must be disabled. If the caller does not hold the root
* rcu_node structure's ->lock, the results are advisory only.
*/
static int rcu_nocb_needs_gp(struct rcu_state *rsp)
{
struct rcu_node *rnp = rcu_get_root(rsp);
return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
}
/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
}
/*
* Set the root rcu_node structure's ->need_future_gp field
* based on the sum of those of all rcu_node structures. This does
* double-count the root rcu_node structure's requests, but this
* is necessary to handle the possibility of a rcu_nocb_kthread()
* having awakened during the time that the rcu_node structures
* were being updated for the end of the previous grace period.
*/
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
}
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
init_waitqueue_head(&rnp->nocb_gp_wq[0]);
init_waitqueue_head(&rnp->nocb_gp_wq[1]);
}
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
@@ -2228,6 +2114,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
if (!rcu_is_nocb_cpu(rdp->cpu))
return 0;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
trace_rcu_kfree_callback(rdp->rsp->name, rhp,
(unsigned long)rhp->func,
rdp->qlen_lazy, rdp->qlen);
else
trace_rcu_callback(rdp->rsp->name, rhp,
rdp->qlen_lazy, rdp->qlen);
return 1;
}
@@ -2266,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
}
/*
* There must be at least one non-no-CBs CPU in operation at any given
* time, because no-CBs CPUs are not capable of initiating grace periods
* independently. This function therefore complains if the specified
* CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
* avoid offlining the last such CPU. (Recursion is a wonderful thing,
* but you have to have a base case!)
* If necessary, kick off a new grace period, and either way wait
* for a subsequent grace period to complete.
*/
static bool nocb_cpu_expendable(int cpu)
static void rcu_nocb_wait_gp(struct rcu_data *rdp)
{
cpumask_var_t non_nocb_cpus;
int ret;
unsigned long c;
bool d;
unsigned long flags;
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
c = rcu_start_future_gp(rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
* If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
* then offlining this CPU is harmless. Let it happen.
* Wait for the grace period. Do so interruptibly to avoid messing
* up the load average.
*/
if (!have_rcu_nocb_mask || rcu_is_nocb_cpu(cpu))
return 1;
/* If no memory, play it safe and keep the CPU around. */
if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
return 0;
cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
cpumask_clear_cpu(cpu, non_nocb_cpus);
ret = !cpumask_empty(non_nocb_cpus);
free_cpumask_var(non_nocb_cpus);
return ret;
}
/*
* Helper structure for remote registry of RCU callbacks.
* This is needed for when a no-CBs CPU needs to start a grace period.
* If it just invokes call_rcu(), the resulting callback will be queued,
* which can result in deadlock.
*/
struct rcu_head_remote {
struct rcu_head *rhp;
call_rcu_func_t *crf;
void (*func)(struct rcu_head *rhp);
};
/*
* Register a callback as specified by the rcu_head_remote struct.
* This function is intended to be invoked via smp_call_function_single().
*/
static void call_rcu_local(void *arg)
{
struct rcu_head_remote *rhrp =
container_of(arg, struct rcu_head_remote, rhp);
rhrp->crf(rhrp->rhp, rhrp->func);
}
/*
* Set up an rcu_head_remote structure and then invoke call_rcu_local()
* on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
* smp_call_function_single().
*/
static void invoke_crf_remote(struct rcu_head *rhp,
void (*func)(struct rcu_head *rhp),
call_rcu_func_t crf)
{
struct rcu_head_remote rhr;
rhr.rhp = rhp;
rhr.crf = crf;
rhr.func = func;
smp_call_function_single(0, call_rcu_local, &rhr, 1);
}
/*
* Helper functions to be passed to wait_rcu_gp(), each of which
* invokes invoke_crf_remote() to register a callback appropriately.
*/
static void __maybe_unused
call_rcu_preempt_remote(struct rcu_head *rhp,
void (*func)(struct rcu_head *rhp))
{
invoke_crf_remote(rhp, func, call_rcu);
}
static void call_rcu_bh_remote(struct rcu_head *rhp,
void (*func)(struct rcu_head *rhp))
{
invoke_crf_remote(rhp, func, call_rcu_bh);
}
static void call_rcu_sched_remote(struct rcu_head *rhp,
void (*func)(struct rcu_head *rhp))
{
invoke_crf_remote(rhp, func, call_rcu_sched);
trace_rcu_future_gp(rnp, rdp, c, "StartWait");
for (;;) {
wait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
if (likely(d))
break;
flush_signals(current);
trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
}
trace_rcu_future_gp(rnp, rdp, c, "EndWait");
smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
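The wait above follows a common kthread idiom; a generic hedged sketch (names hypothetical) of waiting interruptibly while discarding stray signals so the sleep does not count toward the load average:
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/atomic.h>
/* Sketch: wait on *wq until *cond becomes nonzero, ignoring signals. */
static void example_kthread_wait(wait_queue_head_t *wq, atomic_t *cond)
{
	for (;;) {
		if (!wait_event_interruptible(*wq, atomic_read(cond)))
			break;			/* condition satisfied */
		flush_signals(current);		/* spurious signal: retry */
	}
}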
/*
@@ -2391,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg)
cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
ACCESS_ONCE(rdp->nocb_p_count) += c;
ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
wait_rcu_gp(rdp->rsp->call_remote);
rcu_nocb_wait_gp(rdp);
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2437,33 +2271,42 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
return;
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
t = kthread_run(rcu_nocb_kthread, rdp,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
ACCESS_ONCE(rdp->nocb_kthread) = t;
}
}
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
static void init_nocb_callback_list(struct rcu_data *rdp)
static bool init_nocb_callback_list(struct rcu_data *rdp)
{
if (rcu_nocb_mask == NULL ||
!cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
return;
return false;
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
}
/* Initialize the ->call_remote fields in the rcu_state structures. */
static void __init rcu_init_nocb(void)
{
#ifdef CONFIG_PREEMPT_RCU
rcu_preempt_state.call_remote = call_rcu_preempt_remote;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
rcu_bh_state.call_remote = call_rcu_bh_remote;
rcu_sched_state.call_remote = call_rcu_sched_remote;
return true;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static int rcu_nocb_needs_gp(struct rcu_state *rsp)
{
return 0;
}
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
}
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy)
{
@@ -2476,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
return 0;
}
static bool nocb_cpu_expendable(int cpu)
{
return 1;
}
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
@@ -2489,12 +2327,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
}
static void init_nocb_callback_list(struct rcu_data *rdp)
{
}
static void __init rcu_init_nocb(void)
static bool init_nocb_callback_list(struct rcu_data *rdp)
{
return false;
}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */

View File

@@ -46,8 +46,6 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
#define ulong2long(a) (*(long *)(&(a)))
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{

View File

@@ -21,6 +21,7 @@
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <asm/io.h>
@@ -50,6 +51,14 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
/*
* For memory hotplug, there is no way to free resource entries allocated
* by boot memory after the system is up, so to reuse such an entry
* we have to remember it on the free list below.
*/
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
static void free_resource(struct resource *res)
{
if (!res)
return;
if (!PageSlab(virt_to_head_page(res))) {
spin_lock(&bootmem_resource_lock);
res->sibling = bootmem_resource_free;
bootmem_resource_free = res;
spin_unlock(&bootmem_resource_lock);
} else {
kfree(res);
}
}
static struct resource *alloc_resource(gfp_t flags)
{
struct resource *res = NULL;
spin_lock(&bootmem_resource_lock);
if (bootmem_resource_free) {
res = bootmem_resource_free;
bootmem_resource_free = res->sibling;
}
spin_unlock(&bootmem_resource_lock);
if (res)
memset(res, 0, sizeof(struct resource));
else
res = kzalloc(sizeof(struct resource), flags);
return res;
}
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
write_unlock(&resource_lock);
}
/**
* adjust_resource - modify a resource's start and size
* @res: resource to modify
* @start: new start value
* @size: new size
*
* Given an existing resource, change its start and size to match the
* arguments. Returns 0 on success, -EBUSY if it can't fit.
* Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
static int __adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
{
struct resource *tmp, *parent = res->parent;
resource_size_t end = start + size - 1;
int result = -EBUSY;
write_lock(&resource_lock);
if (!parent)
goto skip;
@@ -751,6 +783,26 @@ skip:
result = 0;
out:
return result;
}
/**
* adjust_resource - modify a resource's start and size
* @res: resource to modify
* @start: new start value
* @size: new size
*
* Given an existing resource, change its start and size to match the
* arguments. Returns 0 on success, -EBUSY if it can't fit.
* Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
{
int result;
write_lock(&resource_lock);
result = __adjust_resource(res, start, size);
write_unlock(&resource_lock);
return result;
}
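For illustration only, not part of this diff: with the split above, __adjust_resource() does the work while adjust_resource() merely takes resource_lock around it. A minimal, hypothetical caller follows; the helper name, the resource pointer and the one-page growth are assumptions.
/* Hypothetical: grow an existing region in place by one page. */
static int example_grow_region(struct resource *res)
{
	resource_size_t new_size = resource_size(res) + PAGE_SIZE;

	/* Same start; returns -EBUSY if the parent or siblings cannot fit it. */
	return adjust_resource(res, res->start, new_size);
}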
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
/* conflict covered whole area */
if (conflict->start <= res->start &&
conflict->end >= res->end) {
kfree(res);
free_resource(res);
WARN_ON(next_res);
break;
}
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
end = res->end;
res->end = conflict->start - 1;
if (conflict->end < end) {
next_res = kzalloc(sizeof(*next_res),
GFP_ATOMIC);
next_res = alloc_resource(GFP_ATOMIC);
if (!next_res) {
kfree(res);
free_resource(res);
break;
}
next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
struct resource *res = alloc_resource(GFP_KERNEL);
if (!res)
return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
kfree(res);
free_resource(res);
res = NULL;
break;
}
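For illustration only, not part of this diff: the switch from kfree() to free_resource() is internal to the resource tree, so drivers keep using the usual request_region()/release_region() wrappers unchanged. A minimal, hypothetical example with a made-up port range and name:
static int example_probe(void)
{
	if (!request_region(0x3f8, 8, "example_uart"))
		return -EBUSY;		/* range already claimed */
	/* ... talk to the hardware ... */
	release_region(0x3f8, 8);
	return 0;
}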
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
return -EBUSY;
release_resource(res);
kfree(res);
free_resource(res);
return 0;
}
EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
if (res->flags & IORESOURCE_MUXED)
wake_up(&muxed_resource_wait);
kfree(res);
free_resource(res);
return;
}
p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
}
EXPORT_SYMBOL(__release_region);
#ifdef CONFIG_MEMORY_HOTREMOVE
/**
* release_mem_region_adjustable - release a previously reserved memory region
* @parent: parent resource descriptor
* @start: resource start address
* @size: resource region size
*
* This interface is intended for memory hot-delete. The requested region
* is released from a currently busy memory resource. The requested region
* must either match exactly or fit into a single busy resource entry. In
* the latter case, the remaining resource is adjusted accordingly.
* Existing children of the busy memory resource must be immutable in the
* request.
*
* Note:
* - Additional release conditions, such as overlapping region, can be
* supported after they are confirmed as valid cases.
* - When a busy memory resource gets split into two entries, the code
* assumes that all children remain in the lower address entry for
* simplicity. Enhance this logic when necessary.
*/
int release_mem_region_adjustable(struct resource *parent,
resource_size_t start, resource_size_t size)
{
struct resource **p;
struct resource *res;
struct resource *new_res;
resource_size_t end;
int ret = -EINVAL;
end = start + size - 1;
if ((start < parent->start) || (end > parent->end))
return ret;
/* The alloc_resource() result gets checked later */
new_res = alloc_resource(GFP_KERNEL);
p = &parent->child;
write_lock(&resource_lock);
while ((res = *p)) {
if (res->start >= end)
break;
/* look for the next resource if the requested range does not fit into this one */
if (res->start > start || res->end < end) {
p = &res->sibling;
continue;
}
if (!(res->flags & IORESOURCE_MEM))
break;
if (!(res->flags & IORESOURCE_BUSY)) {
p = &res->child;
continue;
}
/* found the target resource; let's adjust accordingly */
if (res->start == start && res->end == end) {
/* free the whole entry */
*p = res->sibling;
free_resource(res);
ret = 0;
} else if (res->start == start && res->end != end) {
/* adjust the start */
ret = __adjust_resource(res, end + 1,
res->end - end);
} else if (res->start != start && res->end == end) {
/* adjust the end */
ret = __adjust_resource(res, res->start,
start - res->start);
} else {
/* split into two entries */
if (!new_res) {
ret = -ENOMEM;
break;
}
new_res->name = res->name;
new_res->start = end + 1;
new_res->end = res->end;
new_res->flags = res->flags;
new_res->parent = res->parent;
new_res->sibling = res->sibling;
new_res->child = NULL;
ret = __adjust_resource(res, res->start,
start - res->start);
if (ret)
break;
res->sibling = new_res;
new_res = NULL;
}
break;
}
write_unlock(&resource_lock);
free_resource(new_res);
return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
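For illustration only, not part of this diff: a minimal sketch of how a memory hot-remove path might call the new interface, assuming iomem_resource as the parent and a physical range supplied by the caller; the helper name and the warning text are assumptions.
#ifdef CONFIG_MEMORY_HOTREMOVE
/* Hypothetical caller: release a hot-removed RAM range back to the tree. */
static int example_release_ram(resource_size_t start, resource_size_t size)
{
	int ret;

	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret)
		pr_warn("failed to release RAM resource %pa+%pa: %d\n",
			&start, &size, ret);
	return ret;
}
#endif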
/*
* Managed region resource
*/

View File

@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/stat.h>
#include "rtmutex.h"
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
return curr - buf;
}
static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
static struct bus_type rttest_subsys = {
.name = "rttest",

View File

@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o

View File

@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
#if BITS_PER_LONG != 64
again:
/*
* Careful here: the local and the remote clock values need to
* be read out atomically, as we need to compare the values and
* then update either the local or the remote side. So the
* cmpxchg64 below only protects one readout.
*
* We must reread via sched_clock_local() in the retry case on
* 32-bit, as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of
* the low 32-bit and the high 32-bit portion.
*/
this_clock = sched_clock_local(my_scd);
/*
* We must enforce an atomic readout on 32-bit, otherwise the
* update on the remote cpu can hit in between the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
/*
* On 64bit the read of [my]scd->clock is atomic versus the
* update, so we can avoid the above 32bit dance.
*/
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
#endif
/*
* Use the opportunity that we have both locks

View File

@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
* the target CPU.
*/
#ifdef CONFIG_SMP
#ifndef tsk_is_polling
#define tsk_is_polling(t) 0
#endif
void resched_task(struct task_struct *p)
{
int cpu;
@@ -1536,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
BUG_ON(rq != this_rq());
BUG_ON(p == current);
if (WARN_ON_ONCE(rq != this_rq()) ||
WARN_ON_ONCE(p == current))
return;
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -3037,51 +3034,6 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
if (lock->owner != owner)
return false;
/*
* Ensure we emit the owner->on_cpu, dereference _after_ checking
* lock->owner still matches owner, if that fails, owner might
* point to free()d memory, if it still matches, the rcu_read_lock()
* ensures the memory stays valid.
*/
barrier();
return owner->on_cpu;
}
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
if (!sched_feat(OWNER_SPIN))
return 0;
rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
break;
arch_mutex_cpu_relax();
}
rcu_read_unlock();
/*
* We break out the loop above on need_resched() and when the
* owner changed, which is a sign for heavy contention. Return
* success only when lock->owner is NULL.
*/
return lock->owner == NULL;
}
#endif
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -4170,6 +4122,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
get_task_struct(p);
rcu_read_unlock();
if (p->flags & PF_NO_SETAFFINITY) {
retval = -EINVAL;
goto out_put_task;
}
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
@@ -4817,11 +4773,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
ret = -EINVAL;
goto out;
}
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
@@ -5043,7 +4994,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
}
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
@@ -6292,7 +6243,7 @@ static void sched_init_numa(void)
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
*
* The sched_domains_nume_distance[] array includes the actual distance
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
*/
@@ -6913,7 +6864,7 @@ struct task_group root_task_group;
LIST_HEAD(task_groups);
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void)
{
@@ -6950,7 +6901,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
per_cpu(load_balance_tmpmask, i) = (void *)ptr;
per_cpu(load_balance_mask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6976,12 +6927,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_CGROUP_CPUACCT
root_cpuacct.cpustat = &kernel_cpustat;
root_cpuacct.cpuusage = alloc_percpu(u64);
/* Too early, not expected to fail */
BUG_ON(!root_cpuacct.cpuusage);
#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -8083,226 +8028,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_CGROUP_CPUACCT
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
struct cpuacct root_cpuacct;
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
{
struct cpuacct *ca;
if (!cgrp->parent)
return &root_cpuacct.css;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto out;
ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
ca->cpustat = alloc_percpu(struct kernel_cpustat);
if (!ca->cpustat)
goto out_free_cpuusage;
return &ca->css;
out_free_cpuusage:
free_percpu(ca->cpuusage);
out_free_ca:
kfree(ca);
out:
return ERR_PTR(-ENOMEM);
}
/* destroy an existing cpu accounting group */
static void cpuacct_css_free(struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
kfree(ca);
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
data = *cpuusage;
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
data = *cpuusage;
#endif
return data;
}
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
*cpuusage = val;
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
*cpuusage = val;
#endif
}
/* return total cpu usage (in nanoseconds) of a group */
static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
{
struct cpuacct *ca = cgroup_ca(cgrp);
u64 totalcpuusage = 0;
int i;
for_each_present_cpu(i)
totalcpuusage += cpuacct_cpuusage_read(ca, i);
return totalcpuusage;
}
static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
u64 reset)
{
struct cpuacct *ca = cgroup_ca(cgrp);
int err = 0;
int i;
if (reset) {
err = -EINVAL;
goto out;
}
for_each_present_cpu(i)
cpuacct_cpuusage_write(ca, i, 0);
out:
return err;
}
static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
struct seq_file *m)
{
struct cpuacct *ca = cgroup_ca(cgroup);
u64 percpu;
int i;
for_each_present_cpu(i) {
percpu = cpuacct_cpuusage_read(ca, i);
seq_printf(m, "%llu ", (unsigned long long) percpu);
}
seq_printf(m, "\n");
return 0;
}
static const char *cpuacct_stat_desc[] = {
[CPUACCT_STAT_USER] = "user",
[CPUACCT_STAT_SYSTEM] = "system",
};
static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct cpuacct *ca = cgroup_ca(cgrp);
int cpu;
s64 val = 0;
for_each_online_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
val += kcpustat->cpustat[CPUTIME_USER];
val += kcpustat->cpustat[CPUTIME_NICE];
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0;
for_each_online_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
val += kcpustat->cpustat[CPUTIME_SYSTEM];
val += kcpustat->cpustat[CPUTIME_IRQ];
val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
return 0;
}
static struct cftype files[] = {
{
.name = "usage",
.read_u64 = cpuusage_read,
.write_u64 = cpuusage_write,
},
{
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
{
.name = "stat",
.read_map = cpuacct_stats_show,
},
{ } /* terminate */
};
/*
* charge this task's execution time to its accounting group.
*
* called with rq->lock held.
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int cpu;
if (unlikely(!cpuacct_subsys.active))
return;
cpu = task_cpu(tsk);
rcu_read_lock();
ca = task_ca(tsk);
for (; ca; ca = parent_ca(ca)) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
rcu_read_unlock();
}
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
.subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
};
#endif /* CONFIG_CGROUP_CPUACCT */
void dump_cpu_task(int cpu)
{
pr_info("Task dump for CPU %d:\n", cpu);

296
kernel/sched/cpuacct.c Normal file
View File

@@ -0,0 +1,296 @@
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <linux/kernel_stat.h>
#include <linux/err.h>
#include "sched.h"
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */
CPUACCT_STAT_NSTATS,
};
/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
struct cpuacct, css);
}
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
struct cpuacct, css);
}
static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
{
return cgroup_ca(ca->css.cgroup->parent);
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
if (!ca->css.cgroup->parent)
return NULL;
return cgroup_ca(ca->css.cgroup->parent);
}
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
};
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
{
struct cpuacct *ca;
if (!cgrp->parent)
return &root_cpuacct.css;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto out;
ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
ca->cpustat = alloc_percpu(struct kernel_cpustat);
if (!ca->cpustat)
goto out_free_cpuusage;
return &ca->css;
out_free_cpuusage:
free_percpu(ca->cpuusage);
out_free_ca:
kfree(ca);
out:
return ERR_PTR(-ENOMEM);
}
/* destroy an existing cpu accounting group */
static void cpuacct_css_free(struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
kfree(ca);
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
data = *cpuusage;
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
data = *cpuusage;
#endif
return data;
}
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
*cpuusage = val;
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
*cpuusage = val;
#endif
}
/* return total cpu usage (in nanoseconds) of a group */
static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
{
struct cpuacct *ca = cgroup_ca(cgrp);
u64 totalcpuusage = 0;
int i;
for_each_present_cpu(i)
totalcpuusage += cpuacct_cpuusage_read(ca, i);
return totalcpuusage;
}
static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
u64 reset)
{
struct cpuacct *ca = cgroup_ca(cgrp);
int err = 0;
int i;
if (reset) {
err = -EINVAL;
goto out;
}
for_each_present_cpu(i)
cpuacct_cpuusage_write(ca, i, 0);
out:
return err;
}
static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
struct seq_file *m)
{
struct cpuacct *ca = cgroup_ca(cgroup);
u64 percpu;
int i;
for_each_present_cpu(i) {
percpu = cpuacct_cpuusage_read(ca, i);
seq_printf(m, "%llu ", (unsigned long long) percpu);
}
seq_printf(m, "\n");
return 0;
}
static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_USER] = "user",
[CPUACCT_STAT_SYSTEM] = "system",
};
static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct cpuacct *ca = cgroup_ca(cgrp);
int cpu;
s64 val = 0;
for_each_online_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
val += kcpustat->cpustat[CPUTIME_USER];
val += kcpustat->cpustat[CPUTIME_NICE];
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0;
for_each_online_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
val += kcpustat->cpustat[CPUTIME_SYSTEM];
val += kcpustat->cpustat[CPUTIME_IRQ];
val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
}
val = cputime64_to_clock_t(val);
cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
return 0;
}
static struct cftype files[] = {
{
.name = "usage",
.read_u64 = cpuusage_read,
.write_u64 = cpuusage_write,
},
{
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
{
.name = "stat",
.read_map = cpuacct_stats_show,
},
{ } /* terminate */
};
/*
* charge this task's execution time to its accounting group.
*
* called with rq->lock held.
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int cpu;
cpu = task_cpu(tsk);
rcu_read_lock();
ca = task_ca(tsk);
while (true) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
ca = parent_ca(ca);
if (!ca)
break;
}
rcu_read_unlock();
}
/*
* Add user/system time to cpuacct.
*
* Note: it's the caller that updates the account of the root cgroup.
*/
void cpuacct_account_field(struct task_struct *p, int index, u64 val)
{
struct kernel_cpustat *kcpustat;
struct cpuacct *ca;
rcu_read_lock();
ca = task_ca(p);
while (ca != &root_cpuacct) {
kcpustat = this_cpu_ptr(ca->cpustat);
kcpustat->cpustat[index] += val;
ca = __parent_ca(ca);
}
rcu_read_unlock();
}
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
.subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
.early_init = 1,
};

17
kernel/sched/cpuacct.h Normal file
View File

@@ -0,0 +1,17 @@
#ifdef CONFIG_CGROUP_CPUACCT
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
}
static inline void
cpuacct_account_field(struct task_struct *p, int index, u64 val)
{
}
#endif
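For illustration only, not part of this diff: a minimal sketch of how accounting code is expected to use the two hooks above (the real caller in kernel/sched/cputime.c appears further down). With CONFIG_CGROUP_CPUACCT=n both calls collapse into the empty inline stubs, so the call site needs no #ifdef. The helper name and the CPUTIME_SYSTEM index are assumptions.
static inline void example_account_system(struct task_struct *p, u64 delta)
{
	cpuacct_charge(p, delta);				/* cpuacct.usage */
	cpuacct_account_field(p, CPUTIME_SYSTEM, delta);	/* cpuacct.stat  */
}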

View File

@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
{
#ifdef CONFIG_CGROUP_CPUACCT
struct kernel_cpustat *kcpustat;
struct cpuacct *ca;
#endif
/*
* Since all updates are sure to touch the root cgroup, we
* get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
#ifdef CONFIG_CGROUP_CPUACCT
if (unlikely(!cpuacct_subsys.active))
return;
rcu_read_lock();
ca = task_ca(p);
while (ca && (ca != &root_cpuacct)) {
kcpustat = this_cpu_ptr(ca->cpustat);
kcpustat->cpustat[index] += tmp;
ca = parent_ca(ca);
}
rcu_read_unlock();
#endif
cpuacct_account_field(p, index, tmp);
}
/*
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
task_cputime(tsk, &utime, &stime);
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);

View File

@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
/*
* Update the rq's load with the elapsed running time before entering
* idle. If the last scheduled task is not a CFS task, idle_enter will
* be the only way to update the runnable statistic.
*/
void idle_enter_fair(struct rq *this_rq)
{
update_rq_runnable_avg(this_rq, 1);
}
/*
* Update the rq's load with the elapsed idle time before a task is
* scheduled. If the newly scheduled task is not a CFS task, idle_exit will
* be the only way to update the runnable statistic.
*/
void idle_exit_fair(struct rq *this_rq)
{
update_rq_runnable_avg(this_rq, 0);
}
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -3875,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int new_dst_cpu;
int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
@@ -3895,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
new_dst_cpu = cpumask_first_and(env->dst_grpmask,
tsk_cpus_allowed(p));
if (new_dst_cpu < nr_cpu_ids) {
env->flags |= LBF_SOME_PINNED;
env->new_dst_cpu = new_dst_cpu;
/* Prevent dst_cpu from being re-selected via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
env->flags |= LBF_SOME_PINNED;
env->new_dst_cpu = cpu;
break;
}
}
return 0;
}
@@ -3921,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
/*
@@ -3949,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
struct task_struct *p, *n;
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
continue;
if (!can_migrate_task(p, env))
continue;
@@ -4003,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
break;
}
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
if (!can_migrate_task(p, env))
goto next;
load = task_h_load(p);
@@ -4014,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
if (!can_migrate_task(p, env))
goto next;
move_task(p, env);
pulled++;
env->imbalance -= load;
@@ -4961,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static int need_active_balance(struct lb_env *env)
{
@@ -4992,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *balance)
{
int ld_moved, cur_ld_moved, active_balance = 0;
int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
struct cpumask *cpus = __get_cpu_var(load_balance_mask);
struct lb_env env = {
.sd = sd,
@@ -5008,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.cpus = cpus,
};
/*
* For NEWLY_IDLE load_balancing, we don't need to consider
* other cpus in our group
*/
if (idle == CPU_NEWLY_IDLE)
env.dst_grpmask = NULL;
cpumask_copy(cpus, cpu_active_mask);
max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
@@ -5035,7 +5059,6 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -5062,17 +5085,17 @@ more_balance:
double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* some other cpu did the load balance for us.
*/
if (cur_ld_moved && env.dst_cpu != smp_processor_id())
resched_cpu(env.dst_cpu);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
@@ -5092,14 +5115,17 @@ more_balance:
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
lb_iterations++ < max_lb_iterations) {
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/* Prevent dst_cpu from being re-selected via env's cpus */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
@@ -5220,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
update_rq_runnable_avg(this_rq, 1);
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
@@ -5396,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
return;
clear_bit(NOHZ_IDLE, nohz_flags(cpu));
rcu_read_lock();
for_each_domain(cpu, sd)
sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
for (; sd; sd = sd->parent)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -5411,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
return;
set_bit(NOHZ_IDLE, nohz_flags(cpu));
rcu_read_lock();
for_each_domain(cpu, sd)
sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
for (; sd; sd = sd->parent)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -5469,7 +5499,7 @@ void update_max_interval(void)
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
* Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
@@ -5507,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle.
* The LBF_SOME_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
idle = CPU_NOT_IDLE;
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
}

View File

@@ -45,13 +45,6 @@ SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
* another cpu -- assumes that when the owner is running, it will soon
* release the lock. Decreases scheduling overhead.
*/
SCHED_FEAT(OWNER_SPIN, true)
/*
* Decrement CPU power based on time not spent running tasks
*/

View File

@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
{
idle_exit_fair(rq);
}
static void post_schedule_idle(struct rq *rq)
{
idle_enter_fair(rq);
}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
#ifdef CONFIG_SMP
/* Trigger the post schedule to do an idle_enter for CFS */
rq->post_schedule = 1;
#endif
return rq->idle;
}
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.pre_schedule = pre_schedule_idle,
.post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,

View File

@@ -8,6 +8,7 @@
#include <linux/tick.h>
#include "cpupri.h"
#include "cpuacct.h"
extern __read_mostly int scheduler_running;
@@ -951,14 +952,6 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */
CPUACCT_STAT_NSTATS,
};
#define ENQUEUE_WAKEUP 1
#define ENQUEUE_HEAD 2
#ifdef CONFIG_SMP
@@ -1032,6 +1025,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
/*
* Only depends on SMP; FAIR_GROUP_SCHED may be removed when runnable_avg
* becomes useful in load balancing
*/
#if defined(CONFIG_FAIR_GROUP_SCHED)
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
#else
static inline void idle_enter_fair(struct rq *this_rq) {}
static inline void idle_exit_fair(struct rq *this_rq) {}
#endif
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)
@@ -1055,45 +1060,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
extern void update_idle_cpu_load(struct rq *this_rq);
#ifdef CONFIG_CGROUP_CPUACCT
#include <linux/cgroup.h>
/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
extern struct cgroup_subsys cpuacct_subsys;
extern struct cpuacct root_cpuacct;
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
struct cpuacct, css);
}
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
struct cpuacct, css);
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
if (!ca || !ca->css.cgroup->parent)
return NULL;
return cgroup_ca(ca->css.cgroup->parent);
}
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
#ifdef CONFIG_PARAVIRT
static inline u64 steal_ticks(u64 steal)
{
@@ -1348,7 +1314,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
enum rq_nohz_flag_bits {
NOHZ_TICK_STOPPED,
NOHZ_BALANCE_KICK,
NOHZ_IDLE,
};
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)

View File

@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
ka->sa.sa_restorer = NULL;
#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
@@ -2682,7 +2685,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
/**
* sys_rt_sigpending - examine a pending signal that has been raised
* while blocked
* @set: stores pending signals
* @uset: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
@@ -2945,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
struct siginfo info;
struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;

View File

@@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data)
continue;
}
//BUG_ON(td->cpu != smp_processor_id());
BUG_ON(td->cpu != smp_processor_id());
/* Check for state change setup */
switch (td->status) {
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
}
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
if (ht->create)
ht->create(cpu);
if (ht->create) {
/*
* Make sure that the task has actually scheduled out
* into park position, before calling the create
* callback. At least the migration thread callback
* requires that the task is off the runqueue.
*/
if (!wait_task_inactive(tsk, TASK_PARKED))
WARN_ON(1);
else
ht->create(cpu);
}
return 0;
}
@@ -209,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
if (ht->pre_unpark)
ht->pre_unpark(cpu);
kthread_unpark(tsk);
}

View File

@@ -323,18 +323,10 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
if (!force_irqthreads) {
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
if (!force_irqthreads)
__do_softirq();
#else
do_softirq();
#endif
} else {
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
else
wakeup_softirqd();
__local_bh_enable(SOFTIRQ_OFFSET);
}
}
static inline void tick_irq_exit(void)
@@ -355,15 +347,20 @@ static inline void tick_irq_exit(void)
*/
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
WARN_ON_ONCE(!irqs_disabled());
#endif
account_irq_exit_time(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
sub_preempt_count(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
tick_irq_exit();
rcu_irq_exit();
sched_preempt_enable_no_resched();
}
/*

View File

@@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
.create = cpu_stop_create,
.setup = cpu_stop_unpark,
.park = cpu_stop_park,
.unpark = cpu_stop_unpark,
.pre_unpark = cpu_stop_unpark,
.selfparking = true,
};
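For illustration only, not part of this diff: a minimal, hypothetical smp_hotplug_thread descriptor, to give context for the .unpark to .pre_unpark rename above; with the smpboot.c change earlier, a .pre_unpark hook is invoked right before kthread_unpark(). Every example_* name is an assumption.
static DEFINE_PER_CPU(struct task_struct *, example_thread);

static int example_should_run(unsigned int cpu)
{
	return 0;	/* nothing pending; keep sleeping */
}

static void example_thread_fn(unsigned int cpu)
{
	/* per-cpu work would go here */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_thread,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};
/* registered once, e.g. from an early initcall:
 *	BUG_ON(smpboot_register_percpu_thread(&example_threads));
 */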

View File

@@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
syscore_shutdown();
}
/**
@@ -370,6 +369,7 @@ void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
disable_nonboot_cpus();
syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
disable_nonboot_cpus();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
@@ -2185,9 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
static int __orderly_poweroff(void)
static int __orderly_poweroff(bool force)
{
int argc;
char **argv;
static char *envp[] = {
"HOME=/",
@@ -2196,35 +2196,19 @@ static int __orderly_poweroff(void)
};
int ret;
argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
if (argv == NULL) {
argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
if (argv) {
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
argv_free(argv);
} else {
printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
__func__, poweroff_cmd);
return -ENOMEM;
__func__, poweroff_cmd);
ret = -ENOMEM;
}
ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
NULL, NULL, NULL);
argv_free(argv);
return ret;
}
/**
* orderly_poweroff - Trigger an orderly system poweroff
* @force: force poweroff if command execution fails
*
* This may be called from any context to trigger a system shutdown.
* If the orderly shutdown fails, it will force an immediate shutdown.
*/
int orderly_poweroff(bool force)
{
int ret = __orderly_poweroff();
if (ret && force) {
printk(KERN_WARNING "Failed to start orderly shutdown: "
"forcing the issue\n");
"forcing the issue\n");
/*
* I guess this should try to kick off some daemon to sync and
* poweroff asap. Or not even bother syncing if we're doing an
@@ -2236,4 +2220,28 @@ int orderly_poweroff(bool force)
return ret;
}
static bool poweroff_force;
static void poweroff_work_func(struct work_struct *work)
{
__orderly_poweroff(poweroff_force);
}
static DECLARE_WORK(poweroff_work, poweroff_work_func);
/**
* orderly_poweroff - Trigger an orderly system poweroff
* @force: force poweroff if command execution fails
*
* This may be called from any context to trigger a system shutdown.
* If the orderly shutdown fails, it will force an immediate shutdown.
*/
int orderly_poweroff(bool force)
{
if (force) /* do not override the pending "true" */
poweroff_force = true;
schedule_work(&poweroff_work);
return 0;
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
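For illustration only, not part of this diff: the documented contract above is that orderly_poweroff() may be called from any context, and with the work deferral it now only latches the force flag and schedules poweroff_work. A hypothetical caller; the handler name and message are assumptions, and <linux/interrupt.h> and <linux/reboot.h> are assumed to be included.
/* Hypothetical critical-temperature interrupt handler. */
static irqreturn_t example_overtemp_irq(int irq, void *dev_id)
{
	pr_emerg("critical temperature reached, requesting orderly poweroff\n");
	orderly_poweroff(true);	/* falls back to a forced poweroff on failure */
	return IRQ_HANDLED;
}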

View File

@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
{
.procname = "user_reserve_kbytes",
.data = &sysctl_user_reserve_kbytes,
.maxlen = sizeof(sysctl_user_reserve_kbytes),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "admin_reserve_kbytes",
.data = &sysctl_admin_reserve_kbytes,
.maxlen = sizeof(sysctl_admin_reserve_kbytes),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{ }
};

View File

@@ -365,7 +365,7 @@ int init_test_probes(void)
target2 = kprobe_target2;
do {
rand1 = random32();
rand1 = prandom_u32();
} while (rand1 <= div_factor);
printk(KERN_INFO "Kprobe smoke test started\n");

View File

@@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
*/
int tick_check_broadcast_device(struct clock_event_device *dev)
{
if ((tick_broadcast_device.evtdev &&
if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
(tick_broadcast_device.evtdev &&
tick_broadcast_device.evtdev->rating >= dev->rating) ||
(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;

View File

@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in irqs-off critical
sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in preemption-off critical
sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
select TRACER_SNAPSHOT
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
echo 1 > /sys/kernel/debug/tracing/snapshot
cat snapshot
config TRACER_SNAPSHOT_PER_CPU_SWAP
bool "Allow snapshot to swap per CPU"
depends on TRACER_SNAPSHOT
select RING_BUFFER_ALLOW_SWAP
help
Allow doing a snapshot of a single CPU buffer instead of a
full swap (all buffers). If this is set, then the following is
allowed:
echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
After which, only the tracing buffer for CPU 2 was swapped with
the main tracing buffer, and the other CPU buffers remain the same.
When this is enabled, this adds a little more overhead to the
trace recording, as it needs to add some checks to synchronize
recording with swaps. But this does not affect the performance
of the overall system. This is enabled by default when the preempt
or irq latency tracers are enabled, as those need to swap as well
and already add the overhead (plus a lot more).
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -414,24 +440,28 @@ config PROBE_EVENTS
def_bool n
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
help
This option will modify all the calls to ftrace dynamically
(will patch them out of the binary image and replace them
with a No-Op instruction) as they are called. A table is
created to dynamically enable them again.
This option will modify all the calls to function tracing
dynamically (will patch them out of the binary image and
replace them with a No-Op instruction) on boot up. During
compile time, a table is made of all the locations that ftrace
can function trace, and this table is linked into the kernel
image. When this is enabled, functions can be individually
enabled, and the functions not enabled will not affect
performance of the system.
See the files in /sys/kernel/debug/tracing:
available_filter_functions
set_ftrace_filter
set_ftrace_notrace
This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
otherwise has native performance as long as no tracing is active.
The changes to the code are done by a kernel thread that
wakes up once a second and checks to see if any ftrace calls
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
config DYNAMIC_FTRACE_WITH_REGS
def_bool y
depends on DYNAMIC_FTRACE
@@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK
If unsure, say N.
config RING_BUFFER_STARTUP_TEST
bool "Ring buffer startup self test"
depends on RING_BUFFER
help
Run a simple self test on the ring buffer on boot up. Late in the
kernel boot sequence, the test will start and kick off
a thread per cpu. Each thread will write various size events
into the ring buffer. Another thread is created to send IPIs
to each of the threads, where the IPI handler will also write
to the ring buffer, to test/stress the nesting ability.
If any anomalies are discovered, a warning will be displayed
and all ring buffers will be disabled.
The test runs for 10 seconds. This will slow your boot time
by at least 10 more seconds.
At the end of the test, statistics and further checks are done.
It will output the stats of each per-cpu buffer: what
was written, the sizes, what was read, what was lost, and
other similar details.
If unsure, say N.
endif # FTRACE
endif # TRACING_SUPPORT

View File

@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
bool blk_tracer = blk_tracer_enabled;
if (blk_tracer) {
buffer = blk_tr->buffer;
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (blk_tracer) {
tracing_record_cmdline(current);
buffer = blk_tr->buffer;
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len,
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
struct request_queue *q,
struct request *rq)
{
struct blk_trace *bt = q->blk_trace;
/* if control ever passes through here, it's a request based driver */
if (unlikely(bt && !bt->rq_based))
bt->rq_based = true;
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
}
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
struct request_queue *q;
struct blk_trace *bt;
if (!bio->bi_bdev)
return;
q = bdev_get_queue(bio->bi_bdev);
bt = q->blk_trace;
/*
* Request based drivers will generate both rq and bio completions.
* Ignore bio ones.
*/
if (likely(!bt) || bt->rq_based)
return;
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}

View File

@@ -66,7 +66,7 @@
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
};
/* ftrace_enabled is a method to turn ftrace on or off */
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
#define PROFILES_PER_PAGE \
(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
static int ftrace_profile_bits __read_mostly;
static int ftrace_profile_enabled __read_mostly;
/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
#define FTRACE_PROFILE_HASH_BITS 10
#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
static void *
function_stat_next(void *v, int idx)
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
for (i = 0; i < pages; i++) {
for (i = 1; i < pages; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
if (!pg->next)
goto out_free;
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
free_page(tmp);
}
free_page((unsigned long)stat->pages);
stat->pages = NULL;
stat->start = NULL;
@@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)
if (!stat->hash)
return -ENOMEM;
if (!ftrace_profile_bits) {
size--;
for (; size; size >>= 1)
ftrace_profile_bits++;
}
/* Preallocate the function profiling pages */
if (ftrace_profile_pages_init(stat) < 0) {
kfree(stat->hash);
@@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
struct hlist_head *hhd;
unsigned long key;
key = hash_long(ip, ftrace_profile_bits);
key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
hhd = &stat->hash[key];
if (hlist_empty(hhd))
@@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
{
unsigned long key;
key = hash_long(rec->ip, ftrace_profile_bits);
key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
hlist_add_head_rcu(&rec->node, &stat->hash[key]);
}
@@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
loff_t
ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
{
loff_t ret;
if (file->f_mode & FMODE_READ)
ret = seq_lseek(file, offset, whence);
else
file->f_pos = ret = 1;
return ret;
}
#ifdef CONFIG_DYNAMIC_FTRACE
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1067,7 +1072,7 @@ struct ftrace_func_probe {
unsigned long flags;
unsigned long ip;
void *data;
struct rcu_head rcu;
struct list_head free_list;
};
struct ftrace_func_entry {
@@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct hlist_head *hhd;
struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
unsigned long key;
int size = src->count;
int bits = 0;
int ret;
@@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
if (bits > 0)
key = hash_long(entry->ip, bits);
else
key = 0;
remove_hash_entry(src, entry);
__add_hash_entry(new_hash, entry);
}
@@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
* routine, you can use ftrace_filter_write() for the write
* routine if @flag has FTRACE_ITER_FILTER set, or
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
* ftrace_regex_lseek() should be used as the lseek routine, and
* ftrace_filter_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
*/
int
@@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
loff_t
ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
{
loff_t ret;
if (file->f_mode & FMODE_READ)
ret = seq_lseek(file, offset, whence);
else
file->f_pos = ret = 1;
return ret;
}
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
@@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)
}
static void ftrace_free_entry_rcu(struct rcu_head *rhp)
static void ftrace_free_entry(struct ftrace_func_probe *entry)
{
struct ftrace_func_probe *entry =
container_of(rhp, struct ftrace_func_probe, rcu);
if (entry->ops->free)
entry->ops->free(&entry->data);
entry->ops->free(entry->ops, entry->ip, &entry->data);
kfree(entry);
}
int
register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
struct ftrace_func_probe *entry;
struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
int type, len, not;
unsigned long key;
int count = 0;
char *search;
int ret;
type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
@@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&ftrace_lock);
if (unlikely(ftrace_disabled))
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash) {
count = -ENOMEM;
goto out_unlock;
}
if (unlikely(ftrace_disabled)) {
count = -ENODEV;
goto out_unlock;
}
do_for_each_ftrace_rec(pg, rec) {
@@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
* for each function we find. We call the callback
* to give the caller an opportunity to do so.
*/
if (ops->callback) {
if (ops->callback(rec->ip, &entry->data) < 0) {
if (ops->init) {
if (ops->init(ops, rec->ip, &entry->data) < 0) {
/* caller does not like this func */
kfree(entry);
continue;
}
}
ret = enter_record(hash, rec, 0);
if (ret < 0) {
kfree(entry);
count = ret;
goto out_unlock;
}
entry->ops = ops;
entry->ip = rec->ip;
@@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
} while_for_each_ftrace_rec();
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
if (ret < 0)
count = ret;
__enable_ftrace_function_probe();
out_unlock:
mutex_unlock(&ftrace_lock);
free_ftrace_hash(hash);
return count;
}
@@ -3063,7 +3070,12 @@ static void
__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data, int flags)
{
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int type = MATCH_FULL;
@@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
}
mutex_lock(&ftrace_lock);
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
/* Hmm, should report this somehow */
goto out_unlock;
INIT_LIST_HEAD(&free_list);
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
@@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
}
hlist_del(&entry->node);
call_rcu(&entry->rcu, ftrace_free_entry_rcu);
rec_entry = ftrace_lookup_ip(hash, entry->ip);
/* It is possible more than one entry had this ip */
if (rec_entry)
free_hash_entry(hash, rec_entry);
hlist_del_rcu(&entry->node);
list_add(&entry->free_list, &free_list);
}
}
__disable_ftrace_function_probe();
/*
* Remove after the disable is called. Otherwise, if the last
* probe is removed, a null hash means *all enabled*.
*/
ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
synchronize_sched();
list_for_each_entry_safe(entry, p, &free_list, free_list) {
list_del(&entry->free_list);
ftrace_free_entry(entry);
}
out_unlock:
mutex_unlock(&ftrace_lock);
free_ftrace_hash(hash);
}
void
@@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
static int __init set_ftrace_notrace(char *str)
{
strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
.llseek = ftrace_regex_lseek,
.llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
.read = seq_read,
.write = ftrace_notrace_write,
.llseek = ftrace_regex_lseek,
.llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3737,7 +3775,8 @@ out:
if (fail)
return -EINVAL;
ftrace_graph_filter_enabled = 1;
ftrace_graph_filter_enabled = !!(*idx);
return 0;
}
@@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
.llseek = ftrace_filter_lseek,
.release = ftrace_graph_release,
.llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
do_for_each_ftrace_op(op, ftrace_control_list) {
if (!ftrace_function_local_disabled(op) &&
if (!(op->flags & FTRACE_OPS_FL_STUB) &&
!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
@@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {
.open = ftrace_pid_open,
.write = ftrace_pid_write,
.read = seq_read,
.llseek = seq_lseek,
.llseek = ftrace_filter_lseek,
.release = ftrace_pid_release,
};
@@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
if (ftrace_ops_list != &ftrace_list_end) {
if (ftrace_ops_list->next == &ftrace_list_end)
ftrace_trace_function = ftrace_ops_list->func;
else
ftrace_trace_function = ftrace_ops_list_func;
}
if (ftrace_ops_list != &ftrace_list_end)
update_ftrace_function();
} else {
/* stopping ftrace calls (just send to ftrace_stub) */


@@ -8,13 +8,16 @@
#include <linux/trace_clock.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h> /* for self test */
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
return ret;
}
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
bool waiters_pending;
};
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
struct list_head new_pages; /* new pages to add */
struct work_struct update_pages_work;
struct completion update_done;
struct rb_irq_work irq_work;
};
struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
struct notifier_block cpu_notify;
#endif
u64 (*clock)(void);
struct rb_irq_work irq_work;
};
struct ring_buffer_iter {
@@ -508,6 +521,118 @@ struct ring_buffer_iter {
u64 read_stamp;
};
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
* Wakes up, from irq_work context, any task that is blocked on the
* ring buffer waiters queue.
*/
static void rb_wake_up_waiters(struct irq_work *work)
{
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
}
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
/*
* Depending on what the caller is waiting for, either any
* data in any cpu buffer, or a specific buffer, put the
* caller on the appropriate wait queue.
*/
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
/*
* The events can happen in critical sections where
* checking a work queue can cause deadlocks.
* After adding a task to the queue, this flag is set
* only to notify events to try to wake up the queue
* using irq_work.
*
* We don't clear it even if the buffer is no longer
* empty. The flag only causes the next event to run
* irq_work to do the work queue wake up. The worst
* that can happen if we race with !trace_empty() is that
* an event will cause an irq_work to try to wake up
* an empty queue.
*
* There's no reason to protect this flag either, as
* the work queue and irq_work logic will do the necessary
* synchronization for the wake ups. The only thing
* that is necessary is that the wake up happens after
* a task has been queued. Spurious wake ups are OK.
*/
work->waiters_pending = true;
if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
schedule();
finish_wait(&work->waiters, &wait);
}
/**
* ring_buffer_poll_wait - poll on buffer input
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*
* Returns POLLIN | POLLRDNORM if data exists in the buffers,
* zero otherwise.
*/
int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return POLLIN | POLLRDNORM;
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
work->waiters_pending = true;
poll_wait(filp, &work->waiters, poll_table);
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return POLLIN | POLLRDNORM;
return 0;
}
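/*
 * Illustrative sketch of the reader side served by the two helpers above
 * (example_read_cpu() and process_event() are hypothetical; the consume
 * calls are the existing ring buffer API).
 */
static void example_read_cpu(struct ring_buffer *buffer, int cpu,
			     void (*process_event)(void *payload))
{
	struct ring_buffer_event *event;
	unsigned long lost;
	u64 ts;

	for (;;) {
		/* block until a writer commits to this cpu buffer */
		ring_buffer_wait(buffer, cpu);

		/* drain everything that is there before sleeping again */
		while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost)))
			process_event(ring_buffer_event_data(event));
	}
}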
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
#define RB_WARN_ON(b, cond) \
({ \
@@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
init_completion(&cpu_buffer->update_done);
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
/* need at least two pages */
if (nr_pages < 2)
nr_pages = 2;
@@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!cpu_buffer->nr_pages_to_update)
continue;
if (cpu_online(cpu))
/* The update must run on the CPU that is being updated. */
preempt_disable();
if (cpu == smp_processor_id() || !cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
/*
* Can not disable preemption for schedule_work_on()
* on PREEMPT_RT.
*/
preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
else
rb_update_pages(cpu_buffer);
preempt_disable();
}
preempt_enable();
}
/* wait for all the updates to complete */
@@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
if (cpu_online(cpu_id)) {
preempt_disable();
/* The update must run on the CPU that is being updated. */
if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
/*
* Can not disable preemption for schedule_work_on()
* on PREEMPT_RT.
*/
preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
} else
rb_update_pages(cpu_buffer);
preempt_disable();
}
preempt_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
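/*
 * Condensed, hypothetical sketch of the pattern used by both resize paths
 * above (do_update() and my_work are stand-ins): the page update must run
 * on the CPU being updated, but preemption cannot stay disabled across
 * schedule_work_on() on PREEMPT_RT.
 */
static void example_update_on_cpu(int cpu, struct work_struct *my_work,
				  void (*do_update)(void))
{
	preempt_disable();
	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
		/* already on the target CPU (or it is offline): do it here */
		do_update();
	} else {
		/* drop preemption before queueing work on the remote CPU */
		preempt_enable();
		schedule_work_on(cpu, my_work);
		/* re-disable so the unconditional enable below is balanced */
		preempt_disable();
	}
	preempt_enable();
}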
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_end_commit(cpu_buffer);
}
static __always_inline void
rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies its own memory barriers */
irq_work_queue(&buffer->irq_work.work);
}
if (cpu_buffer->irq_work.waiters_pending) {
cpu_buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies its own memory barriers */
irq_work_queue(&cpu_buffer->irq_work.work);
}
}
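/*
 * Illustrative sketch of the writer side that now ends up in rb_wakeups()
 * (write_example() is hypothetical; the reserve/commit calls are the
 * existing ring buffer API).
 */
static int write_example(struct ring_buffer *buffer, const void *payload, int len)
{
	struct ring_buffer_event *event;

	event = ring_buffer_lock_reserve(buffer, len);
	if (!event)
		return -EBUSY;	/* buffer full or recording disabled */

	memcpy(ring_buffer_event_data(event), payload, len);

	/* the commit publishes the event and kicks any pending waiters */
	return ring_buffer_unlock_commit(buffer, event);
}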
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
rb_wakeups(buffer, cpu_buffer);
trace_recursive_unlock();
preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
rb_wakeups(buffer, cpu_buffer);
ret = 0;
out:
preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
#endif
#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
/*
* This is a basic integrity check of the ring buffer.
* Late in the boot cycle this test will run when configured in.
* It will kick off a thread per CPU that will go into a loop
* writing to the per cpu ring buffer various sizes of data.
* Some of the data will be large items, some small.
*
* Another thread is created that goes into a spin, sending out
* IPIs to the other CPUs to also write into the ring buffer.
* This is to test the nesting ability of the buffer.
*
* Basic stats are recorded and reported. If something in the
* ring buffer should happen that's not expected, a big warning
* is displayed and all ring buffers are disabled.
*/
static struct task_struct *rb_threads[NR_CPUS] __initdata;
struct rb_test_data {
struct ring_buffer *buffer;
unsigned long events;
unsigned long bytes_written;
unsigned long bytes_alloc;
unsigned long bytes_dropped;
unsigned long events_nested;
unsigned long bytes_written_nested;
unsigned long bytes_alloc_nested;
unsigned long bytes_dropped_nested;
int min_size_nested;
int max_size_nested;
int max_size;
int min_size;
int cpu;
int cnt;
};
static struct rb_test_data rb_data[NR_CPUS] __initdata;
/* 1 meg per cpu */
#define RB_TEST_BUFFER_SIZE 1048576
static char rb_string[] __initdata =
"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
static bool rb_test_started __initdata;
struct rb_item {
int size;
char str[];
};
static __init int rb_write_something(struct rb_test_data *data, bool nested)
{
struct ring_buffer_event *event;
struct rb_item *item;
bool started;
int event_len;
int size;
int len;
int cnt;
/* Have nested writes different than what is written */
cnt = data->cnt + (nested ? 27 : 0);
/* Multiply cnt by ~e, to make some unique increment */
size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
len = size + sizeof(struct rb_item);
started = rb_test_started;
/* read rb_test_started before checking buffer enabled */
smp_rmb();
event = ring_buffer_lock_reserve(data->buffer, len);
if (!event) {
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
data->bytes_dropped_nested += len;
else
data->bytes_dropped += len;
}
return len;
}
event_len = ring_buffer_event_length(event);
if (RB_WARN_ON(data->buffer, event_len < len))
goto out;
item = ring_buffer_event_data(event);
item->size = size;
memcpy(item->str, rb_string, size);
if (nested) {
data->bytes_alloc_nested += event_len;
data->bytes_written_nested += len;
data->events_nested++;
if (!data->min_size_nested || len < data->min_size_nested)
data->min_size_nested = len;
if (len > data->max_size_nested)
data->max_size_nested = len;
} else {
data->bytes_alloc += event_len;
data->bytes_written += len;
data->events++;
if (!data->min_size || len < data->min_size)
data->min_size = len;
if (len > data->max_size)
data->max_size = len;
}
out:
ring_buffer_unlock_commit(data->buffer, event);
return 0;
}
static __init int rb_test(void *arg)
{
struct rb_test_data *data = arg;
while (!kthread_should_stop()) {
rb_write_something(data, false);
data->cnt++;
set_current_state(TASK_INTERRUPTIBLE);
/* Now sleep between a min of 100-300us and a max of 1ms */
usleep_range(((data->cnt % 3) + 1) * 100, 1000);
}
return 0;
}
static __init void rb_ipi(void *ignore)
{
struct rb_test_data *data;
int cpu = smp_processor_id();
data = &rb_data[cpu];
rb_write_something(data, true);
}
static __init int rb_hammer_test(void *arg)
{
while (!kthread_should_stop()) {
/* Send an IPI to all cpus to write data! */
smp_call_function(rb_ipi, NULL, 1);
/* No sleep, but for non preempt, let others run */
schedule();
}
return 0;
}
static __init int test_ringbuffer(void)
{
struct task_struct *rb_hammer;
struct ring_buffer *buffer;
int cpu;
int ret = 0;
pr_info("Running ring buffer tests...\n");
buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
if (WARN_ON(!buffer))
return 0;
/* Disable buffer so that threads can't write to it yet */
ring_buffer_record_off(buffer);
for_each_online_cpu(cpu) {
rb_data[cpu].buffer = buffer;
rb_data[cpu].cpu = cpu;
rb_data[cpu].cnt = cpu;
rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
"rbtester/%d", cpu);
if (WARN_ON(!rb_threads[cpu])) {
pr_cont("FAILED\n");
ret = -1;
goto out_free;
}
kthread_bind(rb_threads[cpu], cpu);
wake_up_process(rb_threads[cpu]);
}
/* Now create the rb hammer! */
rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
if (WARN_ON(!rb_hammer)) {
pr_cont("FAILED\n");
ret = -1;
goto out_free;
}
ring_buffer_record_on(buffer);
/*
* Show buffer is enabled before setting rb_test_started.
* Yes there's a small race window where events could be
* dropped and the thread won't catch it. But when a ring
* buffer gets enabled, there will always be some kind of
* delay before other CPUs see it. Thus, we don't care about
* those dropped events. We care about events dropped after
* the threads see that the buffer is active.
*/
smp_wmb();
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
/* Just run for 10 seconds */
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);
out_free:
for_each_online_cpu(cpu) {
if (!rb_threads[cpu])
break;
kthread_stop(rb_threads[cpu]);
}
if (ret) {
ring_buffer_free(buffer);
return ret;
}
/* Report! */
pr_info("finished\n");
for_each_online_cpu(cpu) {
struct ring_buffer_event *event;
struct rb_test_data *data = &rb_data[cpu];
struct rb_item *item;
unsigned long total_events;
unsigned long total_dropped;
unsigned long total_written;
unsigned long total_alloc;
unsigned long total_read = 0;
unsigned long total_size = 0;
unsigned long total_len = 0;
unsigned long total_lost = 0;
unsigned long lost;
int big_event_size;
int small_event_size;
ret = -1;
total_events = data->events + data->events_nested;
total_written = data->bytes_written + data->bytes_written_nested;
total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
big_event_size = data->max_size + data->max_size_nested;
small_event_size = data->min_size + data->min_size_nested;
pr_info("CPU %d:\n", cpu);
pr_info(" events: %ld\n", total_events);
pr_info(" dropped bytes: %ld\n", total_dropped);
pr_info(" alloced bytes: %ld\n", total_alloc);
pr_info(" written bytes: %ld\n", total_written);
pr_info(" biggest event: %d\n", big_event_size);
pr_info(" smallest event: %d\n", small_event_size);
if (RB_WARN_ON(buffer, total_dropped))
break;
ret = 0;
while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
total_lost += lost;
item = ring_buffer_event_data(event);
total_len += ring_buffer_event_length(event);
total_size += item->size + sizeof(struct rb_item);
if (memcmp(&item->str[0], rb_string, item->size) != 0) {
pr_info("FAILED!\n");
pr_info("buffer had: %.*s\n", item->size, item->str);
pr_info("expected: %.*s\n", item->size, rb_string);
RB_WARN_ON(buffer, 1);
ret = -1;
break;
}
total_read++;
}
if (ret)
break;
ret = -1;
pr_info(" read events: %ld\n", total_read);
pr_info(" lost events: %ld\n", total_lost);
pr_info(" total events: %ld\n", total_lost + total_read);
pr_info(" recorded len bytes: %ld\n", total_len);
pr_info(" recorded size bytes: %ld\n", total_size);
if (total_lost)
pr_info(" With dropped events, record len and size may not match\n"
" alloced and written from above\n");
if (!total_lost) {
if (RB_WARN_ON(buffer, total_len != total_alloc ||
total_size != total_written))
break;
}
if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
break;
ret = 0;
}
if (!ret)
pr_info("Ring buffer PASSED!\n");
ring_buffer_free(buffer);
return 0;
}
late_initcall(test_ringbuffer);
#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */

File diff suppressed because it is too large


@@ -13,6 +13,11 @@
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
#include <asm/syscall.h> /* some archs define it here */
#endif
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -29,6 +34,7 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
__TRACE_LAST_TYPE,
};
@@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
struct uprobe_trace_entry_head {
struct trace_entry ent;
unsigned long ip;
};
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -127,12 +128,21 @@ enum trace_flag_type {
#define TRACE_BUF_SIZE 1024
struct trace_array;
struct trace_cpu {
struct trace_array *tr;
struct dentry *dir;
int cpu;
};
/*
* The CPU trace array - it consists of thousands of trace entries
* plus some other descriptor data: (for example which task started
* the trace, etc.)
*/
struct trace_array_cpu {
struct trace_cpu trace_cpu;
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
@@ -151,20 +161,83 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
};
struct tracer;
struct trace_buffer {
struct trace_array *tr;
struct ring_buffer *buffer;
struct trace_array_cpu __percpu *data;
cycle_t time_start;
int cpu;
};
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
* They have on/off state as well:
*/
struct trace_array {
struct ring_buffer *buffer;
int cpu;
struct list_head list;
char *name;
struct trace_buffer trace_buffer;
#ifdef CONFIG_TRACER_MAX_TRACE
/*
* The max_buffer is used to snapshot the trace when a maximum
* latency is reached, or when the user initiates a snapshot.
* Some tracers will use this to store a maximum trace while
* it continues examining live traces.
*
* The buffers for the max_buffer are set up the same as the trace_buffer.
* When a snapshot is taken, the buffer of the max_buffer is swapped
* with the buffer of the trace_buffer and the buffers are reset for
* the trace_buffer so the tracing can continue.
*/
struct trace_buffer max_buffer;
bool allocated_snapshot;
#endif
int buffer_disabled;
cycle_t time_start;
struct trace_cpu trace_cpu; /* place holder */
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
#endif
int stop_count;
int clock_id;
struct tracer *current_trace;
unsigned int flags;
raw_spinlock_t start_lock;
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
struct dentry *event_dir;
struct list_head systems;
struct list_head events;
struct task_struct *waiter;
struct trace_array_cpu *data[NR_CPUS];
int ref;
};
enum {
TRACE_ARRAY_FL_GLOBAL = (1 << 0)
};
extern struct list_head ftrace_trace_arrays;
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
*/
static inline struct trace_array *top_trace_array(void)
{
struct trace_array *tr;
tr = list_entry(ftrace_trace_arrays.prev,
typeof(*tr), list);
WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
return tr;
}
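/*
 * Sketch only: with the per-cpu data now hanging off tr->trace_buffer,
 * the idiom used throughout this series to reach one CPU's data is
 * per_cpu_ptr() instead of the old tr->data[cpu] array (the helper name
 * below is hypothetical).
 */
static inline struct trace_array_cpu *
example_trace_cpu_data(struct trace_array *tr, int cpu)
{
	return per_cpu_ptr(tr->trace_buffer.data, cpu);
}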
#define FTRACE_CMP_TYPE(var, type) \
__builtin_types_compatible_p(typeof(var), type *)
@@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -283,11 +357,16 @@ struct tracer {
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
/* Return 0 if OK with change, else return non-zero */
int (*flag_changed)(struct tracer *tracer,
u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
bool print_max;
bool enabled;
#ifdef CONFIG_TRACER_MAX_TRACE
bool use_max_tr;
bool allocated_snapshot;
#endif
};
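/*
 * Minimal, hypothetical sketch of a tracer wiring up the new flag_changed
 * hook (the "example" names are made up; register_tracer() and
 * trace_keep_overwrite() are the existing interfaces used by the latency
 * tracers further below).
 */
static int example_tracer_init(struct trace_array *tr)
{
	tracing_reset_online_cpus(&tr->trace_buffer);
	return 0;
}

static void example_tracer_reset(struct trace_array *tr)
{
}

static struct tracer example_tracer __read_mostly = {
	.name		= "example",
	.init		= example_tracer_init,
	.reset		= example_tracer_reset,
	/* veto or accept runtime trace_options changes; here: keep overwrite */
	.flag_changed	= trace_keep_overwrite,
};

/* registration, normally from an __init function: */
/* register_tracer(&example_tracer); */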
@@ -423,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)
current->trace_recursion = val;
}
#define TRACE_PIPE_ALL_CPU -1
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
@@ -435,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset(struct trace_array *tr, int cpu);
void tracing_reset_online_cpus(struct trace_array *tr);
void tracing_reset(struct trace_buffer *buf, int cpu);
void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_current_online_cpus(void);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *trace_create_file(const char *name,
umode_t mode,
@@ -446,6 +523,7 @@ struct dentry *trace_create_file(const char *name,
void *data,
const struct file_operations *fops);
struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
@@ -579,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
extern int ring_buffer_expanded;
extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;
DECLARE_PER_CPU(int, ftrace_cpu_disabled);
@@ -615,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...);
int trace_array_printk_buf(struct ring_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
@@ -782,6 +862,7 @@ enum trace_iterator_flags {
TRACE_ITER_STOP_ON_FREE = 0x400000,
TRACE_ITER_IRQ_INFO = 0x800000,
TRACE_ITER_MARKERS = 0x1000000,
TRACE_ITER_FUNCTION = 0x2000000,
};
/*
@@ -828,8 +909,8 @@ enum {
struct ftrace_event_field {
struct list_head link;
char *name;
char *type;
const char *name;
const char *type;
int filter_type;
int offset;
int size;
@@ -847,12 +928,19 @@ struct event_filter {
struct event_subsystem {
struct list_head list;
const char *name;
struct dentry *entry;
struct event_filter *filter;
int nr_events;
int ref_count;
};
struct ftrace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
struct dentry *entry;
int ref_count;
int nr_events;
};
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
@@ -902,22 +990,20 @@ struct filter_pred {
unsigned short right;
};
extern struct list_head ftrace_common_fields;
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
extern int apply_event_filter(struct ftrace_event_call *call,
char *filter_string);
extern int apply_subsystem_event_filter(struct event_subsystem *system,
extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
struct list_head *
trace_get_fields(struct ftrace_event_call *event_call);
struct ftrace_event_field *
trace_find_event_field(struct ftrace_event_call *call, char *name);
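/*
 * Usage sketch for the relocated lookup helper (report_field() is
 * hypothetical): it searches the common fields first and then the
 * event's own field list.
 */
static void report_field(struct ftrace_event_call *call, char *name)
{
	struct ftrace_event_field *field;

	field = trace_find_event_field(call, name);
	if (field)
		pr_info("%s: type=%s offset=%d size=%d\n",
			field->name, field->type, field->offset, field->size);
}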
static inline int
filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -934,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
}
extern void trace_event_enable_cmd_record(bool enable);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
@@ -943,6 +1031,19 @@ extern const char *__stop___trace_bprintk_fmt[];
void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
/*
* Normal trace_printk() and friends allocate special buffers
* to do the manipulation, as well as save the print formats
* into sections to display. But the trace infrastructure wants
* to use these without the added overhead at the price of being
* a bit slower (used mainly for warnings, where we don't care
* about performance). The internal_trace_puts() is for such
* a purpose.
*/
#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
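/*
 * Usage sketch (hypothetical caller): internal_trace_puts() is meant for
 * fixed strings emitted from inside the tracing code itself, avoiding the
 * trace_printk() buffer handling.
 */
static void example_report_problem(void)
{
	internal_trace_puts("*** example: tracer stalled ***\n");
}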
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \


@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
struct ftrace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
local_irq_save(flags);
cpu = raw_smp_processor_id();
if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (atomic_inc_return(&data->disabled) != 1)
goto out;
pc = preempt_count();
buffer = tr->buffer;
buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
__buffer_unlock_commit(buffer, event);
out:
atomic_dec(&tr->data[cpu]->disabled);
atomic_dec(&data->disabled);
local_irq_restore(flags);
}


@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
return local_clock();
}
/*
* trace_clock_jiffies(): Simply use jiffies as a clock counter.
*/
u64 notrace trace_clock_jiffies(void)
{
u64 jiffy = jiffies - INITIAL_JIFFIES;
/* Return nsecs */
return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
}
/*
* trace_clock_global(): special globally coherent trace clock


@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
__dynamic_array( u32, buf )
),
F_printk("%08lx fmt:%p",
__entry->ip, __entry->fmt),
F_printk("%pf: %s",
(void *)__entry->ip, __entry->fmt),
FILTER_OTHER
);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
__dynamic_array( char, buf )
),
F_printk("%08lx %s",
__entry->ip, __entry->buf),
F_printk("%pf: %s",
(void *)__entry->ip, __entry->buf),
FILTER_OTHER
);
FTRACE_ENTRY(bputs, bputs_entry,
TRACE_BPUTS,
F_STRUCT(
__field( unsigned long, ip )
__field( const char *, str )
),
F_printk("%pf: %s",
(void *)__entry->ip, __entry->str),
FILTER_OTHER
);

File diff suppressed because it is too large


@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
mutex_unlock(&event_mutex);
}
static struct ftrace_event_field *
__find_event_field(struct list_head *head, char *name)
{
struct ftrace_event_field *field;
list_for_each_entry(field, head, link) {
if (!strcmp(field->name, name))
return field;
}
return NULL;
}
static struct ftrace_event_field *
find_event_field(struct ftrace_event_call *call, char *name)
{
struct ftrace_event_field *field;
struct list_head *head;
field = __find_event_field(&ftrace_common_fields, name);
if (field)
return field;
head = trace_get_fields(call);
return __find_event_field(head, name);
}
static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
{
stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
return NULL;
}
field = find_event_field(call, operand1);
field = trace_find_event_field(call, operand1);
if (!field) {
parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
return NULL;
@@ -1907,16 +1880,17 @@ out_unlock:
return err;
}
int apply_subsystem_event_filter(struct event_subsystem *system,
int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string)
{
struct event_subsystem *system = dir->subsystem;
struct event_filter *filter;
int err = 0;
mutex_lock(&event_mutex);
/* Make sure the system still has events */
if (!system->nr_events) {
if (!dir->nr_events) {
err = -ENODEV;
goto out_unlock;
}


@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
int \
static int __init \
ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
{ \
struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
regfn) \
\
struct ftrace_event_class event_class_ftrace_##call = { \
struct ftrace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\


@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
static int function_trace_init(struct trace_array *tr)
{
func_trace = tr;
tr->cpu = get_cpu();
tr->trace_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
static void function_trace_start(struct trace_array *tr)
{
tracing_reset_online_cpus(tr);
tracing_reset_online_cpus(&tr->trace_buffer);
}
/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
goto out;
cpu = smp_processor_id();
data = tr->data[cpu];
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (!atomic_read(&data->disabled)) {
local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
};
#ifdef CONFIG_DYNAMIC_FTRACE
static void
ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
static int update_count(void **data)
{
long *count = (long *)data;
if (tracing_is_on())
return;
unsigned long *count = (long *)data;
if (!*count)
return;
return 0;
if (*count != -1)
(*count)--;
return 1;
}
static void
ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
if (tracing_is_on())
return;
if (update_count(data))
tracing_on();
}
static void
ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
{
if (!tracing_is_on())
return;
if (update_count(data))
tracing_off();
}
static void
ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
{
if (tracing_is_on())
return;
tracing_on();
}
static void
ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
{
long *count = (long *)data;
if (!tracing_is_on())
return;
if (!*count)
return;
if (*count != -1)
(*count)--;
tracing_off();
}
static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data);
/*
* Skip 4:
* ftrace_stacktrace()
* function_trace_probe_call()
* ftrace_ops_list_func()
* ftrace_call()
*/
#define STACK_SKIP 4
static struct ftrace_probe_ops traceon_probe_ops = {
.func = ftrace_traceon,
.print = ftrace_trace_onoff_print,
};
static void
ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
{
trace_dump_stack(STACK_SKIP);
}
static struct ftrace_probe_ops traceoff_probe_ops = {
.func = ftrace_traceoff,
.print = ftrace_trace_onoff_print,
};
static void
ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
{
if (!tracing_is_on())
return;
if (update_count(data))
trace_dump_stack(STACK_SKIP);
}
static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
ftrace_probe_print(const char *name, struct seq_file *m,
unsigned long ip, void *data)
{
long count = (long)data;
seq_printf(m, "%ps:", (void *)ip);
if (ops == &traceon_probe_ops)
seq_printf(m, "traceon");
else
seq_printf(m, "traceoff");
seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
}
static int
ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
ftrace_traceon_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
struct ftrace_probe_ops *ops;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = &traceon_probe_ops;
else
ops = &traceoff_probe_ops;
unregister_ftrace_function_probe_func(glob, ops);
return 0;
return ftrace_probe_print("traceon", m, ip, data);
}
static int
ftrace_trace_onoff_callback(struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
return ftrace_probe_print("traceoff", m, ip, data);
}
static int
ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
return ftrace_probe_print("stacktrace", m, ip, data);
}
static struct ftrace_probe_ops traceon_count_probe_ops = {
.func = ftrace_traceon_count,
.print = ftrace_traceon_print,
};
static struct ftrace_probe_ops traceoff_count_probe_ops = {
.func = ftrace_traceoff_count,
.print = ftrace_traceoff_print,
};
static struct ftrace_probe_ops stacktrace_count_probe_ops = {
.func = ftrace_stacktrace_count,
.print = ftrace_stacktrace_print,
};
static struct ftrace_probe_ops traceon_probe_ops = {
.func = ftrace_traceon,
.print = ftrace_traceon_print,
};
static struct ftrace_probe_ops traceoff_probe_ops = {
.func = ftrace_traceoff,
.print = ftrace_traceoff_print,
};
static struct ftrace_probe_ops stacktrace_probe_ops = {
.func = ftrace_stacktrace,
.print = ftrace_stacktrace_print,
};
static int
ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
struct ftrace_hash *hash, char *glob,
char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
void *count = (void *)-1;
char *number;
int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
if (!enable)
return -EINVAL;
if (glob[0] == '!')
return ftrace_trace_onoff_unreg(glob+1, cmd, param);
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = &traceon_probe_ops;
else
ops = &traceoff_probe_ops;
if (glob[0] == '!') {
unregister_ftrace_function_probe_func(glob+1, ops);
return 0;
}
if (!param)
goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
return ret < 0 ? ret : 0;
}
static int
ftrace_trace_onoff_callback(struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
else
ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
return ftrace_trace_probe_callback(ops, hash, glob, cmd,
param, enable);
}
static int
ftrace_stacktrace_callback(struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
return ftrace_trace_probe_callback(ops, hash, glob, cmd,
param, enable);
}
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
.func = ftrace_trace_onoff_callback,
};
static struct ftrace_func_command ftrace_stacktrace_cmd = {
.name = "stacktrace",
.func = ftrace_stacktrace_callback,
};
static int __init init_func_cmd_traceon(void)
{
int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
unregister_ftrace_command(&ftrace_traceoff_cmd);
ret = register_ftrace_command(&ftrace_stacktrace_cmd);
if (ret) {
unregister_ftrace_command(&ftrace_traceoff_cmd);
unregister_ftrace_command(&ftrace_traceon_cmd);
}
return ret;
}
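/*
 * Hypothetical sketch of the registration pattern above ("mycmd" and
 * my_cmd_callback are made up; struct ftrace_func_command and
 * register_ftrace_command() are the existing interfaces).  Once
 * registered, the command is reachable from the set_ftrace_filter file,
 * e.g. "echo 'function_name:mycmd[:count]' > set_ftrace_filter".
 */
static int
my_cmd_callback(struct ftrace_hash *hash,
		char *glob, char *cmd, char *param, int enable)
{
	/* parse param, pick a probe_ops, then hand off as the callbacks above do */
	return 0;
}

static struct ftrace_func_command my_cmd = {
	.name	= "mycmd",
	.func	= my_cmd_callback,
};

/* from an __init function: register_ftrace_command(&my_cmd); */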
#else


@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
{
struct ftrace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->buffer;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
{
struct ftrace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->buffer;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
* We need to consume the current entry to see
* the next one.
*/
ring_buffer_consume(iter->tr->buffer, iter->cpu,
ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
NULL, NULL);
event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
NULL, NULL);
}


@@ -32,7 +32,8 @@ enum {
static int trace_type __read_mostly;
static int save_lat_flag;
static int save_flags;
static bool function_enabled;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
if (!irqs_disabled_flags(*flags))
return 0;
*data = tr->data[cpu];
*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
per_cpu(tracing_cpu, cpu) = 0;
tracing_max_latency = 0;
tracing_reset_online_cpus(irqsoff_trace);
tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
return start_irqsoff_tracer(irqsoff_trace, set);
}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
if (per_cpu(tracing_cpu, cpu))
return;
data = tr->data[cpu];
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) || atomic_read(&data->disabled))
return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
if (!tracer_enabled)
return;
data = tr->data[cpu];
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
static int start_irqsoff_tracer(struct trace_array *tr, int graph)
static int register_irqsoff_function(int graph, int set)
{
int ret = 0;
int ret;
if (!graph)
ret = register_ftrace_function(&trace_ops);
else
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
ret = register_ftrace_graph(&irqsoff_graph_return,
&irqsoff_graph_entry);
else
ret = register_ftrace_function(&trace_ops);
if (!ret)
function_enabled = true;
return ret;
}
static void unregister_irqsoff_function(int graph)
{
if (!function_enabled)
return;
if (graph)
unregister_ftrace_graph();
else
unregister_ftrace_function(&trace_ops);
function_enabled = false;
}
static void irqsoff_function_set(int set)
{
if (set)
register_irqsoff_function(is_graph(), 1);
else
unregister_irqsoff_function(is_graph());
}
static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
{
if (mask & TRACE_ITER_FUNCTION)
irqsoff_function_set(set);
return trace_keep_overwrite(tracer, mask, set);
}
static int start_irqsoff_tracer(struct trace_array *tr, int graph)
{
int ret;
ret = register_irqsoff_function(graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
if (!graph)
unregister_ftrace_function(&trace_ops);
else
unregister_ftrace_graph();
unregister_irqsoff_function(graph);
}
static void __irqsoff_tracer_init(struct trace_array *tr)
{
save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
trace_flags |= TRACE_ITER_LATENCY_FMT;
save_flags = trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
tracing_reset_online_cpus(tr);
tracing_reset_online_cpus(&tr->trace_buffer);
if (start_irqsoff_tracer(tr, is_graph()))
printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
static void irqsoff_tracer_reset(struct trace_array *tr)
{
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
stop_irqsoff_tracer(tr, is_graph());
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
@@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
@@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif


@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
atomic_inc(&iter.tr->data[cpu]->disabled);
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
iter.iter_flags |= TRACE_FILE_LAT_FMT;
iter.pos = -1;
if (cpu_file == TRACE_PIPE_ALL_CPU) {
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
ring_buffer_read_prepare(iter.tr->buffer, cpu);
ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
@@ -83,7 +83,7 @@ out:
trace_flags = old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&iter.tr->data[cpu]->disabled);
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
!cpu_online(cpu_file))
return KDB_BADINT;
} else {
cpu_file = TRACE_PIPE_ALL_CPU;
cpu_file = RING_BUFFER_ALL_CPUS;
}
kdb_trap_printk++;


@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
overrun_detected = false;
prev_overruns = 0;
tracing_reset_online_cpus(tr);
tracing_reset_online_cpus(&tr->trace_buffer);
}
static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
unsigned long cnt = atomic_xchg(&dropped_count, 0);
unsigned long over = ring_buffer_overruns(iter->tr->buffer);
unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
if (over > prev_overruns)
cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct mmiotrace_rw *rw)
{
struct ftrace_event_call *call = &event_mmiotrace_rw;
struct ring_buffer *buffer = tr->buffer;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
struct trace_array_cpu *data = tr->data[smp_processor_id()];
struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
__trace_mmiotrace_rw(tr, data, rw);
}
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct mmiotrace_map *map)
{
struct ftrace_event_call *call = &event_mmiotrace_map;
struct ring_buffer *buffer = tr->buffer;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
struct trace_array_cpu *data;
preempt_disable();
data = tr->data[smp_processor_id()];
data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
__trace_mmiotrace_map(tr, data, map);
preempt_enable();
}


@@ -14,7 +14,7 @@
/* must be a power of 2 */
#define EVENT_HASHSIZE 128
DECLARE_RWSEM(trace_event_mutex);
DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
return ret;
}
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bputs_entry *field;
int ret;
trace_assign_type(field, entry);
ret = trace_seq_puts(s, field->str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
}
EXPORT_SYMBOL(ftrace_print_hex_seq);
int ftrace_raw_output_prep(struct trace_iterator *iter,
struct trace_event *trace_event)
{
struct ftrace_event_call *event;
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
int ret;
event = container_of(trace_event, struct ftrace_event_call, event);
entry = iter->ent;
if (entry->type != event->event.type) {
WARN_ON_ONCE(1);
return TRACE_TYPE_UNHANDLED;
}
trace_seq_init(p);
ret = trace_seq_printf(s, "%s: ", event->name);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return 0;
}
EXPORT_SYMBOL(ftrace_raw_output_prep);
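/*
 * Usage sketch for the new helper (my_event_output and the "myfield"
 * format are hypothetical): a trace_event .trace callback calls
 * ftrace_raw_output_prep() to check the entry type and print the
 * "<event>: " prefix before emitting its own fields.
 */
static enum print_line_t
my_event_output(struct trace_iterator *iter, int flags,
		struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	int ret;

	ret = ftrace_raw_output_prep(iter, event);
	if (ret)
		return ret;

	if (!trace_seq_printf(s, "myfield=%d\n", 0))
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}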
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
unsigned long long abs_ts = iter->ts - iter->tr->time_start;
unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
struct trace_seq *s = &iter->seq;
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
void trace_event_read_lock(void)
{
down_read(&trace_event_mutex);
down_read(&trace_event_sem);
}
void trace_event_read_unlock(void)
{
up_read(&trace_event_mutex);
up_read(&trace_event_sem);
}
/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
unsigned key;
int ret = 0;
down_write(&trace_event_mutex);
down_write(&trace_event_sem);
if (WARN_ON(!event))
goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
ret = event->type;
out:
up_write(&trace_event_mutex);
up_write(&trace_event_sem);
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_event);
/*
* Used by module code with the trace_event_mutex held for write.
* Used by module code with the trace_event_sem held for write.
*/
int __unregister_ftrace_event(struct trace_event *event)
{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
*/
int unregister_ftrace_event(struct trace_event *event)
{
down_write(&trace_event_mutex);
down_write(&trace_event_sem);
__unregister_ftrace_event(event);
up_write(&trace_event_mutex);
up_write(&trace_event_sem);
return 0;
}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
.funcs = &trace_user_stack_funcs,
};
/* TRACE_BPUTS */
static enum print_line_t
trace_bputs_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
struct bputs_entry *field;
trace_assign_type(field, entry);
if (!seq_print_ip_sym(s, field->ip, flags))
goto partial;
if (!trace_seq_puts(s, ": "))
goto partial;
if (!trace_seq_puts(s, field->str))
goto partial;
return TRACE_TYPE_HANDLED;
partial:
return TRACE_TYPE_PARTIAL_LINE;
}
static enum print_line_t
trace_bputs_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct bputs_entry *field;
struct trace_seq *s = &iter->seq;
trace_assign_type(field, iter->ent);
if (!trace_seq_printf(s, ": %lx : ", field->ip))
goto partial;
if (!trace_seq_puts(s, field->str))
goto partial;
return TRACE_TYPE_HANDLED;
partial:
return TRACE_TYPE_PARTIAL_LINE;
}
static struct trace_event_functions trace_bputs_funcs = {
.trace = trace_bputs_print,
.raw = trace_bputs_raw,
};
static struct trace_event trace_bputs_event = {
.type = TRACE_BPUTS,
.funcs = &trace_bputs_funcs,
};
/* TRACE_BPRINT */
static enum print_line_t
trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
&trace_wake_event,
&trace_stack_event,
&trace_user_stack_event,
&trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
NULL


@@ -4,6 +4,8 @@
#include <linux/trace_seq.h>
#include "trace.h"
extern enum print_line_t
trace_print_bputs_msg_only(struct trace_iterator *iter);
extern enum print_line_t
trace_print_bprintk_msg_only(struct trace_iterator *iter);
extern enum print_line_t
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
/* used by module unregistering */
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_mutex;
extern struct rw_semaphore trace_event_sem;
#define MAX_MEMHEX_BYTES 8
#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)


@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
unsigned long flags, int pc)
{
struct ftrace_event_call *call = &event_context_switch;
struct ring_buffer *buffer = tr->buffer;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = ctx_trace->data[cpu];
data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
if (likely(!atomic_read(&data->disabled)))
tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct ftrace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
struct ring_buffer *buffer = tr->buffer;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = ctx_trace->data[cpu];
data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
if (likely(!atomic_read(&data->disabled)))
tracing_sched_wakeup_trace(ctx_trace, wakee, current,


@@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);
static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
static void wakeup_graph_return(struct ftrace_graph_ret *trace);
static int save_lat_flag;
static int save_flags;
static bool function_enabled;
#define TRACE_DISPLAY_GRAPH 1
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (cpu != wakeup_current_cpu)
goto out_enable;
*data = tr->data[cpu];
*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
};
#endif /* CONFIG_FUNCTION_TRACER */
static int register_wakeup_function(int graph, int set)
{
int ret;
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
ret = register_ftrace_graph(&wakeup_graph_return,
&wakeup_graph_entry);
else
ret = register_ftrace_function(&trace_ops);
if (!ret)
function_enabled = true;
return ret;
}
static void unregister_wakeup_function(int graph)
{
if (!function_enabled)
return;
if (graph)
unregister_ftrace_graph();
else
unregister_ftrace_function(&trace_ops);
function_enabled = false;
}
static void wakeup_function_set(int set)
{
if (set)
register_wakeup_function(is_graph(), 1);
else
unregister_wakeup_function(is_graph());
}
static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
{
if (mask & TRACE_ITER_FUNCTION)
wakeup_function_set(set);
return trace_keep_overwrite(tracer, mask, set);
}
static int start_func_tracer(int graph)
{
int ret;
if (!graph)
ret = register_ftrace_function(&trace_ops);
else
ret = register_ftrace_graph(&wakeup_graph_return,
&wakeup_graph_entry);
ret = register_wakeup_function(graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
{
tracer_enabled = 0;
if (!graph)
unregister_ftrace_function(&trace_ops);
else
unregister_ftrace_graph();
unregister_wakeup_function(graph);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
goto out_unlock;
/* The task we are waiting for is waking up */
data = wakeup_trace->data[wakeup_cpu];
data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
tracing_reset_online_cpus(tr);
tracing_reset_online_cpus(&tr->trace_buffer);
local_irq_save(flags);
arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
return;
pc = preempt_count();
disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
local_save_flags(flags);
data = wakeup_trace->data[wakeup_cpu];
data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
@@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
static int __wakeup_tracer_init(struct trace_array *tr)
{
save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
trace_flags |= TRACE_ITER_LATENCY_FMT;
save_flags = trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
wakeup_trace = tr;
@@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
static void wakeup_tracer_reset(struct trace_array *tr)
{
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
@@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
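register_wakeup_function() and wakeup_flag_changed() above gate the function tracer on TRACE_ITER_FUNCTION, so flipping that trace option at runtime attaches or detaches the ftrace callback while the wakeup tracer is active. A minimal user-space model of that flag-gated registration, assuming hypothetical register_hook()/unregister_hook() stubs in place of register_ftrace_function() and its graph variant:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_FUNCTION	(1u << 0)

static unsigned int trace_flags = FLAG_FUNCTION;
static bool function_enabled;

static int register_hook(void)    { puts("hook registered");   return 0; }
static void unregister_hook(void) { puts("hook unregistered"); }

/* 'set' is true when FLAG_FUNCTION is about to be set by the caller. */
static int register_tracer_function(bool set)
{
	int ret;

	if (function_enabled || (!set && !(trace_flags & FLAG_FUNCTION)))
		return 0;		/* already on, or flag not requested */

	ret = register_hook();
	if (!ret)
		function_enabled = true;
	return ret;
}

static void unregister_tracer_function(void)
{
	if (!function_enabled)
		return;
	unregister_hook();
	function_enabled = false;
}

/* Called when a trace option changes while the tracer is active. */
static void tracer_flag_changed(unsigned int mask, bool set)
{
	if (mask & FLAG_FUNCTION) {
		if (set)
			register_tracer_function(true);
		else
			unregister_tracer_function();
	}
	if (set)
		trace_flags |= mask;
	else
		trace_flags &= ~mask;
}

int main(void)
{
	register_tracer_function(false);	 /* tracer init, flag already set */
	tracer_flag_changed(FLAG_FUNCTION, false); /* option cleared at runtime */
	tracer_flag_changed(FLAG_FUNCTION, true);  /* option set again */
	return 0;
}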

View File

@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
return 0;
}
static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
{
struct ring_buffer_event *event;
struct trace_entry *entry;
unsigned int loops = 0;
while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
entry = ring_buffer_event_data(event);
/*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
local_irq_save(flags);
arch_spin_lock(&ftrace_max_lock);
cnt = ring_buffer_entries(tr->buffer);
cnt = ring_buffer_entries(buf->buffer);
/*
* The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
*/
tracing_off();
for_each_possible_cpu(cpu) {
ret = trace_test_buffer_cpu(tr, cpu);
ret = trace_test_buffer_cpu(buf, cpu);
if (ret)
break;
}
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
msleep(100);
/* we should have nothing in the buffer */
ret = trace_test_buffer(tr, &count);
ret = trace_test_buffer(&tr->trace_buffer, &count);
if (ret)
goto out;
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
ftrace_enabled = 0;
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
ret = trace_test_buffer(&tr->trace_buffer, &count);
tracing_start();
/* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ftrace_enabled = 0;
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
static void
__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
if (ftrace_dump_on_oops)
__ftrace_dump(false, DUMP_ALL);
if (ftrace_dump_on_oops) {
ftrace_dump(DUMP_ALL);
/* ftrace_dump() disables tracing */
tracing_on();
}
return 0;
}
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Simulate the init() callback but we attach a watchdog callback
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(tr);
tracing_reset_online_cpus(&tr->trace_buffer);
set_graph_array(tr);
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (!ret)
ret = trace_test_buffer(&max_tr, &count);
ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (!ret)
ret = trace_test_buffer(&max_tr, &count);
ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (ret)
goto out;
ret = trace_test_buffer(&max_tr, &count);
ret = trace_test_buffer(&tr->max_buffer, &count);
if (ret)
goto out;
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (ret)
goto out;
ret = trace_test_buffer(&max_tr, &count);
ret = trace_test_buffer(&tr->max_buffer, &count);
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
ret = trace_test_buffer(&tr->trace_buffer, NULL);
printk("ret = %d\n", ret);
if (!ret)
ret = trace_test_buffer(&max_tr, &count);
ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();

View File

@@ -20,13 +20,24 @@
#define STACK_TRACE_ENTRIES 500
#ifdef CC_USING_FENTRY
# define fentry 1
#else
# define fentry 0
#endif
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
/*
* Reserve one entry for the passed in ip. This will allow
* us to remove most or all of the stack size overhead
* added by the stack tracer itself.
*/
static struct stack_trace max_stack_trace = {
.max_entries = STACK_TRACE_ENTRIES,
.entries = stack_dump_trace,
.max_entries = STACK_TRACE_ENTRIES - 1,
.entries = &stack_dump_trace[1],
};
static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
static inline void check_stack(void)
static inline void
check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags;
unsigned long *p, *top, *start;
static int tracer_frame;
int frame_size = ACCESS_ONCE(tracer_frame);
int i;
this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
/* Remove the frame of the tracer */
this_size -= frame_size;
if (this_size <= max_stack_size)
return;
/* we do not handle interrupt stacks yet */
if (!object_is_on_stack(&this_size))
if (!object_is_on_stack(stack))
return;
local_irq_save(flags);
arch_spin_lock(&max_stack_lock);
/* In case another CPU set the tracer_frame on us */
if (unlikely(!frame_size))
this_size -= tracer_frame;
/* a race could have already updated it */
if (this_size <= max_stack_size)
goto out;
@@ -69,11 +89,19 @@ static inline void check_stack(void)
save_stack_trace(&max_stack_trace);
/*
* Add the passed in ip from the function tracer.
* Searching for this on the stack will skip over
* most of the overhead from the stack tracer itself.
*/
stack_dump_trace[0] = ip;
max_stack_trace.nr_entries++;
/*
* Now find where in the stack these are.
*/
i = 0;
start = &this_size;
start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -97,6 +125,18 @@ static inline void check_stack(void)
found = 1;
/* Start the search from here */
start = p + 1;
/*
* We do not want to show the overhead
* of the stack tracer stack in the
* max stack. If we haven't figured
* out what that is, then figure it out
* now.
*/
if (unlikely(!tracer_frame) && i == 1) {
tracer_frame = (p - stack) *
sizeof(unsigned long);
max_stack_size -= tracer_frame;
}
}
}
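The arithmetic above relies on the stack being a THREAD_SIZE-aligned region that grows down: the address's offset inside that window is the unused portion, so THREAD_SIZE minus the offset is the stack currently in use, from which the tracer's own overhead (tracer_frame) is later subtracted. A minimal user-space sketch of that calculation; THREAD_SIZE and the sample addresses are made up for illustration:

#include <stdio.h>

#define THREAD_SIZE	(8UL * 1024)	/* assume an 8K, power-of-two stack */

static unsigned long stack_in_use(unsigned long sp, unsigned long tracer_frame)
{
	/* Offset inside the THREAD_SIZE window is the unused part. */
	unsigned long used = THREAD_SIZE - (sp & (THREAD_SIZE - 1));

	/* Discount the frames added by the tracer itself, once known. */
	return used - tracer_frame;
}

int main(void)
{
	unsigned long stack_top = 0x100000;	/* end of the aligned window */
	unsigned long sp = stack_top - 0x1a0;	/* current stack position    */

	printf("used: %lu bytes raw, %lu bytes minus a 64-byte tracer frame\n",
	       stack_in_use(sp, 0), stack_in_use(sp, 64));
	return 0;
}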
@@ -113,6 +153,7 @@ static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
unsigned long stack;
int cpu;
preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
check_stack();
/*
* When fentry is used, the traced function does not get
* its stack frame set up, and we lose the parent.
* The ip is pretty useless because the function tracer
* was called before that function set up its stack frame.
* In this case, we use the parent ip.
*
* By adding the return address of either the parent ip
* or the current ip we can disregard most of the stack usage
* caused by the stack tracer itself.
*
* The function tracer always reports the address of where the
* mcount call was, but the stack will hold the return address.
*/
if (fentry)
ip = parent_ip;
else
ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
out:
per_cpu(trace_active, cpu)--;
@@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
.open = stack_trace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
.llseek = ftrace_regex_lseek,
.llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
&max_stack_size, &stack_max_size_fops);

View File

@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
if (!d_tracing)
return 0;
stat_dir = debugfs_create_dir("trace_stat", d_tracing);
if (!stat_dir)

View File

@@ -12,10 +12,6 @@
#include "trace.h"
static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
static int syscall_enter_register(struct ftrace_event_call *event,
enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
/*
* Only compare after the "sys" prefix. Archs that use
* syscall wrappers may have syscalls symbols aliases prefixed
* with "SyS" instead of "sys", leading to an unwanted
* with ".SyS" or ".sys" instead of "sys", leading to an unwanted
* mismatch.
*/
return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
static int syscall_enter_define_fields(struct ftrace_event_call *call)
static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
static int syscall_exit_define_fields(struct ftrace_event_call *call)
static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
if (!test_bit(syscall_nr, enabled_enter_syscalls))
if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
event = trace_current_buffer_lock_reserve(&buffer,
buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
sys_data->enter_event->event.type, size, 0, 0);
if (!event)
return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
struct trace_array *tr = data;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
if (!test_bit(syscall_nr, enabled_exit_syscalls))
if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
event = trace_current_buffer_lock_reserve(&buffer,
buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
static int reg_event_syscall_enter(struct ftrace_event_call *call)
static int reg_event_syscall_enter(struct ftrace_event_file *file,
struct ftrace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
int num;
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
if (!tr->sys_refcount_enter)
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
if (!ret) {
set_bit(num, enabled_enter_syscalls);
sys_refcount_enter++;
set_bit(num, tr->enabled_enter_syscalls);
tr->sys_refcount_enter++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
static void unreg_event_syscall_enter(struct ftrace_event_call *call)
static void unreg_event_syscall_enter(struct ftrace_event_file *file,
struct ftrace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_enter--;
clear_bit(num, enabled_enter_syscalls);
if (!sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
tr->sys_refcount_enter--;
clear_bit(num, tr->enabled_enter_syscalls);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
}
static int reg_event_syscall_exit(struct ftrace_event_call *call)
static int reg_event_syscall_exit(struct ftrace_event_file *file,
struct ftrace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
int num;
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
if (!tr->sys_refcount_exit)
ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
set_bit(num, enabled_exit_syscalls);
sys_refcount_exit++;
set_bit(num, tr->enabled_exit_syscalls);
tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
static void unreg_event_syscall_exit(struct ftrace_event_call *call)
static void unreg_event_syscall_exit(struct ftrace_event_file *file,
struct ftrace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_exit--;
clear_bit(num, enabled_exit_syscalls);
if (!sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
tr->sys_refcount_exit--;
clear_bit(num, tr->enabled_exit_syscalls);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
}
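The reg/unreg helpers above move the syscall enablement state from file-scope globals into each struct trace_array: a per-instance bitmap says which syscalls record into that instance, and a per-instance refcount registers the shared sys_enter/sys_exit tracepoint only on the 0 -> 1 transition and drops it on 1 -> 0. A rough user-space model of that bookkeeping; a bool array stands in for the kernel bitmap, and register_global_hook()/unregister_global_hook() are hypothetical stubs:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_SYSCALLS 64

struct trace_instance {
	bool enabled[NR_SYSCALLS];	/* models tr->enabled_enter_syscalls */
	int refcount;			/* models tr->sys_refcount_enter     */
};

static void register_global_hook(struct trace_instance *tr)
{
	printf("tracepoint registered for %p\n", (void *)tr);
}

static void unregister_global_hook(struct trace_instance *tr)
{
	printf("tracepoint unregistered for %p\n", (void *)tr);
}

static void enable_syscall(struct trace_instance *tr, int nr)
{
	if (!tr->refcount)		/* first enabled syscall: hook up */
		register_global_hook(tr);
	tr->enabled[nr] = true;
	tr->refcount++;
}

static void disable_syscall(struct trace_instance *tr, int nr)
{
	tr->enabled[nr] = false;
	if (!--tr->refcount)		/* last one gone: unhook */
		unregister_global_hook(tr);
}

/* Fast path: an instance records a syscall only if its bit is set. */
static bool syscall_enabled(struct trace_instance *tr, int nr)
{
	return tr->enabled[nr];
}

int main(void)
{
	struct trace_instance tr;

	memset(&tr, 0, sizeof(tr));
	enable_syscall(&tr, 1);
	enable_syscall(&tr, 3);
	printf("nr 3: %d, nr 5: %d\n",
	       syscall_enabled(&tr, 3), syscall_enabled(&tr, 5));
	disable_syscall(&tr, 3);
	disable_syscall(&tr, 1);
	return 0;
}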
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
.trace = print_syscall_exit,
};
struct ftrace_event_class event_class_syscall_enter = {
struct ftrace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
.define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
.raw_init = init_syscall_trace,
};
struct ftrace_event_class event_class_syscall_exit = {
struct ftrace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
.define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
static int syscall_enter_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
struct ftrace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
return reg_event_syscall_enter(event);
return reg_event_syscall_enter(file, event);
case TRACE_REG_UNREGISTER:
unreg_event_syscall_enter(event);
unreg_event_syscall_enter(file, event);
return 0;
#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
static int syscall_exit_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
struct ftrace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
return reg_event_syscall_exit(event);
return reg_event_syscall_exit(file, event);
case TRACE_REG_UNREGISTER:
unreg_event_syscall_exit(event);
unreg_event_syscall_exit(file, event);
return 0;
#ifdef CONFIG_PERF_EVENTS

View File

@@ -28,6 +28,18 @@
#define UPROBE_EVENT_SYSTEM "uprobes"
struct uprobe_trace_entry_head {
struct trace_entry ent;
unsigned long vaddr[];
};
#define SIZEOF_TRACE_ENTRY(is_return) \
(sizeof(struct uprobe_trace_entry_head) + \
sizeof(unsigned long) * (is_return ? 2 : 1))
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
struct trace_uprobe_filter {
rwlock_t rwlock;
int nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
static LIST_HEAD(uprobe_list);
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
unsigned long func, struct pt_regs *regs);
static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
{
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
return !filter->nr_systemwide && list_empty(&filter->perf_events);
}
static inline bool is_ret_probe(struct trace_uprobe *tu)
{
return tu->consumer.ret_handler != NULL;
}
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
static struct trace_uprobe *
alloc_trace_uprobe(const char *group, const char *event, int nargs)
alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
{
struct trace_uprobe *tu;
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
INIT_LIST_HEAD(&tu->list);
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
return tu;
@@ -180,7 +201,7 @@ end:
/*
* Argument syntax:
* - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
* - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
*
* - Remove uprobe: -:[GRP/]EVENT
*/
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
char buf[MAX_EVENT_NAME_LEN];
struct path path;
unsigned long offset;
bool is_delete;
bool is_delete, is_return;
int i, ret;
inode = NULL;
ret = 0;
is_delete = false;
is_return = false;
event = NULL;
group = NULL;
/* argc must be >= 1 */
if (argv[0][0] == '-')
is_delete = true;
else if (argv[0][0] == 'r')
is_return = true;
else if (argv[0][0] != 'p') {
pr_info("Probe definition must be started with 'p' or '-'.\n");
pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
return -EINVAL;
}
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
kfree(tail);
}
tu = alloc_trace_uprobe(group, event, argc);
tu = alloc_trace_uprobe(group, event, argc, is_return);
if (IS_ERR(tu)) {
pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
static int probes_seq_show(struct seq_file *m, void *v)
{
struct trace_uprobe *tu = v;
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
.release = seq_release,
};
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
static void uprobe_trace_print(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
u8 *data;
int size, i, pc;
unsigned long irq_flags;
void *data;
int size, i;
struct ftrace_event_call *call = &tu->call;
local_save_flags(irq_flags);
pc = preempt_count();
size = sizeof(*entry) + tu->size;
size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
size, irq_flags, pc);
size + tu->size, 0, 0);
if (!event)
return 0;
return;
entry = ring_buffer_event_data(event);
entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
if (is_ret_probe(tu)) {
entry->vaddr[0] = func;
entry->vaddr[1] = instruction_pointer(regs);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
entry->vaddr[0] = instruction_pointer(regs);
data = DATAOF_TRACE_ENTRY(entry, false);
}
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
trace_buffer_unlock_commit(buffer, event, 0, 0);
}
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
if (!is_ret_probe(tu))
uprobe_trace_print(tu, 0, regs);
return 0;
}
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs)
{
uprobe_trace_print(tu, func, regs);
}
/* Event entry printers */
static enum print_line_t
print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
{
struct uprobe_trace_entry_head *field;
struct uprobe_trace_entry_head *entry;
struct trace_seq *s = &iter->seq;
struct trace_uprobe *tu;
u8 *data;
int i;
field = (struct uprobe_trace_entry_head *)iter->ent;
entry = (struct uprobe_trace_entry_head *)iter->ent;
tu = container_of(event, struct trace_uprobe, call.event);
if (!trace_seq_printf(s, "%s: (", tu->call.name))
goto partial;
if (is_ret_probe(tu)) {
if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
entry->vaddr[1], entry->vaddr[0]))
goto partial;
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
entry->vaddr[0]))
goto partial;
data = DATAOF_TRACE_ENTRY(entry, false);
}
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
goto partial;
if (!trace_seq_puts(s, ")"))
goto partial;
data = (u8 *)&field[1];
for (i = 0; i < tu->nr_args; i++) {
if (!tu->args[i].type->print(s, tu->args[i].name,
data + tu->args[i].offset, field))
data + tu->args[i].offset, entry))
goto partial;
}
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
{
int ret, i;
int ret, i, size;
struct uprobe_trace_entry_head field;
struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
struct trace_uprobe *tu = event_call->data;
DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
if (is_ret_probe(tu)) {
DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
size = SIZEOF_TRACE_ENTRY(true);
} else {
DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
size = SIZEOF_TRACE_ENTRY(false);
}
/* Set argument names as fields */
for (i = 0; i < tu->nr_args; i++) {
ret = trace_define_field(event_call, tu->args[i].type->fmttype,
tu->args[i].name,
sizeof(field) + tu->args[i].offset,
size + tu->args[i].offset,
tu->args[i].type->size,
tu->args[i].type->is_signed,
FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
int i;
int pos = 0;
fmt = "(%lx)";
arg = "REC->" FIELD_STRING_IP;
if (is_ret_probe(tu)) {
fmt = "(%lx <- %lx)";
arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
} else {
fmt = "(%lx)";
arg = "REC->" FIELD_STRING_IP;
}
/* When len=0, we just calculate the needed length */
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
return ret;
}
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
static void uprobe_perf_print(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry;
struct hlist_head *head;
u8 *data;
int size, __size, i;
int rctx;
void *data;
int size, rctx, i;
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
__size = sizeof(*entry) + tu->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return 0;
return;
preempt_disable();
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
goto out;
entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
goto out;
entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
if (is_ret_probe(tu)) {
entry->vaddr[0] = func;
entry->vaddr[1] = instruction_pointer(regs);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
entry->vaddr[0] = instruction_pointer(regs);
data = DATAOF_TRACE_ENTRY(entry, false);
}
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
head = this_cpu_ptr(call->perf_events);
perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
out:
preempt_enable();
}
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
uprobe_perf_print(tu, 0, regs);
return 0;
}
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs)
{
uprobe_perf_print(tu, func, regs);
}
#endif /* CONFIG_PERF_EVENTS */
static
int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
{
struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
struct trace_uprobe *tu = event->data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
return ret;
}
static int uretprobe_dispatcher(struct uprobe_consumer *con,
unsigned long func, struct pt_regs *regs)
{
struct trace_uprobe *tu;
tu = container_of(con, struct trace_uprobe, consumer);
if (tu->flags & TP_FLAG_TRACE)
uretprobe_trace_func(tu, func, regs);
#ifdef CONFIG_PERF_EVENTS
if (tu->flags & TP_FLAG_PROFILE)
uretprobe_perf_func(tu, func, regs);
#endif
return 0;
}
static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
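The SIZEOF_TRACE_ENTRY()/DATAOF_TRACE_ENTRY() helpers introduced in this file size each record as a fixed header plus one vaddr for an ordinary probe hit or two (function and return address) for a uretprobe, with the fetched arguments following; the perf path then rounds the result so that the payload plus perf's u32 size header stays u64 aligned. A compile-and-run sketch of that layout math; struct entry_head only mirrors the rough shape of struct trace_entry, and args_size is a made-up number:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct entry_head {
	uint16_t type;			/* stand-in for struct trace_entry */
	uint8_t  flags;
	uint8_t  preempt_count;
	int32_t  pid;
	unsigned long vaddr[];		/* 1 entry for probes, 2 for return probes */
};

#define SIZEOF_TRACE_ENTRY(is_return) \
	(sizeof(struct entry_head) + sizeof(unsigned long) * ((is_return) ? 2 : 1))

/* Fetched arguments start right past the header (kernel version does the
 * same with void * arithmetic). */
#define DATAOF_TRACE_ENTRY(entry, is_return) \
	((void *)((char *)(entry) + SIZEOF_TRACE_ENTRY(is_return)))

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	size_t args_size = 12;		/* pretend the fetch args need 12 bytes */
	size_t entry, perf;
	int is_return;

	for (is_return = 0; is_return <= 1; is_return++) {
		entry = SIZEOF_TRACE_ENTRY(is_return);
		/* perf wants (payload + u32 size header) to be u64 aligned */
		perf = ALIGN(entry + args_size + sizeof(uint32_t), sizeof(uint64_t))
		       - sizeof(uint32_t);
		printf("%s probe: entry=%zu bytes, perf payload=%zu bytes\n",
		       is_return ? "return" : "entry", entry, perf);
	}
	return 0;
}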

View File

@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
int nr_probes = 0;
struct tracepoint_func *old, *new;
WARN_ON(!probe);
if (WARN_ON(!probe))
return ERR_PTR(-EINVAL);
debug_print_probes(entry);
old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
debug_print_probes(entry);
/* (N -> M), (N > 1, M >= 0) probes */
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
if (!probe ||
(old[nr_probes].func == probe &&
old[nr_probes].data == data))
nr_del++;
if (probe) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
if (old[nr_probes].func == probe &&
old[nr_probes].data == data)
nr_del++;
}
}
/*
* If probe is NULL, then nr_probes = nr_del = 0, and then the
* entire entry will be removed.
*/
if (nr_probes - nr_del == 0) {
/* N -> 0, (N > 1) */
entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i].func; i++)
if (probe &&
(old[i].func != probe || old[i].data != data))
if (old[i].func != probe || old[i].data != data)
new[j++] = old[i];
new[nr_probes - nr_del].func = NULL;
entry->refcount = nr_probes - nr_del;
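The rework above makes the (N -> M) removal explicit: matches are counted only when a probe function is given, a NULL probe now means "drop every probe for this tracepoint", and the survivors are copied into a smaller NULL-terminated array. A small user-space model of that filtering, with struct probe_func and the helpers as simplified stand-ins for struct tracepoint_func and the kernel allocation paths:

#include <stdio.h>
#include <stdlib.h>

struct probe_func {
	void (*func)(void *data);
	void *data;
};

/* Returns a new NULL-terminated array, or NULL when nothing is left. */
static struct probe_func *remove_probe(const struct probe_func *old,
				       void (*func)(void *), void *data)
{
	int nr_probes = 0, nr_del = 0, i, j = 0;
	struct probe_func *new;

	if (func) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
			if (old[nr_probes].func == func &&
			    old[nr_probes].data == data)
				nr_del++;
		}
	}
	/* func == NULL leaves nr_probes == nr_del == 0: drop the whole array */
	if (nr_probes - nr_del == 0)
		return NULL;

	new = calloc(nr_probes - nr_del + 1, sizeof(*new));
	if (!new) {
		perror("calloc");
		exit(1);
	}
	for (i = 0; old[i].func; i++)
		if (old[i].func != func || old[i].data != data)
			new[j++] = old[i];
	/* calloc already NULL-terminated the copy */
	return new;
}

static void probe_a(void *data) { (void)data; }
static void probe_b(void *data) { (void)data; }

int main(void)
{
	struct probe_func old[] = {
		{ probe_a, NULL }, { probe_b, NULL }, { NULL, NULL },
	};
	struct probe_func *new = remove_probe(old, probe_a, NULL);
	int n = 0;

	while (new && new[n].func)
		n++;
	printf("probes left: %d\n", n);
	free(new);
	return 0;
}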

View File

@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
.may_mount_sysfs = true,
.may_mount_proc = true,
};
EXPORT_SYMBOL_GPL(init_user_ns);

View File

@@ -21,10 +21,12 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)
kgid_t group = new->egid;
int ret;
/*
* Verify that we can not violate the policy of which files
* may be accessed that is specified by the root directory,
by verifying that the root directory is at the root of the
* mount namespace which allows all files to be accessed.
*/
if (current_chrooted())
return -EPERM;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
@@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)
set_cred_user_ns(new, ns);
update_mnt_policy(ns);
return 0;
}
@@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (map->nr_extents != 0)
goto out;
/* Require the appropriate privilege CAP_SETUID or CAP_SETGID
* over the user namespace in order to set the id mapping.
/*
* Adjusting namespace settings requires capabilities on the target.
*/
if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
/* Get a buffer */
@@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
if (!new_idmap_permitted(ns, cap_setid, &new_map))
if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
/* Map the lower ids from the parent user namespace to the
@@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
&ns->projid_map, &ns->parent->projid_map);
}
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
/* Allow mapping to your own filesystem ids */
@@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
if (uid_eq(uid, current_fsuid()))
if (uid_eq(uid, file->f_cred->fsuid))
return true;
}
else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
if (gid_eq(gid, current_fsgid()))
if (gid_eq(gid, file->f_cred->fsgid))
return true;
}
}
@@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
* And the opener of the id file also had the appropriate capability.
*/
if (ns_capable(ns->parent, cap_setid))
if (ns_capable(ns->parent, cap_setid) &&
file_ns_capable(file, ns->parent, cap_setid))
return true;
return false;
@@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
if (atomic_read(&current->mm->mm_users) > 1)
return -EINVAL;
if (current->fs->users != 1)
return -EINVAL;
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;

View File

@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,
return ret;
set_sample_period();
/*
* Watchdog threads shouldn't be enabled if they are
* disabled. The 'watchdog_disabled' variable check in
* watchdog_*_all_cpus() function takes care of this.
*/
if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else

File diff suppressed because it is too large

View File

@@ -32,14 +32,12 @@ struct worker {
struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
/* 64 bytes boundary on 64bit, 32 on 32bit */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
/* for rebinding worker to CPU */
struct work_struct rebind_work; /* L: for busy worker */
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
};
@@ -58,8 +56,7 @@ static inline struct worker *current_wq_worker(void)
* Scheduler hooks for concurrency managed workqueue. Only to be used from
* sched.c and workqueue.c.
*/
void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
struct task_struct *wq_worker_sleeping(struct task_struct *task,
unsigned int cpu);
void wq_worker_waking_up(struct task_struct *task, int cpu);
struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */