Merge commit 'v2.6.30-rc1' into core/urgent

Merge reason: need latest upstream to queue up dependent fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Ingo Molnar
2009-04-08 17:02:50 +02:00
3134 changed files with 505113 additions and 125827 deletions

View File

@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -93,6 +94,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

View File

@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_format(ab, " msg=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
size--;
audit_log_n_untrustedstring(ab, data, size);
}
audit_set_pid(ab, pid);
@@ -1382,7 +1385,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
int audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len && *p; p++) {
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
return 1;
}
@@ -1437,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
audit_log_format(ab, "<no memory>");
audit_log_string(ab, "<no_memory>");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
audit_log_format(ab, "<too long>");
audit_log_string(ab, "<too_long>");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);

View File

@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
mutex_lock(&inode->inotify_mutex);
if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch);
free_chunk(chunk);
return -ENOSPC;
}
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk->dead = 1;
inotify_evict_watch(&chunk->watch);
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch);
put_inotify_watch(&chunk->watch);
return 0;
}

View File

@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
struct audit_krule *erule = &e->rule;
/* some rules don't have associated watches */
if (e->rule.watch)
audit_put_watch(e->rule.watch);
if (e->rule.fields)
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
if (erule->watch)
audit_put_watch(erule->watch);
if (erule->fields)
for (i = 0; i < erule->field_count; i++) {
struct audit_field *f = &erule->fields[i];
kfree(f->lsm_str);
security_audit_rule_free(f->lsm_rule);
}
kfree(e->rule.fields);
kfree(e->rule.filterkey);
kfree(erule->fields);
kfree(erule->filterkey);
kfree(e);
}

View File

@@ -66,6 +66,7 @@
#include <linux/syscalls.h>
#include <linux/inotify.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include "audit.h"
@@ -328,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
*/
#ifdef CONFIG_AUDIT_TREE
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_RECORD_CONTEXT;
}
}
static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
@@ -741,17 +750,9 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_RECORD_CONTEXT;
}
}
static inline struct audit_context *audit_get_context(struct task_struct *tsk,
int return_valid,
int return_code)
long return_code)
{
struct audit_context *context = tsk->audit_context;
@@ -1023,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
{
char arg_num_len_buf[12];
const char __user *tmp_p = p;
/* how many digits are in arg_num? 3 is the length of a=\n */
/* how many digits are in arg_num? 3 is the length of " a=" */
size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
size_t len, len_left, to_send;
size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1109,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* so we can be sure nothing was lost.
*/
if ((i == 0) && (too_long))
audit_log_format(*ab, "a%d_len=%zu ", arg_num,
audit_log_format(*ab, " a%d_len=%zu", arg_num,
has_cntl ? 2*len : len);
/*
@@ -1129,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
buf[to_send] = '\0';
/* actually log it */
audit_log_format(*ab, "a%d", arg_num);
audit_log_format(*ab, " a%d", arg_num);
if (too_long)
audit_log_format(*ab, "[%d]", i);
audit_log_format(*ab, "=");
@@ -1137,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
audit_log_n_hex(*ab, buf, to_send);
else
audit_log_format(*ab, "\"%s\"", buf);
audit_log_format(*ab, "\n");
p += to_send;
len_left -= to_send;
@@ -1165,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
p = (const char __user *)axi->mm->arg_start;
audit_log_format(*ab, "argc=%d ", axi->argc);
audit_log_format(*ab, "argc=%d", axi->argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case 0:
/* name was specified as a relative path and the
* directory component is the cwd */
audit_log_d_path(ab, " name=", &context->pwd);
audit_log_d_path(ab, "name=", &context->pwd);
break;
default:
/* log the name's directory component */
@@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
* @mode: mode bits
* @u_attr: queue attributes
* @attr: queue attributes
*
*/
void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
/**
* __audit_mq_notify - record audit data for a POSIX MQ notify
* @mqdes: MQ descriptor
* @u_notification: Notification event
* @notification: Notification event
*
*/

View File

@@ -94,7 +94,6 @@ struct cgroupfs_root {
char release_agent_path[PATH_MAX];
};
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
*/
static struct cgroupfs_root rootnode;
/*
* CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
* cgroup_subsys->use_id != 0.
*/
#define CSS_ID_MAX (65535)
struct css_id {
/*
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
* is called after synchronize_rcu(). But for safe use, css_is_removed()
* css_tryget() should be used for avoiding race.
*/
struct cgroup_subsys_state *css;
/*
* ID of this css.
*/
unsigned short id;
/*
* Depth in hierarchy which this ID belongs to.
*/
unsigned short depth;
/*
* ID is freed by RCU. (and lookup routine is RCU safe.)
*/
struct rcu_head rcu_head;
/*
* Hierarchy of CSS ID belongs to.
*/
unsigned short stack[0]; /* Array of Length (depth+1) */
};
/* The list of hierarchy roots */
static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static int alloc_css_id(struct cgroup_subsys *ss,
struct cgroup *parent, struct cgroup *child);
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
static void cgroup_call_pre_destroy(struct cgroup *cgrp)
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ret = 0;
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy)
ss->pre_destroy(ss, cgrp);
return;
if (ss->pre_destroy) {
ret = ss->pre_destroy(ss, cgrp);
if (ret)
break;
}
return ret;
}
static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
remove_dir(dentry);
}
/*
* A queue for waiters to do rmdir() cgroup. A tasks will sleep when
* cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
* CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
{
if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
}
ret = rebind_subsystems(root, opts.subsys_bits);
if (ret)
goto out_unlock;
/* (re)populate subsystem files */
if (!ret)
cgroup_populate_dir(cgrp);
cgroup_populate_dir(cgrp);
if (opts.release_agent)
strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
if (opts.release_agent)
kfree(opts.release_agent);
kfree(opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
if (ret) {
if (opts.release_agent)
kfree(opts.release_agent);
kfree(opts.release_agent);
return ret;
}
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
if (opts.release_agent)
kfree(opts.release_agent);
kfree(opts.release_agent);
return -ENOMEM;
}
@@ -1280,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiters(cgrp);
return 0;
}
@@ -1625,7 +1687,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
.rename = cgroup_rename,
};
static int cgroup_create_file(struct dentry *dentry, int mode,
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
static const struct dentry_operations cgroup_dops = {
@@ -1671,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
* @mode: mode to set on new directory.
*/
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
int mode)
mode_t mode)
{
struct dentry *parent;
int error = 0;
@@ -1689,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
return error;
}
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* returns cft->mode if ->mode is not 0
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write hander
*/
static mode_t cgroup_file_mode(const struct cftype *cft)
{
mode_t mode = 0;
if (cft->mode)
return cft->mode;
if (cft->read || cft->read_u64 || cft->read_s64 ||
cft->read_map || cft->read_seq_string)
mode |= S_IRUGO;
if (cft->write || cft->write_u64 || cft->write_s64 ||
cft->write_string || cft->trigger)
mode |= S_IWUSR;
return mode;
}
int cgroup_add_file(struct cgroup *cgrp,
struct cgroup_subsys *subsys,
const struct cftype *cft)
@@ -1696,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
mode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1706,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
dentry = lookup_one_len(name, dir, strlen(name));
if (!IS_ERR(dentry)) {
error = cgroup_create_file(dentry, 0644 | S_IFREG,
mode = cgroup_file_mode(cft);
error = cgroup_create_file(dentry, mode | S_IFREG,
cgrp->root->sb);
if (!error)
dentry->d_fsdata = (void *)cft;
@@ -2288,6 +2379,7 @@ static struct cftype files[] = {
.write_u64 = cgroup_tasks_write,
.release = cgroup_tasks_release,
.private = FILE_TASKLIST,
.mode = S_IRUGO | S_IWUSR,
},
{
@@ -2327,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
return err;
}
/* This cgroup is ready now */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferened
* from RCU-read-side without locks.
*/
if (css->id)
rcu_assign_pointer(css->id->css, css);
}
return 0;
}
@@ -2338,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
css->cgroup = cgrp;
atomic_set(&css->refcnt, 1);
css->flags = 0;
css->id = NULL;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2376,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
int mode)
mode_t mode)
{
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
@@ -2413,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id)
if (alloc_css_id(ss, parent, cgrp))
goto err_destroy;
/* At error, ->destroy() callback has to free assigned ID. */
}
cgroup_lock_hierarchy(root);
@@ -2555,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
DEFINE_WAIT(wait);
int ret;
/* the vfs holds both inode->i_mutex already */
again:
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
@@ -2573,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
cgroup_call_pre_destroy(cgrp);
ret = cgroup_call_pre_destroy(cgrp);
if (ret)
return ret;
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
|| !cgroup_clear_css_refs(cgrp)) {
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
/*
* css_put/get is provided for subsys to grab refcnt to css. In typical
* case, subsystem has no reference after pre_destroy(). But, under
* hierarchy management, some *temporal* refcnt can be hold.
* To avoid returning -EBUSY to a user, waitqueue is used. If subsys
* is really busy, it should return -EBUSY at pre_destroy(). wake_up
* is called when css_put() is called and refcnt goes down to 0.
*/
set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
return -EINTR;
goto again;
}
/* NO css_tryget() can success after here. */
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2708,6 +2840,8 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
if (ss->use_id)
cgroup_subsys_init_idr(ss);
}
/* Add init_css_set to the hash table */
@@ -3084,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
}
/**
* cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
* cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
* @cgrp: the cgroup in question
* @task: the task in question
*
* See if @cgrp is a descendant of the current task's cgroup in
* the appropriate hierarchy.
* See if @cgrp is a descendant of @task's cgroup in the appropriate
* hierarchy.
*
* If we are sending in dummytop, then presumably we are creating
* the top cgroup in the subsystem.
*
* Called only by the ns (nsproxy) cgroup.
*/
int cgroup_is_descendant(const struct cgroup *cgrp)
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
int ret;
struct cgroup *target;
@@ -3105,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
return 1;
get_first_subsys(cgrp, NULL, &subsys_id);
target = task_cgroup(current, subsys_id);
target = task_cgroup(task, subsys_id);
while (cgrp != target && cgrp!= cgrp->top_cgroup)
cgrp = cgrp->parent;
ret = (cgrp == target);
@@ -3138,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
if ((atomic_dec_return(&css->refcnt) == 1) &&
notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
if (atomic_dec_return(&css->refcnt) == 1) {
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
cgroup_wakeup_rmdir_waiters(cgrp);
}
rcu_read_unlock();
}
@@ -3241,3 +3378,232 @@ static int __init cgroup_disable(char *str)
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
/*
* Functons for CSS ID.
*/
/*
*To get ID other than 0, this should be called when !cgroup_is_removed().
*/
unsigned short css_id(struct cgroup_subsys_state *css)
{
struct css_id *cssid = rcu_dereference(css->id);
if (cssid)
return cssid->id;
return 0;
}
unsigned short css_depth(struct cgroup_subsys_state *css)
{
struct css_id *cssid = rcu_dereference(css->id);
if (cssid)
return cssid->depth;
return 0;
}
bool css_is_ancestor(struct cgroup_subsys_state *child,
const struct cgroup_subsys_state *root)
{
struct css_id *child_id = rcu_dereference(child->id);
struct css_id *root_id = rcu_dereference(root->id);
if (!child_id || !root_id || (child_id->depth < root_id->depth))
return false;
return child_id->stack[root_id->depth] == root_id->id;
}
static void __free_css_id_cb(struct rcu_head *head)
{
struct css_id *id;
id = container_of(head, struct css_id, rcu_head);
kfree(id);
}
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = css->id;
/* When this is called before css_id initialization, id can be NULL */
if (!id)
return;
BUG_ON(!ss->use_id);
rcu_assign_pointer(id->css, NULL);
rcu_assign_pointer(css->id, NULL);
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
spin_unlock(&ss->id_lock);
call_rcu(&id->rcu_head, __free_css_id_cb);
}
/*
* This is called by init or create(). Then, calls to this function are
* always serialized (By cgroup_mutex() at create()).
*/
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
int myid, error, size;
BUG_ON(!ss->use_id);
size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
/* get id */
if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
error = -ENOMEM;
goto err_out;
}
spin_lock(&ss->id_lock);
/* Don't use 0. allocates an ID of 1-65535 */
error = idr_get_new_above(&ss->idr, newid, 1, &myid);
spin_unlock(&ss->id_lock);
/* Returns error when there are no free spaces for new ID.*/
if (error) {
error = -ENOSPC;
goto err_out;
}
if (myid > CSS_ID_MAX)
goto remove_idr;
newid->id = myid;
newid->depth = depth;
return newid;
remove_idr:
error = -ENOSPC;
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, myid);
spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
return ERR_PTR(error);
}
static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
{
struct css_id *newid;
struct cgroup_subsys_state *rootcss;
spin_lock_init(&ss->id_lock);
idr_init(&ss->idr);
rootcss = init_css_set.subsys[ss->subsys_id];
newid = get_new_cssid(ss, 0);
if (IS_ERR(newid))
return PTR_ERR(newid);
newid->stack[0] = newid->id;
newid->css = rootcss;
rootcss->id = newid;
return 0;
}
static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
struct cgroup *child)
{
int subsys_id, i, depth = 0;
struct cgroup_subsys_state *parent_css, *child_css;
struct css_id *child_id, *parent_id = NULL;
subsys_id = ss->subsys_id;
parent_css = parent->subsys[subsys_id];
child_css = child->subsys[subsys_id];
depth = css_depth(parent_css) + 1;
parent_id = parent_css->id;
child_id = get_new_cssid(ss, depth);
if (IS_ERR(child_id))
return PTR_ERR(child_id);
for (i = 0; i < depth; i++)
child_id->stack[i] = parent_id->stack[i];
child_id->stack[depth] = child_id->id;
/*
* child_id->css pointer will be set after this cgroup is available
* see cgroup_populate_dir()
*/
rcu_assign_pointer(child_css->id, child_id);
return 0;
}
/**
* css_lookup - lookup css by id
* @ss: cgroup subsys to be looked into.
* @id: the id
*
* Returns pointer to cgroup_subsys_state if there is valid one with id.
* NULL if not. Should be called under rcu_read_lock()
*/
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
{
struct css_id *cssid = NULL;
BUG_ON(!ss->use_id);
cssid = idr_find(&ss->idr, id);
if (unlikely(!cssid))
return NULL;
return rcu_dereference(cssid->css);
}
/**
* css_get_next - lookup next cgroup under specified hierarchy.
* @ss: pointer to subsystem
* @id: current position of iteration.
* @root: pointer to css. search tree under this.
* @foundid: position of found object.
*
* Search next css under the specified hierarchy of rootid. Calling under
* rcu_read_lock() is necessary. Returns NULL if it reaches the end.
*/
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
struct cgroup_subsys_state *root, int *foundid)
{
struct cgroup_subsys_state *ret = NULL;
struct css_id *tmp;
int tmpid;
int rootid = css_id(root);
int depth = css_depth(root);
if (!rootid)
return NULL;
BUG_ON(!ss->use_id);
/* fill start point for scan */
tmpid = id;
while (1) {
/*
* scan next entry from bitmap(tree), tmpid is updated after
* idr_get_next().
*/
spin_lock(&ss->id_lock);
tmp = idr_get_next(&ss->idr, &tmpid);
spin_unlock(&ss->id_lock);
if (!tmp)
break;
if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
ret = rcu_dereference(tmp->css);
if (ret) {
*foundid = tmpid;
break;
}
}
/* continue to scan from next id */
tmpid = tmpid + 1;
}
return ret;
}

View File

@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
{
u64 count;
cgroup_lock();
count = cgroup_task_count(cont);
cgroup_unlock();
return count;
}

View File

@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
return container_of(task_subsys_state(task, cpuset_subsys_id),
struct cpuset, css);
}
struct cpuset_hotplug_scanner {
struct cgroup_scanner scan;
struct cgroup *to;
};
/* bits in struct cpuset flags field */
typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
return 0;
}
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus();
}
#else /* !CONFIG_SMP */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}
static int generate_sched_domains(struct cpumask **domains,
struct sched_domain_attr **attributes)
{
*domains = NULL;
return 1;
}
#endif /* CONFIG_SMP */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
mutex_unlock(&callback_mutex);
}
/*
* Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
* nodes if memory_migrate flag is set. Called with cgroup_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
{
struct mm_struct *mm;
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
mm = get_task_mm(p);
if (!mm)
return;
cs = cgroup_cs(scan->cg);
migrate = is_memory_migrate(cs);
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
mmput(mm);
}
static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
* Called with cgroup_mutex held
* Return 0 if successful, -errno if not.
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
struct ptr_heap *heap)
{
struct task_struct *p;
struct mm_struct **mmarray;
int i, n, ntasks;
int migrate;
int fudge;
struct cgroup_iter it;
int retval;
struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
fudge = 10; /* spare mmarray[] slots */
fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
retval = -ENOMEM;
scan.cg = cs->css.cgroup;
scan.test_task = NULL;
scan.process_task = cpuset_change_nodemask;
scan.heap = heap;
scan.data = (nodemask_t *)oldmem;
/*
* Allocate mmarray[] to hold mm reference for each task
* in cpuset cs. Can't kmalloc GFP_KERNEL while holding
* tasklist_lock. We could use GFP_ATOMIC, but with a
* few more lines of code, we can retry until we get a big
* enough mmarray[] w/o using GFP_ATOMIC.
*/
while (1) {
ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
ntasks += fudge;
mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
if (!mmarray)
goto done;
read_lock(&tasklist_lock); /* block fork */
if (cgroup_task_count(cs->css.cgroup) <= ntasks)
break; /* got enough */
read_unlock(&tasklist_lock); /* try again */
kfree(mmarray);
}
n = 0;
/* Load up mmarray[] with mm reference for each task in cpuset. */
cgroup_iter_start(cs->css.cgroup, &it);
while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
struct mm_struct *mm;
if (n >= ntasks) {
printk(KERN_WARNING
"Cpuset mempolicy rebind incomplete.\n");
break;
}
mm = get_task_mm(p);
if (!mm)
continue;
mmarray[n++] = mm;
}
cgroup_iter_end(cs->css.cgroup, &it);
read_unlock(&tasklist_lock);
/*
* Now that we've dropped the tasklist spinlock, we can
* rebind the vma mempolicies of each mm in mmarray[] to their
* new cpuset, and release that mm. The mpol_rebind_mm()
* call takes mmap_sem, which we couldn't take while holding
* tasklist_lock. Forks can happen again now - the mpol_dup()
* cpuset_being_rebound check will catch such forks, and rebind
* their vma mempolicies too. Because we still hold the global
* cgroup_mutex, we know that no other rebind effort will
* be contending for the global variable cpuset_being_rebound.
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
* the global cgroup_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
migrate = is_memory_migrate(cs);
for (i = 0; i < n; i++) {
struct mm_struct *mm = mmarray[i];
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
mmput(mm);
}
cgroup_scan_tasks(&scan);
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
kfree(mmarray);
cpuset_being_rebound = NULL;
retval = 0;
done:
return retval;
}
/*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
{
nodemask_t oldmem;
int retval;
struct ptr_heap heap;
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
if (retval < 0)
goto done;
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
retval = update_tasks_nodemask(cs, &oldmem);
update_tasks_nodemask(cs, &oldmem, &heap);
heap_free(&heap);
done:
return retval;
}
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
if (val < -1 || val >= SD_LV_MAX)
return -EINVAL;
#endif
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct task_struct *tsk)
{
struct cpuset *cs = cgroup_cs(cont);
int ret = 0;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
if (tsk->flags & PF_THREAD_BOUND) {
mutex_lock(&callback_mutex);
if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
ret = -EINVAL;
mutex_unlock(&callback_mutex);
}
/*
* Kthreads bound to specific cpus cannot be moved to a new cpuset; we
* cannot change their cpu affinity and isolating such threads by their
* set of allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for success of
* set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
* be changed.
*/
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
return security_task_setscheduler(tsk, 0, NULL);
}
static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE,
.mode = S_IRUGO,
},
{
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
static void cpuset_do_move_task(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
struct cpuset_hotplug_scanner *chsp;
struct cgroup *new_cgroup = scan->data;
chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
cgroup_attach_task(chsp->to, tsk);
cgroup_attach_task(new_cgroup, tsk);
}
/**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
struct cpuset_hotplug_scanner scan;
struct cgroup_scanner scan;
scan.scan.cg = from->css.cgroup;
scan.scan.test_task = NULL; /* select all tasks in cgroup */
scan.scan.process_task = cpuset_do_move_task;
scan.scan.heap = NULL;
scan.to = to->css.cgroup;
scan.cg = from->css.cgroup;
scan.test_task = NULL; /* select all tasks in cgroup */
scan.process_task = cpuset_do_move_task;
scan.heap = NULL;
scan.data = to->css.cgroup;
if (cgroup_scan_tasks(&scan.scan))
if (cgroup_scan_tasks(&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
update_tasks_nodemask(cp, &oldmems);
update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
}
cgroup_lock();
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
mutex_unlock(&callback_mutex);
scan_for_empty_cpusets(&top_cpuset);
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
cgroup_lock();
switch (action) {
case MEM_ONLINE:
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
break;
case MEM_OFFLINE:
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);
mutex_unlock(&callback_mutex);
if (action == MEM_OFFLINE)
scan_for_empty_cpusets(&top_cpuset);
break;
default:
break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
}
/**
* cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
* @z: is this zone on an allowed node?
* cpuset_node_allowed_softwall - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
* If we're in interrupt, yes, we can always allocate. If
* __GFP_THISNODE is set, yes, we can always allocate. If zone
* z's node is in our tasks mems_allowed, yes. If it's not a
* __GFP_HARDWALL request and this zone's nodes is in the nearest
* hardwalled cpuset ancestor to this tasks cpuset, yes.
* If the task has been OOM killed and has access to memory reserves
* as specified by the TIF_MEMDIE flag, yes.
* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
* set, yes, we can always allocate. If node is in our task's mems_allowed,
* yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
* hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
* OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
* flag, yes.
* Otherwise, no.
*
* If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
* reduces to cpuset_zone_allowed_hardwall(). Otherwise,
* cpuset_zone_allowed_softwall() might sleep, and might allow a zone
* from an enclosing cpuset.
* If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
* cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
* might sleep, and might allow a node from an enclosing cpuset.
*
* cpuset_zone_allowed_hardwall() only handles the simpler case of
* hardwall cpusets, and never sleeps.
* cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
* cpusets, and never sleeps.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
* GFP_USER - only nodes in current tasks mems allowed ok.
*
* Rule:
* Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
* Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
* the code that might scan up ancestor cpusets and sleep.
*/
int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
}
/*
* cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
* @z: is this zone on an allowed node?
* cpuset_node_allowed_hardwall - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
* If we're in interrupt, yes, we can always allocate.
* If __GFP_THISNODE is set, yes, we can always allocate. If zone
* z's node is in our tasks mems_allowed, yes. If the task has been
* OOM killed and has access to memory reserves as specified by the
* TIF_MEMDIE flag, yes. Otherwise, no.
* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
* set, yes, we can always allocate. If node is in our task's mems_allowed,
* yes. If the task has been OOM killed and has access to memory reserves as
* specified by the TIF_MEMDIE flag, yes.
* Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
* any node on the zonelist except the first. By the time any such
* calls get to this routine, we should just shut up and say 'yes'.
*
* Unlike the cpuset_zone_allowed_softwall() variant, above,
* this variant requires that the zone be in the current tasks
* Unlike the cpuset_node_allowed_softwall() variant, above,
* this variant requires that the node be in the current task's
* mems_allowed or that we're in interrupt. It does not scan up the
* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
* It never sleeps.
*/
int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
/*

View File

@@ -18,6 +18,7 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/fs_struct.h>
static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
return 0;
}
if (atomic_read(&current->fs->count) != 1) {
struct fs_struct *fsp, *ofsp;
fsp = copy_fs_struct(current->fs);
if (fsp == NULL) {
module_put(ep->module);
return -ENOMEM;
}
task_lock(current);
ofsp = current->fs;
current->fs = fsp;
task_unlock(current);
put_fs_struct(ofsp);
}
/*
* At that point we are guaranteed to be the sole owner of
* current->fs.
*/
current->personality = personality;
oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = ep;

View File

@@ -46,6 +46,7 @@
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <trace/sched.h>
@@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);
static void exit_mm(struct task_struct * tsk);
static inline int task_detached(struct task_struct *p)
{
return p->exit_signal == -1;
}
static void __unhash_process(struct task_struct *p)
{
nr_threads--;
@@ -362,16 +358,12 @@ static void reparent_to_kthreadd(void)
void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
pid_t nr = pid_nr(pid);
if (task_session(curr) != pid) {
if (task_session(curr) != pid)
change_pid(curr, PIDTYPE_SID, pid);
set_task_session(curr, nr);
}
if (task_pgrp(curr) != pid) {
if (task_pgrp(curr) != pid)
change_pid(curr, PIDTYPE_PGID, pid);
set_task_pgrp(curr, nr);
}
}
static void set_special_pids(struct pid *pid)
@@ -429,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);
void daemonize(const char *name, ...)
{
va_list args;
struct fs_struct *fs;
sigset_t blocked;
va_start(args, name);
@@ -462,11 +453,7 @@ void daemonize(const char *name, ...)
/* Become as one with the init task */
exit_fs(current); /* current->fs->count--; */
fs = init_task.fs;
current->fs = fs;
atomic_inc(&fs->count);
daemonize_fs_struct();
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
@@ -565,30 +552,6 @@ void exit_files(struct task_struct *tsk)
}
}
void put_fs_struct(struct fs_struct *fs)
{
/* No need to hold fs->lock if we are killing it */
if (atomic_dec_and_test(&fs->count)) {
path_put(&fs->root);
path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
}
void exit_fs(struct task_struct *tsk)
{
struct fs_struct * fs = tsk->fs;
if (fs) {
task_lock(tsk);
tsk->fs = NULL;
task_unlock(tsk);
put_fs_struct(fs);
}
}
EXPORT_SYMBOL_GPL(exit_fs);
#ifdef CONFIG_MM_OWNER
/*
* Task p is exiting and it owned mm, lets find a new owner for it
@@ -731,119 +694,6 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}
/*
* Return nonzero if @parent's children should reap themselves.
*
* Called with write_lock_irq(&tasklist_lock) held.
*/
static int ignoring_children(struct task_struct *parent)
{
int ret;
struct sighand_struct *psig = parent->sighand;
unsigned long flags;
spin_lock_irqsave(&psig->siglock, flags);
ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
spin_unlock_irqrestore(&psig->siglock, flags);
return ret;
}
/*
* Detach all tasks we were using ptrace on.
* Any that need to be release_task'd are put on the @dead list.
*
* Called with write_lock(&tasklist_lock) held.
*/
static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
{
struct task_struct *p, *n;
int ign = -1;
list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
__ptrace_unlink(p);
if (p->exit_state != EXIT_ZOMBIE)
continue;
/*
* If it's a zombie, our attachedness prevented normal
* parent notification or self-reaping. Do notification
* now if it would have happened earlier. If it should
* reap itself, add it to the @dead list. We can't call
* release_task() here because we already hold tasklist_lock.
*
* If it's our own child, there is no notification to do.
* But if our normal children self-reap, then this child
* was prevented by ptrace and we must reap it now.
*/
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, parent))
do_notify_parent(p, p->exit_signal);
else {
if (ign < 0)
ign = ignoring_children(parent);
if (ign)
p->exit_signal = -1;
}
}
if (task_detached(p)) {
/*
* Mark it as in the process of being reaped.
*/
p->exit_state = EXIT_DEAD;
list_add(&p->ptrace_entry, dead);
}
}
}
/*
* Finish up exit-time ptrace cleanup.
*
* Called without locks.
*/
static void ptrace_exit_finish(struct task_struct *parent,
struct list_head *dead)
{
struct task_struct *p, *n;
BUG_ON(!list_empty(&parent->ptraced));
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
static void reparent_thread(struct task_struct *p, struct task_struct *father)
{
if (p->pdeath_signal)
/* We already hold the tasklist_lock here. */
group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
list_move_tail(&p->sibling, &p->real_parent->children);
/* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (same_thread_group(p->real_parent, father))
return;
/* We don't want people slaying init. */
if (!task_detached(p))
p->exit_signal = SIGCHLD;
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
if (!ptrace_reparented(p) &&
p->exit_state == EXIT_ZOMBIE &&
!task_detached(p) && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
kill_orphaned_pgrp(p, father);
}
/*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
@@ -883,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
return pid_ns->child_reaper;
}
/*
* Any that need to be release_task'd are put on the @dead list.
*/
static void reparent_thread(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
if (p->pdeath_signal)
group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
list_move_tail(&p->sibling, &p->real_parent->children);
if (task_detached(p))
return;
/*
* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (same_thread_group(p->real_parent, father))
return;
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
do_notify_parent(p, p->exit_signal);
if (task_detached(p)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
}
kill_orphaned_pgrp(p, father);
}
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper;
LIST_HEAD(ptrace_dead);
LIST_HEAD(dead_children);
exit_ptrace(father);
write_lock_irq(&tasklist_lock);
reaper = find_new_reaper(father);
/*
* First clean up ptrace if we were using it.
*/
ptrace_exit(father, &ptrace_dead);
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
@@ -901,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)
BUG_ON(p->ptrace);
p->parent = p->real_parent;
}
reparent_thread(p, father);
reparent_thread(father, p, &dead_children);
}
write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&father->children));
ptrace_exit_finish(father, &ptrace_dead);
list_for_each_entry_safe(p, n, &dead_children, sibling) {
list_del_init(&p->sibling);
release_task(p);
}
}
/*
@@ -950,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
*/
if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id) &&
!capable(CAP_KILL))
tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -1037,6 +923,8 @@ NORET_TYPE void do_exit(long code)
schedule();
}
exit_irq_thread();
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
@@ -1417,6 +1305,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
return retval;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
if (task_is_stopped_or_traced(p))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
return &p->signal->group_exit_code;
}
return NULL;
}
/*
* Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1427,7 +1327,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code, why;
int retval, exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
@@ -1437,22 +1337,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
if (unlikely(!task_is_stopped_or_traced(p)))
p_code = task_stopped_code(p, ptrace);
if (unlikely(!p_code))
goto unlock_sig;
if (!ptrace && p->signal->group_stop_count > 0)
/*
* A group stop is in progress and this is the group leader.
* We won't report until all threads have stopped.
*/
goto unlock_sig;
exit_code = p->exit_code;
exit_code = *p_code;
if (!exit_code)
goto unlock_sig;
if (!unlikely(options & WNOWAIT))
p->exit_code = 0;
*p_code = 0;
/* don't need the RCU readlock here as we're holding a spinlock */
uid = __task_cred(p)->uid;
@@ -1608,7 +1502,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
*/
*notask_error = 0;
if (task_is_stopped_or_traced(p))
if (task_stopped_code(p, ptrace))
return wait_task_stopped(ptrace, p, options,
infop, stat_addr, ru);
@@ -1812,7 +1706,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
pid = find_get_pid(-upid);
} else if (upid == 0) {
type = PIDTYPE_PGID;
pid = get_pid(task_pgrp(current));
pid = get_task_pid(current, PIDTYPE_PGID);
} else /* upid > 0 */ {
type = PIDTYPE_PID;
pid = find_get_pid(upid);

View File

@@ -15,11 +15,22 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
@@ -41,31 +52,50 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
__notrace_funcgraph int core_kernel_text(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
return 1;
return 0;
}
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr <= (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
init_kernel_text(addr))
return 1;
return 0;
}
__notrace_funcgraph int __kernel_text_address(unsigned long addr)
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return __module_text_address(addr) != NULL;
if (is_module_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
* backtraces (such as lockdep traces).
*
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
if (init_kernel_text(addr))
return 1;
return 0;
}
int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return module_text_address(addr) != NULL;
return is_module_text_address(addr);
}
/*
@@ -81,5 +111,5 @@ int func_ptr_is_kernel_text(void *ptr)
addr = (unsigned long) dereference_function_descriptor(ptr);
if (core_kernel_text(addr))
return 1;
return module_text_address(addr) != NULL;
return is_module_text_address(addr);
}

View File

@@ -60,6 +60,7 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <trace/sched.h>
#include <linux/magic.h>
@@ -644,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif
tsk->mm = NULL;
tsk->active_mm = NULL;
@@ -681,38 +685,21 @@ fail_nomem:
return retval;
}
static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
/* We don't need to lock fs - think why ;-) */
if (fs) {
atomic_set(&fs->count, 1);
rwlock_init(&fs->lock);
fs->umask = old->umask;
read_lock(&old->lock);
fs->root = old->root;
path_get(&old->root);
fs->pwd = old->pwd;
path_get(&old->pwd);
read_unlock(&old->lock);
}
return fs;
}
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
return __copy_fs_struct(old);
}
EXPORT_SYMBOL_GPL(copy_fs_struct);
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
atomic_inc(&current->fs->count);
/* tsk->fs is already what we want */
write_lock(&fs->lock);
if (fs->in_exec) {
write_unlock(&fs->lock);
return -EAGAIN;
}
fs->users++;
write_unlock(&fs->lock);
return 0;
}
tsk->fs = __copy_fs_struct(current->fs);
tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;
@@ -841,6 +828,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_set(&sig->live, 1);
init_waitqueue_head(&sig->wait_chldexit);
sig->flags = 0;
if (clone_flags & CLONE_NEWPID)
sig->flags |= SIGNAL_UNKILLABLE;
sig->group_exit_code = 0;
sig->group_exit_task = NULL;
sig->group_stop_count = 0;
@@ -1046,11 +1035,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_DETECT_SOFTLOCKUP
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
#endif
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
@@ -1125,7 +1109,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_mm;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
@@ -1263,8 +1247,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->signal->leader_pid = pid;
tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
set_task_pgrp(p, task_pgrp_nr(current));
set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1488,6 +1470,7 @@ void __init proc_caches_init(void)
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
}
@@ -1543,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
struct fs_struct *fs = current->fs;
if ((unshare_flags & CLONE_FS) &&
(fs && atomic_read(&fs->count) > 1)) {
*new_fsp = __copy_fs_struct(current->fs);
if (!*new_fsp)
return -ENOMEM;
}
if (!(unshare_flags & CLONE_FS) || !fs)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
if (fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
if (!*new_fsp)
return -ENOMEM;
return 0;
}
@@ -1664,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
if (new_fs) {
fs = current->fs;
write_lock(&fs->lock);
current->fs = new_fs;
new_fs = fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
write_unlock(&fs->lock);
}
if (new_mm) {
@@ -1704,7 +1696,7 @@ bad_unshare_cleanup_sigh:
bad_unshare_cleanup_fs:
if (new_fs)
put_fs_struct(new_fs);
free_fs_struct(new_fs);
bad_unshare_cleanup_thread:
bad_unshare_out:

View File

@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
struct hrtimer_clock_base *base,
int wakeup)
{
if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
spin_unlock(&base->cpu_base->lock);
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
spin_lock(&base->cpu_base->lock);
if (wakeup) {
spin_unlock(&base->cpu_base->lock);
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
spin_lock(&base->cpu_base->lock);
} else
__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
return 1;
}
return 0;
}
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
struct hrtimer_clock_base *base,
int wakeup)
{
return 0;
}
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
return 0;
}
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
*
* Returns:
* 0 on success
* 1 when the timer was active
*/
int
hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
const enum hrtimer_mode mode)
int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
unsigned long delta_ns, const enum hrtimer_mode mode,
int wakeup)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
* XXX send_remote_softirq() ?
*/
if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
hrtimer_enqueue_reprogram(timer, new_base);
hrtimer_enqueue_reprogram(timer, new_base, wakeup);
unlock_hrtimer_base(timer, &flags);
return ret;
}
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
*
* Returns:
* 0 on success
* 1 when the timer was active
*/
int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
unsigned long delta_ns, const enum hrtimer_mode mode)
{
return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
int
hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
return hrtimer_start_range_ns(timer, tim, 0, mode);
return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start);

217
kernel/hung_task.c Normal file
View File

@@ -0,0 +1,217 @@
/*
* Detect Hung Task
*
* kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
*
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/module.h>
#include <linux/sysctl.h>
/*
* The number of tasks checked:
*/
unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
*
* This value controls the preemptibility of khungtaskd since preemption
* is disabled during the critical section. It also controls the size of
* the RCU grace period. So it needs to be upper-bound.
*/
#define HUNG_TASK_BATCHING 1024
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static struct task_struct *watchdog_task;
/*
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
unsigned int __read_mostly sysctl_hung_task_panic =
CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
static int __init hung_task_panic_setup(char *str)
{
sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("hung_task_panic=", hung_task_panic_setup);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
did_panic = 1;
return NOTIFY_DONE;
}
static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
/*
* Ensure the task is not frozen.
* Also, when a freshly created task is scheduled once, changes
* its state to TASK_UNINTERRUPTIBLE without having ever been
* switched out once, it musn't be checked.
*/
if (unlikely(t->flags & PF_FROZEN || !switch_count))
return;
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
return;
}
if (!sysctl_hung_task_warnings)
return;
sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
printk(KERN_ERR "INFO: task %s:%d blocked for more than "
"%ld seconds.\n", t->comm, t->pid, timeout);
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
__debug_show_held_locks(t);
touch_nmi_watchdog();
if (sysctl_hung_task_panic)
panic("hung_task: blocked tasks");
}
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
*
* For preemptible RCU it is sufficient to call rcu_read_unlock in order
* exit the grace period. For classic RCU, a reschedule is required.
*/
static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
get_task_struct(g);
get_task_struct(t);
rcu_read_unlock();
cond_resched();
rcu_read_lock();
put_task_struct(t);
put_task_struct(g);
}
/*
* Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
int batch_count = HUNG_TASK_BATCHING;
struct task_struct *g, *t;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if (test_taint(TAINT_DIE) || did_panic)
return;
rcu_read_lock();
do_each_thread(g, t) {
if (!--max_count)
goto unlock;
if (!--batch_count) {
batch_count = HUNG_TASK_BATCHING;
rcu_lock_break(g, t);
/* Exit if t or g was unhashed during refresh. */
if (t->state == TASK_DEAD || g->state == TASK_DEAD)
goto unlock;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
} while_each_thread(g, t);
unlock:
rcu_read_unlock();
}
static unsigned long timeout_jiffies(unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
}
/*
* Process updating of timeout sysctl
*/
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
if (ret || !write)
goto out;
wake_up_process(watchdog_task);
out:
return ret;
}
/*
* kthread which checks for tasks stuck in D state
*/
static int watchdog(void *dummy)
{
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
timeout = sysctl_hung_task_timeout_secs;
check_hung_uninterruptible_tasks(timeout);
}
return 0;
}
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
return 0;
}
module_init(hung_task_init);

View File

@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
}
/**
* devm_request_irq - allocate an interrupt line for a managed device
* devm_request_threaded_irq - allocate an interrupt line for a managed device
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
* @thread_fn: function to be called in a threaded interrupt context. NULL
* for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
* If an IRQ allocated with this function needs to be freed
* separately, dev_free_irq() must be used.
*/
int devm_request_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, unsigned long irqflags,
const char *devname, void *dev_id)
int devm_request_threaded_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, irq_handler_t thread_fn,
unsigned long irqflags, const char *devname,
void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
rc = request_irq(irq, handler, irqflags, devname, dev_id);
rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
dev_id);
if (rc) {
devres_free(dr);
return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
return 0;
}
EXPORT_SYMBOL(devm_request_irq);
EXPORT_SYMBOL(devm_request_threaded_irq);
/**
* devm_free_irq - free an interrupt

View File

@@ -17,6 +17,7 @@
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
#include <trace/irq.h>
#include <linux/bootmem.h>
#include "internals.h"
@@ -338,6 +339,18 @@ irqreturn_t no_action(int cpl, void *dev_id)
return IRQ_NONE;
}
static void warn_no_thread(unsigned int irq, struct irqaction *action)
{
if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
return;
printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
"but no thread function available.", irq, action->name);
}
DEFINE_TRACE(irq_handler_entry);
DEFINE_TRACE(irq_handler_exit);
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -356,9 +369,50 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
local_irq_enable_in_hardirq();
do {
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
if (ret == IRQ_HANDLED)
trace_irq_handler_exit(irq, action, ret);
switch (ret) {
case IRQ_WAKE_THREAD:
/*
* Set result to handled so the spurious check
* does not trigger.
*/
ret = IRQ_HANDLED;
/*
* Catch drivers which return WAKE_THREAD but
* did not set up a thread function
*/
if (unlikely(!action->thread_fn)) {
warn_no_thread(irq, action);
break;
}
/*
* Wake up the handler thread for this
* action. In case the thread crashed and was
* killed we just pretend that we handled the
* interrupt. The hardirq handler above has
* disabled the device interrupt, so no irq
* storm is lurking.
*/
if (likely(!test_bit(IRQTF_DIED,
&action->thread_flags))) {
set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
wake_up_process(action->thread);
}
/* Fall through to add to randomness */
case IRQ_HANDLED:
status |= action->flags;
break;
default:
break;
}
retval |= ret;
action = action->next;
} while (action);

View File

@@ -8,16 +8,15 @@
*/
#include <linux/irq.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include "internals.h"
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
cpumask_var_t irq_default_affinity;
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
/* Oops, that failed? */
} while (status & IRQ_INPROGRESS);
/*
* We made sure that no hardirq handler is running. Now verify
* that no threaded handlers are active.
*/
wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
}
EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
/**
* irq_can_set_affinity - Check if the affinity of a given irq can be set
* @irq: Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
return 1;
}
static void
irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
{
struct irqaction *action = desc->action;
while (action) {
if (action->thread)
set_cpus_allowed_ptr(action->thread, cpumask);
action = action->next;
}
}
/**
* irq_set_affinity - Set the irq affinity of a given irq
* @irq: Interrupt to set affinity
@@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
cpumask_copy(desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
#endif
irq_set_thread_affinity(desc, cpumask);
desc->status |= IRQ_AFFINITY_SET;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq)
spin_lock_irqsave(&desc->lock, flags);
ret = setup_affinity(irq, desc);
if (!ret)
irq_set_thread_affinity(desc, desc->affinity);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
@@ -401,6 +424,90 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return ret;
}
static int irq_wait_for_interrupt(struct irqaction *action)
{
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
if (test_and_clear_bit(IRQTF_RUNTHREAD,
&action->thread_flags)) {
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
}
return -1;
}
/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
{
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
int wake;
sched_setscheduler(current, SCHED_FIFO, &param);
current->irqaction = action;
while (!irq_wait_for_interrupt(action)) {
atomic_inc(&desc->threads_active);
spin_lock_irq(&desc->lock);
if (unlikely(desc->status & IRQ_DISABLED)) {
/*
* CHECKME: We might need a dedicated
* IRQ_THREAD_PENDING flag here, which
* retriggers the thread in check_irq_resend()
* but AFAICT IRQ_PENDING should be fine as it
* retriggers the interrupt itself --- tglx
*/
desc->status |= IRQ_PENDING;
spin_unlock_irq(&desc->lock);
} else {
spin_unlock_irq(&desc->lock);
action->thread_fn(action->irq, action->dev_id);
}
wake = atomic_dec_and_test(&desc->threads_active);
if (wake && waitqueue_active(&desc->wait_for_threads))
wake_up(&desc->wait_for_threads);
}
/*
* Clear irqaction. Otherwise exit_irq_thread() would make
* fuzz about an active irq thread going into nirvana.
*/
current->irqaction = NULL;
return 0;
}
/*
* Called from do_exit()
*/
void exit_irq_thread(void)
{
struct task_struct *tsk = current;
if (!tsk->irqaction)
return;
printk(KERN_ERR
"exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
/*
* Set the THREAD DIED flag to prevent further wakeups of the
* soon to be gone threaded handler.
*/
set_bit(IRQTF_DIED, &tsk->irqaction->flags);
}
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -436,6 +543,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
rand_initialize_irq(irq);
}
/*
* Threaded handler ?
*/
if (new->thread_fn) {
struct task_struct *t;
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
new->name);
if (IS_ERR(t))
return PTR_ERR(t);
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
* references an already freed task_struct.
*/
get_task_struct(t);
new->thread = t;
wake_up_process(t);
}
/*
* The following block of code has to be executed atomically
*/
@@ -473,15 +600,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!shared) {
irq_chip_set_defaults(desc->chip);
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc, irq,
new->flags & IRQF_TRIGGER_MASK);
if (ret) {
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
if (ret)
goto out_thread;
} else
compat_irq_chip_set_default_handler(desc);
#if defined(CONFIG_IRQ_PER_CPU)
@@ -549,8 +676,19 @@ mismatch:
dump_stack();
}
#endif
ret = -EBUSY;
out_thread:
spin_unlock_irqrestore(&desc->lock, flags);
return -EBUSY;
if (new->thread) {
struct task_struct *t = new->thread;
new->thread = NULL;
if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
kthread_stop(t);
put_task_struct(t);
}
return ret;
}
/**
@@ -576,6 +714,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action, **action_ptr;
struct task_struct *irqthread;
unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -622,6 +761,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
else
desc->chip->disable(irq);
}
irqthread = action->thread;
action->thread = NULL;
spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -629,6 +772,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Make sure it's not being used on another CPU: */
synchronize_irq(irq);
if (irqthread) {
if (!test_bit(IRQTF_DIED, &action->thread_flags))
kthread_stop(irqthread);
put_task_struct(irqthread);
}
#ifdef CONFIG_DEBUG_SHIRQ
/*
* It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -681,9 +830,12 @@ void free_irq(unsigned int irq, void *dev_id)
EXPORT_SYMBOL(free_irq);
/**
* request_irq - allocate an interrupt line
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
* @handler: Function to be called when the IRQ occurs.
* Primary handler for threaded interrupts
* @thread_fn: Function called from the irq handler thread
* If NULL, no irq thread is created
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
@@ -695,6 +847,15 @@ EXPORT_SYMBOL(free_irq);
* raises, you must take care both to initialise your hardware
* and to set up the interrupt handler in the right order.
*
* If you want to set up a threaded irq handler for your device
* then you need to supply @handler and @thread_fn. @handler ist
* still called in hard interrupt context and has to check
* whether the interrupt originates from the device. If yes it
* needs to disable the interrupt on the device and return
* IRQ_THREAD_WAKE which will wake up the handler thread and run
* @thread_fn. This split handler design is necessary to support
* shared interrupts.
*
* Dev_id must be globally unique. Normally the address of the
* device data structure is used as the cookie. Since the handler
* receives this value it makes sense to use it.
@@ -710,8 +871,9 @@ EXPORT_SYMBOL(free_irq);
* IRQF_TRIGGER_* Specify active edge(s) or level
*
*/
int request_irq(unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
const char *devname, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -759,6 +921,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
return -ENOMEM;
action->handler = handler;
action->thread_fn = thread_fn;
action->flags = irqflags;
action->name = devname;
action->dev_id = dev_id;
@@ -788,4 +951,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
#endif
return retval;
}
EXPORT_SYMBOL(request_irq);
EXPORT_SYMBOL(request_threaded_irq);

View File

@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
return module_kallsyms_lookup_name(name);
}
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf);
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
}
return module_kallsyms_on_each_symbol(fn, data);
}
EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)

View File

@@ -42,7 +42,7 @@
note_buf_t* crash_notes;
/* vmcoreinfo stuff */
unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vm_struct, addr);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
VMCOREINFO_NUMBER(PG_lru);

View File

@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
/**
* request_module - try to load a kernel module
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
* @fmt: printf style format string for the name of the module
* @...: arguments as specified in the format string
*
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
int request_module(const char *fmt, ...)
int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
return -ENOMEM;
}
ret = call_usermodehelper(modprobe_path, argv, envp, 1);
ret = call_usermodehelper(modprobe_path, argv, envp,
wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(request_module);
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
struct subprocess_info {

View File

@@ -43,6 +43,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
@@ -67,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobe_enabled;
static bool kprobes_all_disarmed;
static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -327,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
if (kp->pre_handler && !kprobe_gone(kp)) {
if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
if (kp->pre_handler(kp, regs))
return 1;
@@ -343,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
if (kp->post_handler && !kprobe_gone(kp)) {
if (kp->post_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
kp->post_handler(kp, regs, flags);
reset_kprobe_instance();
@@ -517,20 +518,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
}
/*
* Add the new probe to old_p->list. Fail if this is the
* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
if (p->break_handler) {
if (old_p->break_handler)
if (ap->break_handler)
return -EEXIST;
list_add_tail_rcu(&p->list, &old_p->list);
old_p->break_handler = aggr_break_handler;
list_add_tail_rcu(&p->list, &ap->list);
ap->break_handler = aggr_break_handler;
} else
list_add_rcu(&p->list, &old_p->list);
if (p->post_handler && !old_p->post_handler)
old_p->post_handler = aggr_post_handler;
list_add_rcu(&p->list, &ap->list);
if (p->post_handler && !ap->post_handler)
ap->post_handler = aggr_post_handler;
if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
if (!kprobes_all_disarmed)
/* Arm the breakpoint again. */
arch_arm_kprobe(ap);
}
return 0;
}
@@ -543,6 +552,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
ap->flags = p->flags;
ap->pre_handler = aggr_pre_handler;
ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
@@ -565,44 +575,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
struct kprobe *p)
{
int ret = 0;
struct kprobe *ap;
struct kprobe *ap = old_p;
if (kprobe_gone(old_p)) {
if (old_p->pre_handler != aggr_pre_handler) {
/* If old_p is not an aggr_probe, create new aggr_kprobe. */
ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
if (!ap)
return -ENOMEM;
add_aggr_kprobe(ap, old_p);
}
if (kprobe_gone(ap)) {
/*
* Attempting to insert new probe at the same location that
* had a probe in the module vaddr area which already
* freed. So, the instruction slot has already been
* released. We need a new slot for the new probe.
*/
ret = arch_prepare_kprobe(old_p);
ret = arch_prepare_kprobe(ap);
if (ret)
/*
* Even if fail to allocate new slot, don't need to
* free aggr_probe. It will be used next time, or
* freed by unregister_kprobe.
*/
return ret;
}
if (old_p->pre_handler == aggr_pre_handler) {
copy_kprobe(old_p, p);
ret = add_new_kprobe(old_p, p);
ap = old_p;
} else {
ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
if (!ap) {
if (kprobe_gone(old_p))
arch_remove_kprobe(old_p);
return -ENOMEM;
}
add_aggr_kprobe(ap, old_p);
copy_kprobe(ap, p);
ret = add_new_kprobe(ap, p);
}
if (kprobe_gone(old_p)) {
/*
* If the old_p has gone, its breakpoint has been disarmed.
* We have to arm it again after preparing real kprobes.
* Clear gone flag to prevent allocating new slot again, and
* set disabled flag because it is not armed yet.
*/
ap->flags &= ~KPROBE_FLAG_GONE;
if (kprobe_enabled)
arch_arm_kprobe(ap);
ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
| KPROBE_FLAG_DISABLED;
}
return ret;
copy_kprobe(ap, p);
return add_new_kprobe(ap, p);
}
/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
{
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
if (!kprobe_disabled(kp))
/*
* There is an active probe on the list.
* We can't disable aggr_kprobe.
*/
return 0;
}
p->flags |= KPROBE_FLAG_DISABLED;
return 1;
}
static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -663,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p)
return -EINVAL;
}
p->flags = 0;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
/*
* Check if are we probing a module.
*/
@@ -699,17 +726,20 @@ int __kprobes register_kprobe(struct kprobe *p)
goto out;
}
mutex_lock(&text_mutex);
ret = arch_prepare_kprobe(p);
if (ret)
goto out;
goto out_unlock_text;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
if (kprobe_enabled)
if (!kprobes_all_disarmed && !kprobe_disabled(p))
arch_arm_kprobe(p);
out_unlock_text:
mutex_unlock(&text_mutex);
out:
mutex_unlock(&kprobe_mutex);
@@ -718,6 +748,27 @@ out:
return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
{
struct kprobe *old_p, *list_p;
old_p = get_kprobe(p->addr);
if (unlikely(!old_p))
return NULL;
if (p != old_p) {
list_for_each_entry_rcu(list_p, &old_p->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
return NULL;
}
valid:
return old_p;
}
/*
* Unregister a kprobe without a scheduler synchronization.
@@ -726,18 +777,10 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
{
struct kprobe *old_p, *list_p;
old_p = get_kprobe(p->addr);
if (unlikely(!old_p))
old_p = __get_valid_kprobe(p);
if (old_p == NULL)
return -EINVAL;
if (p != old_p) {
list_for_each_entry_rcu(list_p, &old_p->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid_p;
return -EINVAL;
}
valid_p:
if (old_p == p ||
(old_p->pre_handler == aggr_pre_handler &&
list_is_singular(&old_p->list))) {
@@ -746,8 +789,11 @@ valid_p:
* enabled and not gone - otherwise, the breakpoint would
* already have been removed. We save on flushing icache.
*/
if (kprobe_enabled && !kprobe_gone(old_p))
if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) {
mutex_lock(&text_mutex);
arch_disarm_kprobe(p);
mutex_unlock(&text_mutex);
}
hlist_del_rcu(&old_p->hlist);
} else {
if (p->break_handler && !kprobe_gone(p))
@@ -761,6 +807,11 @@ valid_p:
}
noclean:
list_del_rcu(&p->list);
if (!kprobe_disabled(old_p)) {
try_to_disable_aggr_kprobe(old_p);
if (!kprobes_all_disarmed && kprobe_disabled(old_p))
arch_disarm_kprobe(old_p);
}
}
return 0;
}
@@ -796,11 +847,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
return ret;
}
EXPORT_SYMBOL_GPL(register_kprobes);
void __kprobes unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
void __kprobes unregister_kprobes(struct kprobe **kps, int num)
{
@@ -819,6 +872,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
if (kps[i]->addr)
__unregister_kprobe_bottom(kps[i]);
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
@@ -858,16 +912,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
return ret;
}
EXPORT_SYMBOL_GPL(register_jprobes);
int __kprobes register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(register_jprobe);
void __kprobes unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(unregister_jprobe);
void __kprobes unregister_jprobes(struct jprobe **jps, int num)
{
@@ -887,6 +944,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
__unregister_kprobe_bottom(&jps[i]->kp);
}
}
EXPORT_SYMBOL_GPL(unregister_jprobes);
#ifdef CONFIG_KRETPROBES
/*
@@ -912,10 +970,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
ri->rp = rp;
ri->task = current;
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
spin_unlock_irqrestore(&rp->lock, flags);
if (rp->entry_handler && rp->entry_handler(ri, regs))
return 0;
}
arch_prepare_kretprobe(ri, regs);
@@ -982,6 +1038,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
free_rp_inst(rp);
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
{
@@ -999,11 +1056,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
{
@@ -1025,24 +1084,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
}
}
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int __kprobes register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct pt_regs *regs)
@@ -1056,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
static void __kprobes kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
p->flags |= KPROBE_FLAG_GONE;
if (p->pre_handler == aggr_pre_handler) {
/*
@@ -1168,8 +1234,8 @@ static int __init init_kprobes(void)
}
}
/* By default, kprobes are enabled */
kprobe_enabled = true;
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
err = arch_init_kprobes();
if (!err)
@@ -1197,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
else
kprobe_type = "k";
if (sym)
seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
sym, offset, (modname ? modname : " "),
(kprobe_gone(p) ? "[GONE]" : ""));
seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
p->addr, kprobe_type, sym, offset,
(modname ? modname : " "),
(kprobe_gone(p) ? "[GONE]" : ""),
((kprobe_disabled(p) && !kprobe_gone(p)) ?
"[DISABLED]" : ""));
else
seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
(kprobe_gone(p) ? "[GONE]" : ""));
seq_printf(pi, "%p %s %p %s%s\n",
p->addr, kprobe_type, p->addr,
(kprobe_gone(p) ? "[GONE]" : ""),
((kprobe_disabled(p) && !kprobe_gone(p)) ?
"[DISABLED]" : ""));
}
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1267,7 +1339,72 @@ static struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
static void __kprobes enable_all_kprobes(void)
/* Disable one kprobe */
int __kprobes disable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
mutex_lock(&kprobe_mutex);
/* Check whether specified probe is valid. */
p = __get_valid_kprobe(kp);
if (unlikely(p == NULL)) {
ret = -EINVAL;
goto out;
}
/* If the probe is already disabled (or gone), just return */
if (kprobe_disabled(kp))
goto out;
kp->flags |= KPROBE_FLAG_DISABLED;
if (p != kp)
/* When kp != p, p is always enabled. */
try_to_disable_aggr_kprobe(p);
if (!kprobes_all_disarmed && kprobe_disabled(p))
arch_disarm_kprobe(p);
out:
mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(disable_kprobe);
/* Enable one kprobe */
int __kprobes enable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
mutex_lock(&kprobe_mutex);
/* Check whether specified probe is valid. */
p = __get_valid_kprobe(kp);
if (unlikely(p == NULL)) {
ret = -EINVAL;
goto out;
}
if (kprobe_gone(kp)) {
/* This kprobe has gone, we couldn't enable it. */
ret = -EINVAL;
goto out;
}
if (!kprobes_all_disarmed && kprobe_disabled(p))
arch_arm_kprobe(p);
p->flags &= ~KPROBE_FLAG_DISABLED;
if (p != kp)
kp->flags &= ~KPROBE_FLAG_DISABLED;
out:
mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);
static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -1276,18 +1413,20 @@ static void __kprobes enable_all_kprobes(void)
mutex_lock(&kprobe_mutex);
/* If kprobes are already enabled, just return */
if (kprobe_enabled)
/* If kprobes are armed, just return */
if (!kprobes_all_disarmed)
goto already_enabled;
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
if (!kprobe_gone(p))
if (!kprobe_disabled(p))
arch_arm_kprobe(p);
}
mutex_unlock(&text_mutex);
kprobe_enabled = true;
kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
already_enabled:
@@ -1295,7 +1434,7 @@ already_enabled:
return;
}
static void __kprobes disable_all_kprobes(void)
static void __kprobes disarm_all_kprobes(void)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -1304,20 +1443,22 @@ static void __kprobes disable_all_kprobes(void)
mutex_lock(&kprobe_mutex);
/* If kprobes are already disabled, just return */
if (!kprobe_enabled)
/* If kprobes are already disarmed, just return */
if (kprobes_all_disarmed)
goto already_disabled;
kprobe_enabled = false;
kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
arch_disarm_kprobe(p);
}
}
mutex_unlock(&text_mutex);
mutex_unlock(&kprobe_mutex);
/* Allow all currently running kprobes to complete */
synchronize_sched();
@@ -1338,7 +1479,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
{
char buf[3];
if (kprobe_enabled)
if (!kprobes_all_disarmed)
buf[0] = '1';
else
buf[0] = '0';
@@ -1361,12 +1502,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
case 'y':
case 'Y':
case '1':
enable_all_kprobes();
arm_all_kprobes();
break;
case 'n':
case 'N':
case '0':
disable_all_kprobes();
disarm_all_kprobes();
break;
}
@@ -1409,16 +1550,5 @@ late_initcall(debugfs_kprobe_init);
module_init(init_kprobes);
EXPORT_SYMBOL_GPL(register_kprobe);
EXPORT_SYMBOL_GPL(unregister_kprobe);
EXPORT_SYMBOL_GPL(register_kprobes);
EXPORT_SYMBOL_GPL(unregister_kprobes);
EXPORT_SYMBOL_GPL(register_jprobe);
EXPORT_SYMBOL_GPL(unregister_jprobe);
EXPORT_SYMBOL_GPL(register_jprobes);
EXPORT_SYMBOL_GPL(unregister_jprobes);
/* defined in arch/.../kernel/kprobes.c */
EXPORT_SYMBOL_GPL(jprobe_return);
EXPORT_SYMBOL_GPL(register_kretprobe);
EXPORT_SYMBOL_GPL(unregister_kretprobe);
EXPORT_SYMBOL_GPL(register_kretprobes);
EXPORT_SYMBOL_GPL(unregister_kretprobes);

View File

@@ -42,6 +42,7 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
#include <trace/lockdep.h>
#include <asm/sections.h>
@@ -433,13 +434,6 @@ atomic_t nr_find_usage_forwards_checks;
atomic_t nr_find_usage_forwards_recursions;
atomic_t nr_find_usage_backwards_checks;
atomic_t nr_find_usage_backwards_recursions;
# define debug_atomic_inc(ptr) atomic_inc(ptr)
# define debug_atomic_dec(ptr) atomic_dec(ptr)
# define debug_atomic_read(ptr) atomic_read(ptr)
#else
# define debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_dec(ptr) do { } while (0)
# define debug_atomic_read(ptr) 0
#endif
/*
@@ -799,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
return NULL;
}
class = lock_classes + nr_lock_classes++;
@@ -862,6 +857,7 @@ static struct lock_list *alloc_list_entry(void)
printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
return NULL;
}
return list_entries + nr_list_entries++;
@@ -1688,6 +1684,7 @@ cache_hit:
printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
return 0;
}
chain = lock_chains + nr_lock_chains++;
@@ -1900,9 +1897,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
print_lock_name(other);
printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
@@ -2015,7 +2012,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
enum lock_usage_bit bit, const char *name);
static int
mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
int read = new_bit & 1;
@@ -2043,7 +2041,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
* states.
*/
if ((!read || !dir || STRICT_READ_CHECKS) &&
!usage(curr, this, excl_bit, state_name(new_bit)))
!usage(curr, this, excl_bit, state_name(new_bit & ~1)))
return 0;
/*
@@ -2546,6 +2544,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_locks_off();
printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
return 0;
}
@@ -2642,6 +2641,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_locks_off();
printk("BUG: MAX_LOCK_DEPTH too low!\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
return 0;
}
@@ -2929,6 +2929,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lock_set_class);
DEFINE_TRACE(lock_acquire);
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -2939,6 +2941,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
if (unlikely(current->lockdep_recursion))
return;
@@ -2953,11 +2957,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
EXPORT_SYMBOL_GPL(lock_acquire);
DEFINE_TRACE(lock_release);
void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip)
{
unsigned long flags;
trace_lock_release(lock, nested, ip);
if (unlikely(current->lockdep_recursion))
return;
@@ -3106,10 +3114,14 @@ found_it:
lock->ip = ip;
}
DEFINE_TRACE(lock_contended);
void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
trace_lock_contended(lock, ip);
if (unlikely(!lock_stat))
return;
@@ -3125,10 +3137,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_contended);
DEFINE_TRACE(lock_acquired);
void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
trace_lock_acquired(lock, ip);
if (unlikely(!lock_stat))
return;

View File

@@ -68,7 +68,8 @@
/* List of modules, protected by module_mutex or preempt_disable
* (delete uses stop_machine/add uses RCU list operations). */
static DEFINE_MUTEX(module_mutex);
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
/* Waiting for a module to finish initializing? */
@@ -76,7 +77,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
/* Bounds of module allocation, for speeding __module_text_address */
/* Bounds of module allocation, for speeding __module_address */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
int register_module_notifier(struct notifier_block * nb)
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
struct symsearch {
const struct kernel_symbol *start, *stop;
const unsigned long *crcs;
enum {
NOT_GPL_ONLY,
GPL_ONLY,
WILL_BE_GPL_ONLY,
} licence;
bool unused;
};
static bool each_symbol_in_section(const struct symsearch *arr,
unsigned int arrsize,
struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
static bool each_symbol(bool (*fn)(const struct symsearch *arr,
struct module *owner,
unsigned int symnum, void *data),
void *data)
bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
unsigned int symnum, void *data), void *data)
{
struct module *mod;
const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
}
return false;
}
EXPORT_SYMBOL_GPL(each_symbol);
struct find_symbol_arg {
/* Input */
@@ -283,7 +272,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
const unsigned long *crc;
unsigned long value;
const struct kernel_symbol *sym;
};
static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +313,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->value = syms->start[symnum].value;
fsa->sym = &syms->start[symnum];
return true;
}
/* Find a symbol, return value, (optional) crc and (optional) module
* which owns it */
static unsigned long find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
bool gplok,
bool warn)
/* Find a symbol and return it, along with, (optional) crc and
* (optional) module which owns it */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
bool gplok,
bool warn)
{
struct find_symbol_arg fsa;
@@ -347,15 +336,16 @@ static unsigned long find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
return fsa.value;
return fsa.sym;
}
DEBUGP("Failed to find symbol %s\n", name);
return -ENOENT;
return NULL;
}
EXPORT_SYMBOL_GPL(find_symbol);
/* Search for module by name: must hold module_mutex. */
static struct module *find_module(const char *name)
struct module *find_module(const char *name)
{
struct module *mod;
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
}
return NULL;
}
EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
}
/* Module a uses b */
static int use_module(struct module *a, struct module *b)
int use_module(struct module *a, struct module *b)
{
struct module_use *use;
int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
return 1;
}
EXPORT_SYMBOL_GPL(use_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -894,7 +886,7 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
if (!find_symbol(symbol, &owner, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
@@ -908,8 +900,10 @@ void symbol_put_addr(void *addr)
if (core_kernel_text((unsigned long)addr))
return;
if (!(modaddr = module_text_address((unsigned long)addr)))
BUG();
/* module_text_address is safe here: we're supposed to have reference
* to module from symbol_get, so it can't go away. */
modaddr = __module_text_address((unsigned long)addr);
BUG_ON(!modaddr);
module_put(modaddr);
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -949,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
{
}
static inline int use_module(struct module *a, struct module *b)
int use_module(struct module *a, struct module *b)
{
return strong_try_module_get(b) == 0;
}
EXPORT_SYMBOL_GPL(use_module);
static inline void module_unload_init(struct module *mod)
{
@@ -995,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
static const char vermagic[] = VERMAGIC_STRING;
static int try_to_force_load(struct module *mod, const char *symname)
static int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
printk("%s: no version for \"%s\" found: kernel tainted.\n",
mod->name, symname);
printk(KERN_WARNING "%s: %s: kernel tainted.\n",
mod->name, reason);
add_taint_module(mod, TAINT_FORCED_MODULE);
return 0;
#else
@@ -1057,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
if (!find_symbol("module_layout", NULL, &crc, true, false))
BUG();
return check_version(sechdrs, versindex, "struct_module", mod, crc);
return check_version(sechdrs, versindex, "module_layout", mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1098,25 +1093,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
/* Resolve a symbol for this module. I.e. if we find one, record usage.
Must be holding module_mutex. */
static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *name,
struct module *mod)
static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *name,
struct module *mod)
{
struct module *owner;
unsigned long ret;
const struct kernel_symbol *sym;
const unsigned long *crc;
ret = find_symbol(name, &owner, &crc,
sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
if (!IS_ERR_VALUE(ret)) {
/* use_module can fail due to OOM,
or module initialization or unloading */
/* use_module can fail due to OOM,
or module initialization or unloading */
if (sym) {
if (!check_version(sechdrs, versindex, name, mod, crc) ||
!use_module(mod, owner))
ret = -EINVAL;
sym = NULL;
}
return ret;
return sym;
}
/*
@@ -1491,6 +1486,9 @@ static void free_module(struct module *mod)
/* Module unload stuff */
module_unload_free(mod);
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
/* release any pointers to mcount in this module */
ftrace_release(mod->module_core, mod->core_size);
@@ -1513,17 +1511,15 @@ static void free_module(struct module *mod)
void *__symbol_get(const char *symbol)
{
struct module *owner;
unsigned long value;
const struct kernel_symbol *sym;
preempt_disable();
value = find_symbol(symbol, &owner, NULL, true, true);
if (IS_ERR_VALUE(value))
value = 0;
else if (strong_try_module_get(owner))
value = 0;
sym = find_symbol(symbol, &owner, NULL, true, true);
if (sym && strong_try_module_get(owner))
sym = NULL;
preempt_enable();
return (void *)value;
return sym ? (void *)sym->value : NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -1551,8 +1547,7 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
NULL, true, false))) {
if (find_symbol(s->name, &owner, NULL, true, false)) {
printk(KERN_ERR
"%s: exports duplicate symbol %s"
" (owned by %s)\n",
@@ -1576,6 +1571,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
unsigned long secbase;
unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
int ret = 0;
const struct kernel_symbol *ksym;
for (i = 1; i < n; i++) {
switch (sym[i].st_shndx) {
@@ -1595,13 +1591,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
case SHN_UNDEF:
sym[i].st_value
= resolve_symbol(sechdrs, versindex,
strtab + sym[i].st_name, mod);
ksym = resolve_symbol(sechdrs, versindex,
strtab + sym[i].st_name, mod);
/* Ok if resolved. */
if (!IS_ERR_VALUE(sym[i].st_value))
if (ksym) {
sym[i].st_value = ksym->value;
break;
}
/* Ok if weak. */
if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
@@ -1676,8 +1673,7 @@ static void layout_sections(struct module *mod,
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
|| strncmp(secstrings + s->sh_name,
".init", 5) == 0)
|| strstarts(secstrings + s->sh_name, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1694,8 +1690,7 @@ static void layout_sections(struct module *mod,
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
|| strncmp(secstrings + s->sh_name,
".init", 5) != 0)
|| !strstarts(secstrings + s->sh_name, ".init"))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
@@ -1828,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
else
return 'b';
}
if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
".debug", strlen(".debug")) == 0)
if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
return 'n';
return '?';
}
@@ -1898,8 +1892,7 @@ static noinline struct module *load_module(void __user *umod,
unsigned int symindex = 0;
unsigned int strindex = 0;
unsigned int modindex, versindex, infoindex, pcpuindex;
unsigned int num_kp, num_mcount;
struct kernel_param *kp;
unsigned int num_mcount;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1916,12 +1909,6 @@ static noinline struct module *load_module(void __user *umod,
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return ERR_PTR(-ENOMEM);
/* Create stop_machine threads since the error path relies on
* a non-failing stop_machine call. */
err = stop_machine_create();
if (err)
goto free_hdr;
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
@@ -1962,7 +1949,7 @@ static noinline struct module *load_module(void __user *umod,
}
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
}
@@ -2006,7 +1993,7 @@ static noinline struct module *load_module(void __user *umod,
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "magic");
err = try_to_force_load(mod, "bad vermagic");
if (err)
goto free_hdr;
} else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2144,8 +2131,8 @@ static noinline struct module *load_module(void __user *umod,
/* Now we've got everything in the final locations, we can
* find optional sections. */
kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
&num_kp);
mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
sizeof(*mod->kp), &mod->num_kp);
mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
sizeof(*mod->syms), &mod->num_syms);
mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2195,8 +2182,8 @@ static noinline struct module *load_module(void __user *umod,
|| (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
err = try_to_force_load(mod, "nocrc");
err = try_to_force_load(mod,
"no versions for exported symbols");
if (err)
goto cleanup;
}
@@ -2291,11 +2278,11 @@ static noinline struct module *load_module(void __user *umod,
*/
list_add_rcu(&mod->list, &modules);
err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
if (err < 0)
goto unlink;
err = mod_sysfs_setup(mod, kp, num_kp);
err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
if (err < 0)
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2304,12 +2291,13 @@ static noinline struct module *load_module(void __user *umod,
/* Get rid of temporary copy */
vfree(hdr);
stop_machine_destroy();
/* Done! */
return mod;
unlink:
stop_machine(__unlink_module, mod, NULL);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
synchronize_sched();
module_arch_cleanup(mod);
cleanup:
kobject_del(&mod->mkobj.kobj);
@@ -2317,8 +2305,8 @@ static noinline struct module *load_module(void __user *umod,
ftrace_release(mod->module_core, mod->core_size);
free_unload:
module_unload_free(mod);
free_init:
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
free_init:
percpu_modfree(mod->refptr);
#endif
module_free(mod, mod->module_init);
@@ -2332,7 +2320,6 @@ static noinline struct module *load_module(void __user *umod,
kfree(args);
free_hdr:
vfree(hdr);
stop_machine_destroy();
return ERR_PTR(err);
truncated:
@@ -2609,6 +2596,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
preempt_enable();
return ret;
}
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
{
struct module *mod;
unsigned int i;
int ret;
list_for_each_entry(mod, &modules, list) {
for (i = 0; i < mod->num_symtab; i++) {
ret = fn(data, mod->strtab + mod->symtab[i].st_name,
mod, mod->symtab[i].st_value);
if (ret != 0)
return ret;
}
}
return 0;
}
#endif /* CONFIG_KALLSYMS */
static char *module_flags(struct module *mod, char *buf)
@@ -2744,29 +2750,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
}
/*
* Is this a valid module address?
* is_module_address - is this address inside a module?
* @addr: the address to check.
*
* See is_module_text_address() if you simply want to see if the address
* is code (not data).
*/
int is_module_address(unsigned long addr)
bool is_module_address(unsigned long addr)
{
struct module *mod;
bool ret;
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
if (within_module_core(addr, mod)) {
preempt_enable();
return 1;
}
}
ret = __module_address(addr) != NULL;
preempt_enable();
return 0;
return ret;
}
/* Is this a valid kernel address? */
__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
/*
* __module_address - get the module which contains an address.
* @addr: the address.
*
* Must be called with preempt disabled or module mutex held so that
* module doesn't get freed during this.
*/
struct module *__module_address(unsigned long addr)
{
struct module *mod;
@@ -2774,22 +2782,51 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
return NULL;
list_for_each_entry_rcu(mod, &modules, list)
if (within(addr, mod->module_init, mod->init_text_size)
|| within(addr, mod->module_core, mod->core_text_size))
if (within_module_core(addr, mod)
|| within_module_init(addr, mod))
return mod;
return NULL;
}
EXPORT_SYMBOL_GPL(__module_address);
struct module *module_text_address(unsigned long addr)
/*
* is_module_text_address - is this address inside module code?
* @addr: the address to check.
*
* See is_module_address() if you simply want to see if the address is
* anywhere in a module. See kernel_text_address() for testing if an
* address corresponds to kernel or module code.
*/
bool is_module_text_address(unsigned long addr)
{
struct module *mod;
bool ret;
preempt_disable();
mod = __module_text_address(addr);
ret = __module_text_address(addr) != NULL;
preempt_enable();
return ret;
}
/*
* __module_text_address - get the module whose code contains an address.
* @addr: the address.
*
* Must be called with preempt disabled or module mutex held so that
* module doesn't get freed during this.
*/
struct module *__module_text_address(unsigned long addr)
{
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
if (!within(addr, mod->module_init, mod->init_text_size)
&& !within(addr, mod->module_core, mod->core_text_size))
mod = NULL;
}
return mod;
}
EXPORT_SYMBOL_GPL(__module_text_address);
/* Don't grab lock, we're oopsing. */
void print_modules(void)
@@ -2809,9 +2846,17 @@ void print_modules(void)
}
#ifdef CONFIG_MODVERSIONS
/* Generate the signature for struct module here, too, for modversions. */
void struct_module(struct module *mod) { return; }
EXPORT_SYMBOL(struct_module);
/* Generate the signature for all relevant module structures here.
* If these change, we don't want to try to parse the module. */
void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
struct marker *marker,
struct tracepoint *tp)
{
}
EXPORT_SYMBOL(module_layout);
#endif
#ifdef CONFIG_MARKERS

View File

@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
/*
* Rules:
* 1. you can only enter a cgroup which is a child of your current
* 1. you can only enter a cgroup which is a descendant of your current
* cgroup
* 2. you can only place another process into a cgroup if
* a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
static int ns_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup, struct task_struct *task)
{
struct cgroup *orig;
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!cgroup_is_descendant(new_cgroup))
if (!cgroup_is_descendant(new_cgroup, current))
return -EPERM;
}
if (atomic_read(&new_cgroup->count) != 0)
return -EPERM;
orig = task_cgroup(task, ns_subsys_id);
if (orig && orig != new_cgroup->parent)
if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;
return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (!cgroup_is_descendant(cgroup))
if (!cgroup_is_descendant(cgroup, current))
return ERR_PTR(-EPERM);
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);

View File

@@ -8,19 +8,19 @@
* This function is used through-out the kernel (including mm and fs)
* to indicate a major problem.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
#include <linux/kexec.h>
#include <linux/debug_locks.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/dmi.h>
int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
*
* This function never returns.
*/
NORET_TYPE void panic(const char * fmt, ...)
{
long i;
static char buf[1024];
va_list args;
#if defined(CONFIG_S390)
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif
long i;
/*
* It's possible to come here directly from a panic-assertion and not
* have preempt disabled. Some functions called from here want
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*/
preempt_disable();
@@ -77,7 +73,6 @@ NORET_TYPE void panic(const char * fmt, ...)
#ifdef CONFIG_DEBUG_BUGVERBOSE
dump_stack();
#endif
bust_spinlocks(0);
/*
* If we have crashed and we have a crash kernel loaded let it handle
@@ -86,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
crash_kexec(NULL);
#ifdef CONFIG_SMP
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a panic
* situation.
*/
smp_send_stop();
#endif
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
@@ -102,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked..
*/
printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
for (i = 0; i < panic_timeout*1000; ) {
touch_nmi_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
/* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
emergency_restart();
}
@@ -127,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
}
#endif
#if defined(CONFIG_S390)
disabled_wait(caller);
{
unsigned long caller;
caller = (unsigned long)__builtin_return_address(0);
disabled_wait(caller);
}
#endif
local_irq_enable();
for (i = 0;;) {
for (i = 0; ; ) {
touch_softlockup_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
bust_spinlocks(0);
}
EXPORT_SYMBOL(panic);
struct tnt {
u8 bit;
char true;
char false;
u8 bit;
char true;
char false;
};
static const struct tnt tnts[] = {
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
{ TAINT_UNSAFE_SMP, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
{ TAINT_USER, 'U', ' ' },
{ TAINT_DIE, 'D', ' ' },
{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
{ TAINT_UNSAFE_SMP, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
{ TAINT_USER, 'U', ' ' },
{ TAINT_DIE, 'D', ' ' },
{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
};
/**
@@ -195,7 +196,8 @@ const char *print_tainted(void)
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");
return(buf);
return buf;
}
int test_taint(unsigned flag)
@@ -211,7 +213,8 @@ unsigned long get_taint(void)
void add_taint(unsigned flag)
{
debug_locks = 0; /* can't trust the integrity of the kernel anymore */
/* can't trust the integrity of the kernel anymore: */
debug_locks = 0;
set_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(add_taint);
@@ -266,8 +269,8 @@ static void do_oops_enter_exit(void)
}
/*
* Return true if the calling CPU is allowed to print oops-related info. This
* is a bit racy..
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
int oops_may_print(void)
{
@@ -276,20 +279,22 @@ int oops_may_print(void)
/*
* Called when the architecture enters its oops handler, before it prints
* anything. If this is the first CPU to oops, and it's oopsing the first time
* then let it proceed.
* anything. If this is the first CPU to oops, and it's oopsing the first
* time then let it proceed.
*
* This is all enabled by the pause_on_oops kernel boot option. We do all this
* to ensure that oopses don't scroll off the screen. It has the side-effect
* of preventing later-oopsing CPUs from mucking up the display, too.
* This is all enabled by the pause_on_oops kernel boot option. We do all
* this to ensure that oopses don't scroll off the screen. It has the
* side-effect of preventing later-oopsing CPUs from mucking up the display,
* too.
*
* It turns out that the CPU which is allowed to print ends up pausing for the
* right duration, whereas all the other CPUs pause for twice as long: once in
* oops_enter(), once in oops_exit().
* It turns out that the CPU which is allowed to print ends up pausing for
* the right duration, whereas all the other CPUs pause for twice as long:
* once in oops_enter(), once in oops_exit().
*/
void oops_enter(void)
{
debug_locks_off(); /* can't trust the integrity of the kernel anymore */
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
}

View File

@@ -24,6 +24,9 @@
#include <linux/err.h>
#include <linux/slab.h>
/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
#define KPARAM_KMALLOCED 0x80000000
#if 0
#define DEBUGP printk
#else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
return -ENOSPC;
}
*(char **)kp->arg = (char *)val;
if (kp->perm & KPARAM_KMALLOCED)
kfree(*(char **)kp->arg);
/* This is a hack. We can't need to strdup in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
kp->perm |= KPARAM_KMALLOCED;
*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
if (!kp->arg)
return -ENOMEM;
} else
*(const char **)kp->arg = val;
return 0;
}
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
}
#endif
void destroy_params(const struct kernel_param *params, unsigned num)
{
unsigned int i;
for (i = 0; i < num; i++)
if (params[i].perm & KPARAM_KMALLOCED)
kfree(*(char **)params[i].arg);
}
static void __init kernel_add_sysfs_param(const char *name,
struct kernel_param *kparam,
unsigned int name_skip)

View File

@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
if (type != PIDTYPE_PID)
task = task->group_leader;
pid = get_pid(task->pids[type].pid);
rcu_read_unlock();
return pid;
@@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
}
EXPORT_SYMBOL_GPL(pid_vnr);
pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
struct pid_namespace *ns)
{
return pid_nr_ns(task_pid(tsk), ns);
pid_t nr = 0;
rcu_read_lock();
if (!ns)
ns = current->nsproxy->pid_ns;
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
nr = pid_nr_ns(task->pids[type].pid, ns);
}
rcu_read_unlock();
return nr;
}
EXPORT_SYMBOL(task_pid_nr_ns);
EXPORT_SYMBOL(__task_pid_nr_ns);
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
@@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
}
EXPORT_SYMBOL(task_tgid_nr_ns);
pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_pgrp(tsk), ns);
}
EXPORT_SYMBOL(task_pgrp_nr_ns);
pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_session(tsk), ns);
}
EXPORT_SYMBOL(task_session_nr_ns);
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));

View File

@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
int rc;
struct task_struct *task;
/*
* The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
read_lock(&tasklist_lock);
nr = next_pidmap(pid_ns, 1);
while (nr > 0) {
kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
rcu_read_lock();
/*
* Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
* any nested-container's init processes don't ignore the
* signal
*/
task = pid_task(find_vpid(nr), PIDTYPE_PID);
if (task)
force_sig(SIGKILL, task);
rcu_read_unlock();
nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);

View File

@@ -289,7 +289,7 @@ static int create_image(int platform_mode)
* hibernation_snapshot - quiesce devices and create the hibernation
* snapshot image.
* @platform_mode - if set, use the platform driver, if available, to
* prepare the platform frimware for the power transition.
* prepare the platform firmware for the power transition.
*
* Must be called with pm_mutex held
*/
@@ -412,7 +412,7 @@ static int resume_target_kernel(bool platform_mode)
* hibernation_restore - quiesce devices and restore the hibernation
* snapshot image. If successful, control returns in hibernation_snaphot()
* @platform_mode - if set, use the platform driver, if available, to
* prepare the platform frimware for the transition.
* prepare the platform firmware for the transition.
*
* Must be called with pm_mutex held
*/

View File

@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <asm/uaccess.h>
@@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
static int log_buf_len = __LOG_BUF_LEN;
static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
#ifdef CONFIG_KEXEC
/*
* This appends the listed symbols to /proc/vmcoreinfo
*
* /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
* obtain access to symbols that are otherwise very difficult to locate. These
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
*/
void log_buf_kexec_setup(void)
{
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_end);
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(logged_chars);
}
#endif
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
@@ -1292,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
if (*caller_jiffies == 0
|| !time_in_range(jiffies, *caller_jiffies,
*caller_jiffies
+ msecs_to_jiffies(interval_msecs))) {
*caller_jiffies = jiffies;
return true;
}
return false;

View File

@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
{
spin_lock(&child->sighand->siglock);
if (task_is_traced(child)) {
if (child->signal->flags & SIGNAL_STOP_STOPPED) {
/*
* If the group stop is completed or in progress,
* this thread was already counted as stopped.
*/
if (child->signal->flags & SIGNAL_STOP_STOPPED ||
child->signal->group_stop_count)
__set_task_state(child, TASK_STOPPED);
} else {
else
signal_wake_up(child, 1);
}
}
spin_unlock(&child->sighand->siglock);
}
@@ -235,18 +239,58 @@ out:
return retval;
}
static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
/*
* Called with irqs disabled, returns true if childs should reap themselves.
*/
static int ignoring_children(struct sighand_struct *sigh)
{
child->exit_code = data;
/* .. re-parent .. */
__ptrace_unlink(child);
/* .. and wake it up. */
if (child->exit_state != EXIT_ZOMBIE)
wake_up_process(child);
int ret;
spin_lock(&sigh->siglock);
ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
(sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
spin_unlock(&sigh->siglock);
return ret;
}
/*
* Called with tasklist_lock held for writing.
* Unlink a traced task, and clean it up if it was a traced zombie.
* Return true if it needs to be reaped with release_task().
* (We can't call release_task() here because we already hold tasklist_lock.)
*
* If it's a zombie, our attachedness prevented normal parent notification
* or self-reaping. Do notification now if it would have happened earlier.
* If it should reap itself, return true.
*
* If it's our own child, there is no notification to do.
* But if our normal children self-reap, then this child
* was prevented by ptrace and we must reap it now.
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
__ptrace_unlink(p);
if (p->exit_state == EXIT_ZOMBIE) {
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
do_notify_parent(p, p->exit_signal);
else if (ignoring_children(tracer->sighand))
p->exit_signal = -1;
}
if (task_detached(p)) {
/* Mark it as in the process of being reaped. */
p->exit_state = EXIT_DEAD;
return true;
}
}
return false;
}
int ptrace_detach(struct task_struct *child, unsigned int data)
{
bool dead = false;
if (!valid_signal(data))
return -EIO;
@@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
/* protect against de_thread()->release_task() */
if (child->ptrace)
__ptrace_detach(child, data);
/*
* This child can be already killed. Make sure de_thread() or
* our sub-thread doing do_wait() didn't do release_task() yet.
*/
if (child->ptrace) {
child->exit_code = data;
dead = __ptrace_detach(current, child);
}
write_unlock_irq(&tasklist_lock);
if (unlikely(dead))
release_task(child);
return 0;
}
/*
* Detach all tasks we were using ptrace on.
*/
void exit_ptrace(struct task_struct *tracer)
{
struct task_struct *p, *n;
LIST_HEAD(ptrace_dead);
write_lock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, &ptrace_dead);
}
write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&tracer->ptraced));
list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
@@ -612,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
if (ret < 0)
goto out_put_task_struct;
out_put_task_struct:
put_task_struct(child);

View File

@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_BITS_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cpumask = CPU_BITS_NONE,
};
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
static DEFINE_PER_CPU(struct rcu_data, rcu_data);
static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
/*
* Increment the quiescent state counter.
* The counter is a bit degenerated: We do not need to know
* how many quiescent states passed, just if there was at least
* one since the start of the grace period. Thus just a flag.
*/
void rcu_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
rdp->passed_quiesc = 1;
}
void rcu_bh_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc = 1;
}
static int blimit = 10;
static int qhimark = 10000;

View File

@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
}
}
static inline void wait_migrated_callbacks(void);
/*
* Orchestrate the specified type of RCU barrier, waiting for all
* RCU callbacks of the specified type to complete.
@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
wait_migrated_callbacks();
}
/**
@@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
static struct rcu_head rcu_migrate_head[3];
static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
static void rcu_migrate_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_migrate_type_count))
wake_up(&rcu_migrate_wq);
}
static inline void wait_migrated_callbacks(void)
{
wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
}
static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
unsigned long action, void *hcpu)
{
if (action == CPU_DYING) {
/*
* preempt_disable() in on_each_cpu() prevents stop_machine(),
* so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
* returns, all online cpus have queued rcu_barrier_func(),
* and the dead cpu(if it exist) queues rcu_migrate_callback()s.
*
* These callbacks ensure _rcu_barrier() waits for all
* RCU callbacks of the specified type to complete.
*/
atomic_set(&rcu_migrate_type_count, 3);
call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
} else if (action == CPU_POST_DEAD) {
/* rcu_migrate_head is protected by cpu_add_remove_lock */
wait_migrated_callbacks();
}
return NOTIFY_OK;
}
void __init rcu_init(void)
{
__rcu_init();
hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
}
void rcu_scheduler_starting(void)

View File

@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
};
struct rcu_dyntick_sched {
int dynticks;
int dynticks_snap;
int sched_qs;
int sched_qs_snap;
int sched_dynticks_snap;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
.dynticks = 1,
};
void rcu_qsctr_inc(int cpu)
{
struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
rdssp->sched_qs++;
}
#ifdef CONFIG_NO_HZ
void rcu_enter_nohz(void)
{
static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
__get_cpu_var(rcu_dyntick_sched).dynticks++;
WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
}
void rcu_exit_nohz(void)
{
static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
__get_cpu_var(rcu_dyntick_sched).dynticks++;
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
&rs);
}
#endif /* CONFIG_NO_HZ */
static DEFINE_PER_CPU(struct rcu_data, rcu_data);
static struct rcu_ctrlblk rcu_ctrlblk = {
.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
.completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
}
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
.dynticks = 1,
};
#ifdef CONFIG_NO_HZ
static DEFINE_PER_CPU(int, rcu_update_flag);

View File

@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
/*
* Increment the quiescent state counter.
* The counter is a bit degenerated: We do not need to know
* how many quiescent states passed, just if there was at least
* one since the start of the grace period. Thus just a flag.
*/
void rcu_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
}
void rcu_bh_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
}
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,

10
kernel/rcutree.h Normal file
View File

@@ -0,0 +1,10 @@
/*
* RCU implementation internal declarations:
*/
extern struct rcu_state rcu_state;
DECLARE_PER_CPU(struct rcu_data, rcu_data);
extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);

View File

@@ -43,6 +43,8 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include "rcutree.h"
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)

View File

@@ -677,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
*/
for_each_online_cpu(i) {
if (unlikely(!chan->buf[i])) {
printk(KERN_ERR "relay_late_setup_files: CPU %u "
"has no buffer, it must have!\n", i);
BUG();
WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
@@ -797,13 +795,15 @@ void relay_subbufs_consumed(struct rchan *chan,
if (!chan)
return;
if (cpu >= NR_CPUS || !chan->buf[cpu])
if (cpu >= NR_CPUS || !chan->buf[cpu] ||
subbufs_consumed > chan->n_subbufs)
return;
buf = chan->buf[cpu];
buf->subbufs_consumed += subbufs_consumed;
if (buf->subbufs_consumed > buf->subbufs_produced)
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
buf->subbufs_consumed += subbufs_consumed;
}
EXPORT_SYMBOL_GPL(relay_subbufs_consumed);

View File

@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
unsigned long delta;
ktime_t soft, hard;
if (hrtimer_active(&rt_b->rt_period_timer))
break;
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
hrtimer_start_expires(&rt_b->rt_period_timer,
HRTIMER_MODE_ABS);
soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
hard = hrtimer_get_expires(&rt_b->rt_period_timer);
delta = ktime_to_ns(ktime_sub(hard, soft));
__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
HRTIMER_MODE_ABS, 0);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -1110,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
if (rq == this_rq()) {
hrtimer_restart(timer);
} else if (!rq->hrtick_csd_pending) {
__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
}
}
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL, 0);
}
static inline void init_hrtick(void)
@@ -3818,19 +3826,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
*/
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance, struct cpumask *cpus)
int *balance)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
@@ -3985,8 +3997,7 @@ out:
* this_rq is locked.
*/
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
struct cpumask *cpus)
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
struct sched_group *group;
struct rq *busiest = NULL;
@@ -3994,6 +4005,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
int ld_moved = 0;
int sd_idle = 0;
int all_pinned = 0;
struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
@@ -4134,10 +4146,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
cpumask_var_t tmpmask;
if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -4148,7 +4156,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu, this_rq,
sd, tmpmask);
sd);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
@@ -4163,7 +4171,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
*/
this_rq->next_balance = next_balance;
}
free_cpumask_var(tmpmask);
}
/*
@@ -4313,11 +4320,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
cpumask_var_t tmp;
/* Fails alloc? Rebalancing probably not a priority right now. */
if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
return;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -4342,7 +4344,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -4376,8 +4378,6 @@ out:
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
free_cpumask_var(tmp);
}
/*
@@ -4781,10 +4781,7 @@ void scheduler_tick(void)
#endif
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
static inline unsigned long get_parent_ip(unsigned long addr)
unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
addr = CALLER_ADDR2;
@@ -4794,6 +4791,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
return addr;
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -7728,7 +7728,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
{
int group;
cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
@@ -7757,7 +7757,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#else
group = cpu;
@@ -8100,7 +8100,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd),
&per_cpu(cpu_sibling_map, i), cpu_map);
topology_thread_cpumask(i), cpu_map);
sd->parent = p;
p->child = sd;
cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -8111,7 +8111,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
/* Set up CPU (sibling) groups */
for_each_cpu(i, cpu_map) {
cpumask_and(this_sibling_map,
&per_cpu(cpu_sibling_map, i), cpu_map);
topology_thread_cpumask(i), cpu_map);
if (i != cpumask_first(this_sibling_map))
continue;
@@ -8786,6 +8786,9 @@ void __init sched_init(void)
#endif
#ifdef CONFIG_USER_SCHED
alloc_size *= 2;
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
alloc_size += num_possible_cpus() * cpumask_size();
#endif
/*
* As sched_init() is called before page_alloc is setup,
@@ -8824,6 +8827,12 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
per_cpu(load_balance_tmpmask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
}
#ifdef CONFIG_SMP

View File

@@ -25,6 +25,7 @@
* consistent between cpus (never more than 2 jiffies difference).
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
@@ -154,6 +155,17 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
scd = cpu_sdc(cpu);
/*
* Normally this is not called in NMI context - but if it is,
* trying to do any locking here is totally lethal.
*/
if (unlikely(in_nmi()))
return scd->clock;
if (unlikely(!sched_clock_running))
return 0ull;
WARN_ON_ONCE(!irqs_disabled());
now = sched_clock();

View File

@@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
static int sig_ignored(struct task_struct *t, int sig)
static int sig_task_ignored(struct task_struct *t, int sig,
int from_ancestor_ns)
{
void __user *handler;
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
handler == SIG_DFL && !from_ancestor_ns)
return 1;
return sig_handler_ignored(handler, sig);
}
static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
{
/*
* Blocked signals are never ignored, since the
* signal handler may change by the time it is
@@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
handler = sig_handler(t, sig);
if (!sig_handler_ignored(handler, sig))
if (!sig_task_ignored(t, sig, from_ancestor_ns))
return 0;
/*
* Tracers may want to know about even ignored signals.
*/
return !tracehook_consider_ignored_signal(t, sig, handler);
return !tracehook_consider_ignored_signal(t, sig);
}
/*
@@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
return !tracehook_consider_fatal_signal(tsk, sig, handler);
return !tracehook_consider_fatal_signal(tsk, sig);
}
@@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
static int prepare_signal(int sig, struct task_struct *p)
static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
@@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
}
}
return !sig_ignored(p, sig);
return !sig_ignored(p, sig, from_ancestor_ns);
}
/*
@@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL ||
!tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
!tracehook_consider_fatal_signal(t, sig))) {
/*
* This signal will be fatal to the whole group.
*/
@@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group)
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
@@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
trace_sched_signal_send(sig, t);
assert_spin_locked(&t->sighand->siglock);
if (!prepare_signal(sig, t))
if (!prepare_signal(sig, t, from_ancestor_ns))
return 0;
pending = group ? &t->signal->shared_pending : &t->pending;
@@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
break;
default:
copy_siginfo(&q->info, info);
if (from_ancestor_ns)
q->info.si_pid = 0;
break;
}
} else if (!is_si_special(info)) {
@@ -889,6 +903,20 @@ out_set:
return 0;
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group)
{
int from_ancestor_ns = 0;
#ifdef CONFIG_PID_NS
if (!is_si_special(info) && SI_FROMUSER(info) &&
task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
from_ancestor_ns = 1;
#endif
return __send_signal(sig, info, t, group, from_ancestor_ns);
}
int print_fatal_signals;
static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1133,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
if (sig && p->sighand) {
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
ret = __group_send_sig_info(sig, info, p);
ret = __send_signal(sig, info, p, 1, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
out_unlock:
@@ -1320,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
goto ret;
ret = 1; /* the signal is ignored */
if (!prepare_signal(sig, t))
if (!prepare_signal(sig, t, 0))
goto out;
ret = 0;
@@ -1844,9 +1872,16 @@ relock:
/*
* Global init gets no signals it doesn't want.
* Container-init gets no signals it doesn't want from same
* container.
*
* Note that if global/container-init sees a sig_kernel_only()
* signal here, the signal must have been generated internally
* or must have come from an ancestor namespace. In either
* case, the signal cannot be dropped.
*/
if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
!signal_group_exit(signal))
!sig_kernel_only(signr))
continue;
if (sig_kernel_stop(signr)) {

640
kernel/slow-work.c Normal file
View File

@@ -0,0 +1,640 @@
/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public Licence
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*
* See Documentation/slow-work.txt
*/
#include <linux/module.h>
#include <linux/slow-work.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/wait.h>
#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
* things to do */
#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
* OOM */
static void slow_work_cull_timeout(unsigned long);
static void slow_work_oom_timeout(unsigned long);
#ifdef CONFIG_SYSCTL
static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
void __user *, size_t *, loff_t *);
#endif
/*
* The pool of threads has at least min threads in it as long as someone is
* using the facility, and may have as many as max.
*
* A portion of the pool may be processing very slow operations.
*/
static unsigned slow_work_min_threads = 2;
static unsigned slow_work_max_threads = 4;
static unsigned vslow_work_proportion = 50; /* % of threads that may process
* very slow work */
#ifdef CONFIG_SYSCTL
static const int slow_work_min_min_threads = 2;
static int slow_work_max_max_threads = 255;
static const int slow_work_min_vslow = 1;
static const int slow_work_max_vslow = 99;
ctl_table slow_work_sysctls[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "min-threads",
.data = &slow_work_min_threads,
.maxlen = sizeof(unsigned),
.mode = 0644,
.proc_handler = slow_work_min_threads_sysctl,
.extra1 = (void *) &slow_work_min_min_threads,
.extra2 = &slow_work_max_threads,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "max-threads",
.data = &slow_work_max_threads,
.maxlen = sizeof(unsigned),
.mode = 0644,
.proc_handler = slow_work_max_threads_sysctl,
.extra1 = &slow_work_min_threads,
.extra2 = (void *) &slow_work_max_max_threads,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "vslow-percentage",
.data = &vslow_work_proportion,
.maxlen = sizeof(unsigned),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = (void *) &slow_work_min_vslow,
.extra2 = (void *) &slow_work_max_vslow,
},
{ .ctl_name = 0 }
};
#endif
/*
* The active state of the thread pool
*/
static atomic_t slow_work_thread_count;
static atomic_t vslow_work_executing_count;
static bool slow_work_may_not_start_new_thread;
static bool slow_work_cull; /* cull a thread due to lack of activity */
static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
static struct slow_work slow_work_new_thread; /* new thread starter */
/*
* The queues of work items and the lock governing access to them. These are
* shared between all the CPUs. It doesn't make sense to have per-CPU queues
* as the number of threads bears no relation to the number of CPUs.
*
* There are two queues of work items: one for slow work items, and one for
* very slow work items.
*/
static LIST_HEAD(slow_work_queue);
static LIST_HEAD(vslow_work_queue);
static DEFINE_SPINLOCK(slow_work_queue_lock);
/*
* The thread controls. A variable used to signal to the threads that they
* should exit when the queue is empty, a waitqueue used by the threads to wait
* for signals, and a completion set by the last thread to exit.
*/
static bool slow_work_threads_should_exit;
static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
static DECLARE_COMPLETION(slow_work_last_thread_exited);
/*
* The number of users of the thread pool and its lock. Whilst this is zero we
* have no threads hanging around, and when this reaches zero, we wait for all
* active or queued work items to complete and kill all the threads we do have.
*/
static int slow_work_user_count;
static DEFINE_MUTEX(slow_work_user_lock);
/*
* Calculate the maximum number of active threads in the pool that are
* permitted to process very slow work items.
*
* The answer is rounded up to at least 1, but may not equal or exceed the
* maximum number of the threads in the pool. This means we always have at
* least one thread that can process slow work items, and we always have at
* least one thread that won't get tied up doing so.
*/
static unsigned slow_work_calc_vsmax(void)
{
unsigned vsmax;
vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
vsmax /= 100;
vsmax = max(vsmax, 1U);
return min(vsmax, slow_work_max_threads - 1);
}
/*
* Attempt to execute stuff queued on a slow thread. Return true if we managed
* it, false if there was nothing to do.
*/
static bool slow_work_execute(void)
{
struct slow_work *work = NULL;
unsigned vsmax;
bool very_slow;
vsmax = slow_work_calc_vsmax();
/* see if we can schedule a new thread to be started if we're not
* keeping up with the work */
if (!waitqueue_active(&slow_work_thread_wq) &&
(!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
!slow_work_may_not_start_new_thread)
slow_work_enqueue(&slow_work_new_thread);
/* find something to execute */
spin_lock_irq(&slow_work_queue_lock);
if (!list_empty(&vslow_work_queue) &&
atomic_read(&vslow_work_executing_count) < vsmax) {
work = list_entry(vslow_work_queue.next,
struct slow_work, link);
if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
BUG();
list_del_init(&work->link);
atomic_inc(&vslow_work_executing_count);
very_slow = true;
} else if (!list_empty(&slow_work_queue)) {
work = list_entry(slow_work_queue.next,
struct slow_work, link);
if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
BUG();
list_del_init(&work->link);
very_slow = false;
} else {
very_slow = false; /* avoid the compiler warning */
}
spin_unlock_irq(&slow_work_queue_lock);
if (!work)
return false;
if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
BUG();
work->ops->execute(work);
if (very_slow)
atomic_dec(&vslow_work_executing_count);
clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
/* if someone tried to enqueue the item whilst we were executing it,
* then it'll be left unenqueued to avoid multiple threads trying to
* execute it simultaneously
*
* there is, however, a race between us testing the pending flag and
* getting the spinlock, and between the enqueuer setting the pending
* flag and getting the spinlock, so we use a deferral bit to tell us
* if the enqueuer got there first
*/
if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
spin_lock_irq(&slow_work_queue_lock);
if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
goto auto_requeue;
spin_unlock_irq(&slow_work_queue_lock);
}
work->ops->put_ref(work);
return true;
auto_requeue:
/* we must complete the enqueue operation
* - we transfer our ref on the item back to the appropriate queue
* - don't wake another thread up as we're awake already
*/
if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
list_add_tail(&work->link, &vslow_work_queue);
else
list_add_tail(&work->link, &slow_work_queue);
spin_unlock_irq(&slow_work_queue_lock);
return true;
}
/**
* slow_work_enqueue - Schedule a slow work item for processing
* @work: The work item to queue
*
* Schedule a slow work item for processing. If the item is already undergoing
* execution, this guarantees not to re-enter the execution routine until the
* first execution finishes.
*
* The item is pinned by this function as it retains a reference to it, managed
* through the item operations. The item is unpinned once it has been
* executed.
*
* An item may hog the thread that is running it for a relatively large amount
* of time, sufficient, for example, to perform several lookup, mkdir, create
* and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
*
* Conversely, if a number of items are awaiting processing, it may take some
* time before any given item is given attention. The number of threads in the
* pool may be increased to deal with demand, but only up to a limit.
*
* If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
* the very slow queue, from which only a portion of the threads will be
* allowed to pick items to execute. This ensures that very slow items won't
* overly block ones that are just ordinarily slow.
*
* Returns 0 if successful, -EAGAIN if not.
*/
int slow_work_enqueue(struct slow_work *work)
{
unsigned long flags;
BUG_ON(slow_work_user_count <= 0);
BUG_ON(!work);
BUG_ON(!work->ops);
BUG_ON(!work->ops->get_ref);
/* when honouring an enqueue request, we only promise that we will run
* the work function in the future; we do not promise to run it once
* per enqueue request
*
* we use the PENDING bit to merge together repeat requests without
* having to disable IRQs and take the spinlock, whilst still
* maintaining our promise
*/
if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
spin_lock_irqsave(&slow_work_queue_lock, flags);
/* we promise that we will not attempt to execute the work
* function in more than one thread simultaneously
*
* this, however, leaves us with a problem if we're asked to
* enqueue the work whilst someone is executing the work
* function as simply queueing the work immediately means that
* another thread may try executing it whilst it is already
* under execution
*
* to deal with this, we set the ENQ_DEFERRED bit instead of
* enqueueing, and the thread currently executing the work
* function will enqueue the work item when the work function
* returns and it has cleared the EXECUTING bit
*/
if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
} else {
if (work->ops->get_ref(work) < 0)
goto cant_get_ref;
if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
list_add_tail(&work->link, &vslow_work_queue);
else
list_add_tail(&work->link, &slow_work_queue);
wake_up(&slow_work_thread_wq);
}
spin_unlock_irqrestore(&slow_work_queue_lock, flags);
}
return 0;
cant_get_ref:
spin_unlock_irqrestore(&slow_work_queue_lock, flags);
return -EAGAIN;
}
EXPORT_SYMBOL(slow_work_enqueue);
/*
* Worker thread culling algorithm
*/
static bool slow_work_cull_thread(void)
{
unsigned long flags;
bool do_cull = false;
spin_lock_irqsave(&slow_work_queue_lock, flags);
if (slow_work_cull) {
slow_work_cull = false;
if (list_empty(&slow_work_queue) &&
list_empty(&vslow_work_queue) &&
atomic_read(&slow_work_thread_count) >
slow_work_min_threads) {
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
do_cull = true;
}
}
spin_unlock_irqrestore(&slow_work_queue_lock, flags);
return do_cull;
}
/*
* Determine if there is slow work available for dispatch
*/
static inline bool slow_work_available(int vsmax)
{
return !list_empty(&slow_work_queue) ||
(!list_empty(&vslow_work_queue) &&
atomic_read(&vslow_work_executing_count) < vsmax);
}
/*
* Worker thread dispatcher
*/
static int slow_work_thread(void *_data)
{
int vsmax;
DEFINE_WAIT(wait);
set_freezable();
set_user_nice(current, -5);
for (;;) {
vsmax = vslow_work_proportion;
vsmax *= atomic_read(&slow_work_thread_count);
vsmax /= 100;
prepare_to_wait(&slow_work_thread_wq, &wait,
TASK_INTERRUPTIBLE);
if (!freezing(current) &&
!slow_work_threads_should_exit &&
!slow_work_available(vsmax) &&
!slow_work_cull)
schedule();
finish_wait(&slow_work_thread_wq, &wait);
try_to_freeze();
vsmax = vslow_work_proportion;
vsmax *= atomic_read(&slow_work_thread_count);
vsmax /= 100;
if (slow_work_available(vsmax) && slow_work_execute()) {
cond_resched();
if (list_empty(&slow_work_queue) &&
list_empty(&vslow_work_queue) &&
atomic_read(&slow_work_thread_count) >
slow_work_min_threads)
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
continue;
}
if (slow_work_threads_should_exit)
break;
if (slow_work_cull && slow_work_cull_thread())
break;
}
if (atomic_dec_and_test(&slow_work_thread_count))
complete_and_exit(&slow_work_last_thread_exited, 0);
return 0;
}
/*
* Handle thread cull timer expiration
*/
static void slow_work_cull_timeout(unsigned long data)
{
slow_work_cull = true;
wake_up(&slow_work_thread_wq);
}
/*
* Get a reference on slow work thread starter
*/
static int slow_work_new_thread_get_ref(struct slow_work *work)
{
return 0;
}
/*
* Drop a reference on slow work thread starter
*/
static void slow_work_new_thread_put_ref(struct slow_work *work)
{
}
/*
* Start a new slow work thread
*/
static void slow_work_new_thread_execute(struct slow_work *work)
{
struct task_struct *p;
if (slow_work_threads_should_exit)
return;
if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
return;
if (!mutex_trylock(&slow_work_user_lock))
return;
slow_work_may_not_start_new_thread = true;
atomic_inc(&slow_work_thread_count);
p = kthread_run(slow_work_thread, NULL, "kslowd");
if (IS_ERR(p)) {
printk(KERN_DEBUG "Slow work thread pool: OOM\n");
if (atomic_dec_and_test(&slow_work_thread_count))
BUG(); /* we're running on a slow work thread... */
mod_timer(&slow_work_oom_timer,
jiffies + SLOW_WORK_OOM_TIMEOUT);
} else {
/* ratelimit the starting of new threads */
mod_timer(&slow_work_oom_timer, jiffies + 1);
}
mutex_unlock(&slow_work_user_lock);
}
static const struct slow_work_ops slow_work_new_thread_ops = {
.get_ref = slow_work_new_thread_get_ref,
.put_ref = slow_work_new_thread_put_ref,
.execute = slow_work_new_thread_execute,
};
/*
* post-OOM new thread start suppression expiration
*/
static void slow_work_oom_timeout(unsigned long data)
{
slow_work_may_not_start_new_thread = false;
}
#ifdef CONFIG_SYSCTL
/*
* Handle adjustment of the minimum number of threads
*/
static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int n;
if (ret == 0) {
mutex_lock(&slow_work_user_lock);
if (slow_work_user_count > 0) {
/* see if we need to start or stop threads */
n = atomic_read(&slow_work_thread_count) -
slow_work_min_threads;
if (n < 0 && !slow_work_may_not_start_new_thread)
slow_work_enqueue(&slow_work_new_thread);
else if (n > 0)
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
}
mutex_unlock(&slow_work_user_lock);
}
return ret;
}
/*
* Handle adjustment of the maximum number of threads
*/
static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int n;
if (ret == 0) {
mutex_lock(&slow_work_user_lock);
if (slow_work_user_count > 0) {
/* see if we need to stop threads */
n = slow_work_max_threads -
atomic_read(&slow_work_thread_count);
if (n < 0)
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
}
mutex_unlock(&slow_work_user_lock);
}
return ret;
}
#endif /* CONFIG_SYSCTL */
/**
* slow_work_register_user - Register a user of the facility
*
* Register a user of the facility, starting up the initial threads if there
* aren't any other users at this point. This will return 0 if successful, or
* an error if not.
*/
int slow_work_register_user(void)
{
struct task_struct *p;
int loop;
mutex_lock(&slow_work_user_lock);
if (slow_work_user_count == 0) {
printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
init_completion(&slow_work_last_thread_exited);
slow_work_threads_should_exit = false;
slow_work_init(&slow_work_new_thread,
&slow_work_new_thread_ops);
slow_work_may_not_start_new_thread = false;
slow_work_cull = false;
/* start the minimum number of threads */
for (loop = 0; loop < slow_work_min_threads; loop++) {
atomic_inc(&slow_work_thread_count);
p = kthread_run(slow_work_thread, NULL, "kslowd");
if (IS_ERR(p))
goto error;
}
printk(KERN_NOTICE "Slow work thread pool: Ready\n");
}
slow_work_user_count++;
mutex_unlock(&slow_work_user_lock);
return 0;
error:
if (atomic_dec_and_test(&slow_work_thread_count))
complete(&slow_work_last_thread_exited);
if (loop > 0) {
printk(KERN_ERR "Slow work thread pool:"
" Aborting startup on ENOMEM\n");
slow_work_threads_should_exit = true;
wake_up_all(&slow_work_thread_wq);
wait_for_completion(&slow_work_last_thread_exited);
printk(KERN_ERR "Slow work thread pool: Aborted\n");
}
mutex_unlock(&slow_work_user_lock);
return PTR_ERR(p);
}
EXPORT_SYMBOL(slow_work_register_user);
/**
* slow_work_unregister_user - Unregister a user of the facility
*
* Unregister a user of the facility, killing all the threads if this was the
* last one.
*/
void slow_work_unregister_user(void)
{
mutex_lock(&slow_work_user_lock);
BUG_ON(slow_work_user_count <= 0);
slow_work_user_count--;
if (slow_work_user_count == 0) {
printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
slow_work_threads_should_exit = true;
wake_up_all(&slow_work_thread_wq);
wait_for_completion(&slow_work_last_thread_exited);
printk(KERN_NOTICE "Slow work thread pool:"
" Shut down complete\n");
}
del_timer_sync(&slow_work_cull_timer);
mutex_unlock(&slow_work_user_lock);
}
EXPORT_SYMBOL(slow_work_unregister_user);
/*
* Initialise the slow work facility
*/
static int __init init_slow_work(void)
{
unsigned nr_cpus = num_possible_cpus();
if (slow_work_max_threads < nr_cpus)
slow_work_max_threads = nr_cpus;
#ifdef CONFIG_SYSCTL
if (slow_work_max_max_threads < nr_cpus * 2)
slow_work_max_max_threads = nr_cpus * 2;
#endif
return 0;
}
subsys_initcall(init_slow_work);

View File

@@ -2,40 +2,82 @@
* Generic helpers for smp ipi calls
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpu.h>
static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
static LIST_HEAD(call_function_queue);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
static struct {
struct list_head queue;
spinlock_t lock;
} call_function __cacheline_aligned_in_smp =
{
.queue = LIST_HEAD_INIT(call_function.queue),
.lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
};
enum {
CSD_FLAG_WAIT = 0x01,
CSD_FLAG_ALLOC = 0x02,
CSD_FLAG_LOCK = 0x04,
CSD_FLAG_LOCK = 0x01,
};
struct call_function_data {
struct call_single_data csd;
spinlock_t lock;
unsigned int refs;
struct rcu_head rcu_head;
unsigned long cpumask_bits[];
struct call_single_data csd;
spinlock_t lock;
unsigned int refs;
cpumask_var_t cpumask;
};
struct call_single_queue {
struct list_head list;
spinlock_t lock;
struct list_head list;
spinlock_t lock;
};
static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
};
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return NOTIFY_BAD;
break;
#ifdef CONFIG_CPU_HOTPLUG
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
break;
#endif
};
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
.notifier_call = hotplug_cfd,
};
static int __cpuinit init_call_single_data(void)
{
void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i) {
@@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
spin_lock_init(&q->lock);
INIT_LIST_HEAD(&q->list);
}
hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
register_cpu_notifier(&hotplug_cfd_notifier);
return 0;
}
early_initcall(init_call_single_data);
static void csd_flag_wait(struct call_single_data *data)
/*
* csd_lock/csd_unlock used to serialize access to per-cpu csd resources
*
* For non-synchronous ipi calls the csd can still be in use by the
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
static void csd_lock_wait(struct call_single_data *data)
{
/* Wait for response */
do {
if (!(data->flags & CSD_FLAG_WAIT))
break;
while (data->flags & CSD_FLAG_LOCK)
cpu_relax();
} while (1);
}
static void csd_lock(struct call_single_data *data)
{
csd_lock_wait(data);
data->flags = CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
* to ->flags with any subsequent assignments to other
* fields of the specified call_single_data structure:
*/
smp_mb();
}
static void csd_unlock(struct call_single_data *data)
{
WARN_ON(!(data->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_mb();
data->flags &= ~CSD_FLAG_LOCK;
}
/*
* Insert a previously allocated call_single_data element for execution
* on the given CPU. data must already have ->func, ->info, and ->flags set.
* Insert a previously allocated call_single_data element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
static void generic_exec_single(int cpu, struct call_single_data *data)
static
void generic_exec_single(int cpu, struct call_single_data *data, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
int wait = data->flags & CSD_FLAG_WAIT, ipi;
unsigned long flags;
int ipi;
spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
@@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
spin_unlock_irqrestore(&dst->lock, flags);
/*
* Make the list addition visible before sending the ipi.
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
* normal cache coherency rules implied by spinlocks.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
* to arch code to make it appear to obey cache coherency WRT
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
smp_mb();
if (ipi)
arch_send_call_function_single_ipi(cpu);
if (wait)
csd_flag_wait(data);
}
static void rcu_free_call_data(struct rcu_head *head)
{
struct call_function_data *data;
data = container_of(head, struct call_function_data, rcu_head);
kfree(data);
csd_lock_wait(data);
}
/*
@@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void)
int cpu = get_cpu();
/*
* It's ok to use list_for_each_rcu() here even though we may delete
* 'pos', since list_del_rcu() doesn't clear ->next
* Ensure entry is visible on call_function_queue after we have
* entered the IPI. See comment in smp_call_function_many.
* If we don't have this, then we may miss an entry on the list
* and never get another IPI to process it.
*/
rcu_read_lock();
list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
smp_mb();
/*
* It's ok to use list_for_each_rcu() here even though we may
* delete 'pos', since list_del_rcu() doesn't clear ->next
*/
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
spin_lock(&data->lock);
if (!cpumask_test_cpu(cpu, data->cpumask)) {
spin_unlock(&data->lock);
continue;
}
cpumask_clear_cpu(cpu, data->cpumask);
spin_unlock(&data->lock);
data->csd.func(data->csd.info);
spin_lock(&data->lock);
cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
WARN_ON(data->refs == 0);
data->refs--;
refs = data->refs;
refs = --data->refs;
if (!refs) {
spin_lock(&call_function.lock);
list_del_rcu(&data->csd.list);
spin_unlock(&call_function.lock);
}
spin_unlock(&data->lock);
if (refs)
continue;
spin_lock(&call_function_lock);
list_del_rcu(&data->csd.list);
spin_unlock(&call_function_lock);
if (data->csd.flags & CSD_FLAG_WAIT) {
/*
* serialize stores to data with the flag clear
* and wakeup
*/
smp_wmb();
data->csd.flags &= ~CSD_FLAG_WAIT;
}
if (data->csd.flags & CSD_FLAG_ALLOC)
call_rcu(&data->rcu_head, rcu_free_call_data);
csd_unlock(&data->csd);
}
rcu_read_unlock();
put_cpu();
}
/*
* Invoked by arch to handle an IPI for call function single. Must be called
* from the arch with interrupts disabled.
* Invoked by arch to handle an IPI for call function single. Must be
* called from the arch with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
unsigned int data_flags;
LIST_HEAD(list);
/*
* Need to see other stores to list head for checking whether
* list is empty without holding q->lock
*/
smp_read_barrier_depends();
while (!list_empty(&q->list)) {
unsigned int data_flags;
spin_lock(&q->lock);
list_replace_init(&q->list, &list);
spin_unlock(&q->lock);
spin_lock(&q->lock);
list_replace_init(&q->list, &list);
spin_unlock(&q->lock);
while (!list_empty(&list)) {
struct call_single_data *data;
while (!list_empty(&list)) {
struct call_single_data *data;
data = list_entry(list.next, struct call_single_data, list);
list_del(&data->list);
data = list_entry(list.next, struct call_single_data,
list);
list_del(&data->list);
/*
* 'data' can be invalid after this call if
* flags == 0 (when called through
* generic_exec_single(), so save them away before
* making the call.
*/
data_flags = data->flags;
data->func(data->info);
if (data_flags & CSD_FLAG_WAIT) {
smp_wmb();
data->flags &= ~CSD_FLAG_WAIT;
} else if (data_flags & CSD_FLAG_LOCK) {
smp_wmb();
data->flags &= ~CSD_FLAG_LOCK;
} else if (data_flags & CSD_FLAG_ALLOC)
kfree(data);
}
/*
* See comment on outer loop
* 'data' can be invalid after this call if flags == 0
* (when called through generic_exec_single()),
* so save them away before making the call:
*/
smp_read_barrier_depends();
data_flags = data->flags;
data->func(data->info);
/*
* Unlocked CSDs are valid through generic_exec_single():
*/
if (data_flags & CSD_FLAG_LOCK)
csd_unlock(data);
}
}
@@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
{
struct call_single_data d;
struct call_single_data d = {
.flags = 0,
};
unsigned long flags;
/* prevent preemption and reschedule on another processor,
as well as CPU removal */
int me = get_cpu();
int this_cpu;
int err = 0;
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
/*
* prevent preemption and reschedule on another processor,
* as well as CPU removal
*/
this_cpu = get_cpu();
if (cpu == me) {
/* Can deadlock when called with interrupts disabled */
WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
if (cpu == this_cpu) {
local_irq_save(flags);
func(info);
local_irq_restore(flags);
} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
struct call_single_data *data;
if (!wait) {
/*
* We are calling a function on a single CPU
* and we are not going to wait for it to finish.
* We first try to allocate the data, but if we
* fail, we fall back to use a per cpu data to pass
* the information to that CPU. Since all callers
* of this code will use the same data, we must
* synchronize the callers to prevent a new caller
* from corrupting the data before the callee
* can access it.
*
* The CSD_FLAG_LOCK is used to let us know when
* the IPI handler is done with the data.
* The first caller will set it, and the callee
* will clear it. The next caller must wait for
* it to clear before we set it again. This
* will make sure the callee is done with the
* data before a new caller will use it.
*/
data = kmalloc(sizeof(*data), GFP_ATOMIC);
if (data)
data->flags = CSD_FLAG_ALLOC;
else {
data = &per_cpu(csd_data, me);
while (data->flags & CSD_FLAG_LOCK)
cpu_relax();
data->flags = CSD_FLAG_LOCK;
}
} else {
data = &d;
data->flags = CSD_FLAG_WAIT;
}
data->func = func;
data->info = info;
generic_exec_single(cpu, data);
} else {
err = -ENXIO; /* CPU not online */
if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
struct call_single_data *data = &d;
if (!wait)
data = &__get_cpu_var(csd_data);
csd_lock(data);
data->func = func;
data->info = info;
generic_exec_single(cpu, data, wait);
} else {
err = -ENXIO; /* CPU not online */
}
}
put_cpu();
return err;
}
EXPORT_SYMBOL(smp_call_function_single);
@@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
* @cpu: The CPU to run on.
* @data: Pre-allocated and setup data structure
*
* Like smp_call_function_single(), but allow caller to pass in a pre-allocated
* data structure. Useful for embedding @data inside other structures, for
* instance.
*
* Like smp_call_function_single(), but allow caller to pass in a
* pre-allocated data structure. Useful for embedding @data inside
* other structures, for instance.
*/
void __smp_call_function_single(int cpu, struct call_single_data *data)
void __smp_call_function_single(int cpu, struct call_single_data *data,
int wait)
{
/* Can deadlock when called with interrupts disabled */
WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
csd_lock(data);
generic_exec_single(cpu, data);
/* Can deadlock when called with interrupts disabled */
WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
generic_exec_single(cpu, data, wait);
}
/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
#ifndef arch_send_call_function_ipi_mask
#define arch_send_call_function_ipi_mask(maskp) \
arch_send_call_function_ipi(*(maskp))
# define arch_send_call_function_ipi_mask(maskp) \
arch_send_call_function_ipi(*(maskp))
#endif
/**
@@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
* If @wait is true, then returns once @func has returned. Note that @wait
* will be implicitly turned on in case of allocation failures, since
@@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* must be disabled when calling this function.
*/
void smp_call_function_many(const struct cpumask *mask,
void (*func)(void *), void *info,
bool wait)
void (*func)(void *), void *info, bool wait)
{
struct call_function_data *data;
unsigned long flags;
int cpu, next_cpu;
int cpu, next_cpu, this_cpu = smp_processor_id();
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
/* So, what's a CPU they want? Ignoring this one. */
/* So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == smp_processor_id())
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
/* No online cpus? We're done. */
if (cpu >= nr_cpu_ids)
return;
/* Do we have another CPU which isn't us? */
next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
if (next_cpu == smp_processor_id())
if (next_cpu == this_cpu)
next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
/* Fastpath: do that cpu by itself. */
@@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
if (unlikely(!data)) {
/* Slow path. */
for_each_online_cpu(cpu) {
if (cpu == smp_processor_id())
continue;
if (cpumask_test_cpu(cpu, mask))
smp_call_function_single(cpu, func, info, wait);
}
return;
}
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
spin_lock_init(&data->lock);
data->csd.flags = CSD_FLAG_ALLOC;
if (wait)
data->csd.flags |= CSD_FLAG_WAIT;
spin_lock_irqsave(&data->lock, flags);
data->csd.func = func;
data->csd.info = info;
cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
data->refs = cpumask_weight(data->cpumask);
spin_lock_irqsave(&call_function_lock, flags);
list_add_tail_rcu(&data->csd.list, &call_function_queue);
spin_unlock_irqrestore(&call_function_lock, flags);
spin_lock(&call_function.lock);
/*
* Place entry at the _HEAD_ of the list, so that any cpu still
* observing the entry in generic_smp_call_function_interrupt()
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
spin_unlock(&call_function.lock);
spin_unlock_irqrestore(&data->lock, flags);
/*
* Make the list addition visible before sending the ipi.
* (IPIs must obey or appear to obey normal Linux cache
* coherency rules -- see comment in generic_exec_single).
*/
smp_mb();
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
arch_send_call_function_ipi_mask(data->cpumask);
/* optionally wait for the CPUs to complete */
/* Optionally wait for the CPUs to complete */
if (wait)
csd_flag_wait(&data->csd);
csd_lock_wait(&data->csd);
}
EXPORT_SYMBOL(smp_call_function_many);
@@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
* Returns 0.
*
@@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
return 0;
}
EXPORT_SYMBOL(smp_call_function);
void ipi_call_lock(void)
{
spin_lock(&call_function_lock);
spin_lock(&call_function.lock);
}
void ipi_call_unlock(void)
{
spin_unlock(&call_function_lock);
spin_unlock(&call_function.lock);
}
void ipi_call_lock_irq(void)
{
spin_lock_irq(&call_function_lock);
spin_lock_irq(&call_function.lock);
}
void ipi_call_unlock_irq(void)
{
spin_unlock_irq(&call_function_lock);
spin_unlock_irq(&call_function.lock);
}

View File

@@ -21,8 +21,10 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/tick.h>
#include <trace/irq.h>
#include <asm/irq.h>
/*
@@ -52,13 +54,18 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
static inline void wakeup_softirqd(void)
void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -79,13 +86,23 @@ static void __local_bh_disable(unsigned long ip)
WARN_ON_ONCE(in_irq());
raw_local_irq_save(flags);
add_preempt_count(SOFTIRQ_OFFSET);
/*
* The preempt tracer hooks into add_preempt_count and will break
* lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
* is set and before current->softirq_enabled is cleared.
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
preempt_count() += SOFTIRQ_OFFSET;
/*
* Were softirqs turned off above:
*/
if (softirq_count() == SOFTIRQ_OFFSET)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == SOFTIRQ_OFFSET)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip)
@@ -169,6 +186,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
*/
#define MAX_SOFTIRQ_RESTART 10
DEFINE_TRACE(softirq_entry);
DEFINE_TRACE(softirq_exit);
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
@@ -180,7 +200,7 @@ asmlinkage void __do_softirq(void)
account_system_vtime(current);
__local_bh_disable((unsigned long)__builtin_return_address(0));
trace_softirq_enter();
lockdep_softirq_enter();
cpu = smp_processor_id();
restart:
@@ -195,12 +215,14 @@ restart:
if (pending & 1) {
int prev_count = preempt_count();
trace_softirq_entry(h, softirq_vec);
h->action(h);
trace_softirq_exit(h, softirq_vec);
if (unlikely(prev_count != preempt_count())) {
printk(KERN_ERR "huh, entered softirq %td %p"
printk(KERN_ERR "huh, entered softirq %td %s %p"
"with preempt_count %08x,"
" exited with %08x?\n", h - softirq_vec,
softirq_to_name[h - softirq_vec],
h->action, prev_count, preempt_count());
preempt_count() = prev_count;
}
@@ -220,7 +242,7 @@ restart:
if (pending)
wakeup_softirqd();
trace_softirq_exit();
lockdep_softirq_exit();
account_system_vtime(current);
_local_bh_enable();
@@ -496,7 +518,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
cp->flags = 0;
cp->priv = softirq;
__smp_call_function_single(cpu, cp);
__smp_call_function_single(cpu, cp, 0);
return 0;
}
return 1;

View File

@@ -165,98 +165,12 @@ void softlockup_tick(void)
panic("softlockup: hung tasks");
}
/*
* Have a reasonable limit on the number of tasks checked:
*/
unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
/*
* Only do the hung-tasks check on one CPU:
*/
static int check_cpu __read_mostly = -1;
static void check_hung_task(struct task_struct *t, unsigned long now)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
if (t->flags & PF_FROZEN)
return;
if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
t->last_switch_count = switch_count;
t->last_switch_timestamp = now;
return;
}
if ((long)(now - t->last_switch_timestamp) <
sysctl_hung_task_timeout_secs)
return;
if (!sysctl_hung_task_warnings)
return;
sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
printk(KERN_ERR "INFO: task %s:%d blocked for more than "
"%ld seconds.\n", t->comm, t->pid,
sysctl_hung_task_timeout_secs);
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
__debug_show_held_locks(t);
t->last_switch_timestamp = now;
touch_nmi_watchdog();
if (softlockup_panic)
panic("softlockup: blocked tasks");
}
/*
* Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
static void check_hung_uninterruptible_tasks(int this_cpu)
{
int max_count = sysctl_hung_task_check_count;
unsigned long now = get_timestamp(this_cpu);
struct task_struct *g, *t;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if (test_taint(TAINT_DIE) || did_panic)
return;
read_lock(&tasklist_lock);
do_each_thread(g, t) {
if (!--max_count)
goto unlock;
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, now);
} while_each_thread(g, t);
unlock:
read_unlock(&tasklist_lock);
}
/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;
if (this_cpu == check_cpu) {
if (sysctl_hung_task_timeout_secs)
check_hung_uninterruptible_tasks(this_cpu);
}
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
check_cpu = cpumask_any(cpu_online_mask);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
if (hotcpu == check_cpu) {
/* Pick any other online cpu. */
check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
}
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))

View File

@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
_raw_read_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
_raw_write_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_write_lock_irqsave);
@@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
/*
* On lockdep we dont want the hand-coded irq-enable of
* _raw_spin_lock_flags() code, because lockdep assumes
* that interrupts are not re-enabled during lock-acquire:
*/
#ifdef CONFIG_LOCKDEP
LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
#else
_raw_spin_lock_flags(lock, &flags);
#endif
LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
_raw_spin_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave_nested);

View File

@@ -34,6 +34,7 @@
#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -1013,10 +1014,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
if (err)
goto out;
if (task_pgrp(p) != pgrp) {
if (task_pgrp(p) != pgrp)
change_pid(p, PIDTYPE_PGID, pgrp);
set_task_pgrp(p, pid_nr(pgrp));
}
err = 0;
out:

View File

@@ -48,6 +48,7 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/slow-work.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -95,14 +96,12 @@ static int sixty = 60;
static int neg_one = -1;
#endif
#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
static int two = 2;
#endif
static int zero;
static int one = 1;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static unsigned long one_ul = 1;
static int one_hundred = 100;
static int one_thousand = 1000;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -815,6 +814,19 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
.extra2 = &sixty,
},
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_panic",
.data = &sysctl_hung_task_panic,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &one,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -830,7 +842,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
@@ -900,6 +912,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &scan_unevictable_handler,
},
#endif
#ifdef CONFIG_SLOW_WORK
{
.ctl_name = CTL_UNNUMBERED,
.procname = "slow-work",
.mode = 0555,
.child = slow_work_sysctls,
},
#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -1020,6 +1040,28 @@ static struct ctl_table vm_table[] = {
.mode = 0444 /* read-only*/,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "nr_pdflush_threads_min",
.data = &nr_pdflush_threads_min,
.maxlen = sizeof nr_pdflush_threads_min,
.mode = 0644 /* read-write */,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &one,
.extra2 = &nr_pdflush_threads_max,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "nr_pdflush_threads_max",
.data = &nr_pdflush_threads_max,
.maxlen = sizeof nr_pdflush_threads_max,
.mode = 0644 /* read-write */,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &nr_pdflush_threads_min,
.extra2 = &one_thousand,
},
{
.ctl_name = VM_SWAPPINESS,
.procname = "swappiness",
@@ -1373,10 +1415,7 @@ static struct ctl_table fs_table[] = {
.data = &lease_break_time,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &two,
.proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_AIO
@@ -1417,7 +1456,10 @@ static struct ctl_table fs_table[] = {
.data = &suid_dumpable,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &two,
},
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{

View File

@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
config HAVE_FTRACE_NMI_ENTER
bool
config HAVE_FUNCTION_TRACER
bool
@@ -31,12 +34,20 @@ config HAVE_FTRACE_MCOUNT_RECORD
config HAVE_HW_BRANCH_TRACER
bool
config HAVE_FTRACE_SYSCALLS
bool
config TRACER_MAX_TRACE
bool
config RING_BUFFER
bool
config FTRACE_NMI_ENTER
bool
depends on HAVE_FTRACE_NMI_ENTER
default y
config TRACING
bool
select DEBUG_FS
@@ -44,13 +55,29 @@ config TRACING
select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
select NOP_TRACER
select BINARY_PRINTF
#
# Minimum requirements an architecture has to meet for us to
# be able to offer generic tracing facilities:
#
config TRACING_SUPPORT
bool
# PPC32 has no irqflags tracing support, but it can use most of the
# tracers anyway, they were tested to build and work. Note that new
# exceptions to this list aren't welcomed, better implement the
# irqflags tracing for your architecture.
depends on TRACE_IRQFLAGS_SUPPORT || PPC32
depends on STACKTRACE_SUPPORT
default y
if TRACING_SUPPORT
menu "Tracers"
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
depends on DEBUG_KERNEL
select FRAME_POINTER
select KALLSYMS
select TRACING
@@ -72,18 +99,16 @@ config FUNCTION_GRAPH_TRACER
help
Enable the kernel to trace a function at both its return
and its entry.
It's first purpose is to trace the duration of functions and
draw a call graph for each thread with some informations like
the return value.
This is done by setting the current return address on the current
task structure into a stack of calls.
Its first purpose is to trace the duration of functions and
draw a call graph for each thread with some information like
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
depends on DEBUG_KERNEL
select TRACE_IRQFLAGS
select TRACING
select TRACER_MAX_TRACE
@@ -106,7 +131,6 @@ config PREEMPT_TRACER
default n
depends on GENERIC_TIME
depends on PREEMPT
depends on DEBUG_KERNEL
select TRACING
select TRACER_MAX_TRACE
help
@@ -127,13 +151,13 @@ config SYSPROF_TRACER
bool "Sysprof Tracer"
depends on X86
select TRACING
select CONTEXT_SWITCH_TRACER
help
This tracer provides the trace needed by the 'Sysprof' userspace
tool.
config SCHED_TRACER
bool "Scheduling Latency Tracer"
depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
@@ -143,16 +167,30 @@ config SCHED_TRACER
config CONTEXT_SWITCH_TRACER
bool "Trace process context switches"
depends on DEBUG_KERNEL
select TRACING
select MARKERS
help
This tracer gets called from the context switch and records
all switching of tasks.
config EVENT_TRACER
bool "Trace various events in the kernel"
select TRACING
help
This tracer hooks to various trace points in the kernel
allowing the user to pick and choose which trace point they
want to trace.
config FTRACE_SYSCALLS
bool "Trace syscalls"
depends on HAVE_FTRACE_SYSCALLS
select TRACING
select KALLSYMS
help
Basic tracer to catch the syscall entry and exit events.
config BOOT_TRACER
bool "Trace boot initcalls"
depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
help
@@ -165,13 +203,11 @@ config BOOT_TRACER
representation of the delays during initcalls - but the raw
/debug/tracing/trace text output is readable too.
( Note that tracing self tests can't be enabled if this tracer is
selected, because the self-tests are an initcall as well and that
would invalidate the boot trace. )
You must pass in ftrace=initcall to the kernel command line
to enable this on bootup.
config TRACE_BRANCH_PROFILING
bool "Trace likely/unlikely profiler"
depends on DEBUG_KERNEL
select TRACING
help
This tracer profiles all the the likely and unlikely macros
@@ -224,7 +260,6 @@ config BRANCH_TRACER
config POWER_TRACER
bool "Trace power consumption behavior"
depends on DEBUG_KERNEL
depends on X86
select TRACING
help
@@ -236,7 +271,6 @@ config POWER_TRACER
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
depends on DEBUG_KERNEL
select FUNCTION_TRACER
select STACKTRACE
select KALLSYMS
@@ -266,11 +300,66 @@ config HW_BRANCH_TRACER
This tracer records all branches on the system in a circular
buffer giving access to the last N branches for each cpu.
config KMEMTRACE
bool "Trace SLAB allocations"
select TRACING
help
kmemtrace provides tracing for slab allocator functions, such as
kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
data is then fed to the userspace application in order to analyse
allocation hotspots, internal fragmentation and so on, making it
possible to see how well an allocator performs, as well as debug
and profile kernel code.
This requires an userspace application to use. See
Documentation/vm/kmemtrace.txt for more information.
Saying Y will make the kernel somewhat larger and slower. However,
if you disable kmemtrace at run-time or boot-time, the performance
impact is minimal (depending on the arch the kernel is built for).
If unsure, say N.
config WORKQUEUE_TRACER
bool "Trace workqueues"
select TRACING
help
The workqueue tracer provides some statistical informations
about each cpu workqueue thread such as the number of the
works inserted and executed since their creation. It can help
to evaluate the amount of work each of them have to perform.
For example it can help a developer to decide whether he should
choose a per cpu workqueue instead of a singlethreaded one.
config BLK_DEV_IO_TRACE
bool "Support for tracing block io actions"
depends on SYSFS
depends on BLOCK
select RELAY
select DEBUG_FS
select TRACEPOINTS
select TRACING
select STACKTRACE
help
Say Y here if you want to be able to trace the block layer actions
on a given queue. Tracing allows you to see any traffic happening
on a block device queue. For more information (and the userspace
support tools needed), fetch the blktrace tools from:
git://git.kernel.dk/blktrace.git
Tracing also is possible using the ftrace interface, e.g.:
echo 1 > /sys/block/sda/sda1/trace/enable
echo blk > /sys/kernel/debug/tracing/current_tracer
cat /sys/kernel/debug/tracing/trace_pipe
If unsure, say N.
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
depends on DEBUG_KERNEL
default y
help
This option will modify all the calls to ftrace dynamically
@@ -296,7 +385,7 @@ config FTRACE_SELFTEST
config FTRACE_STARTUP_TEST
bool "Perform a startup test on ftrace"
depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
depends on TRACING
select FTRACE_SELFTEST
help
This option performs a series of startup tests on ftrace. On bootup
@@ -306,7 +395,7 @@ config FTRACE_STARTUP_TEST
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
depends on HAVE_MMIOTRACE_SUPPORT && PCI
select TRACING
help
Mmiotrace traces Memory Mapped I/O access and is meant for
@@ -328,3 +417,6 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
endmenu
endif # TRACING_SUPPORT

View File

@@ -19,6 +19,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_TRACING) += trace_clock.o
obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@@ -33,5 +37,14 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
obj-$(CONFIG_EVENT_TRACER) += trace_events.o
obj-$(CONFIG_EVENT_TRACER) += events.o
obj-$(CONFIG_EVENT_TRACER) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
libftrace-y := ftrace.o

1550
kernel/trace/blktrace.c Normal file

File diff suppressed because it is too large Load Diff

14
kernel/trace/events.c Normal file
View File

@@ -0,0 +1,14 @@
/*
* This is the place to register all trace points as events.
*/
#include <linux/stringify.h>
#include <trace/trace_events.h>
#include "trace_output.h"
#include "trace_events_stage_1.h"
#include "trace_events_stage_2.h"
#include "trace_events_stage_3.h"

File diff suppressed because it is too large Load Diff

464
kernel/trace/kmemtrace.c Normal file
View File

@@ -0,0 +1,464 @@
/*
* Memory allocator tracing
*
* Copyright (C) 2008 Eduard - Gabriel Munteanu
* Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*/
#include <linux/tracepoint.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/dcache.h>
#include <linux/fs.h>
#include <trace/kmemtrace.h>
#include "trace_output.h"
#include "trace.h"
/* Select an alternative, minimalistic output than the original one */
#define TRACE_KMEM_OPT_MINIMAL 0x1
static struct tracer_opt kmem_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
{ }
};
static struct tracer_flags kmem_tracer_flags = {
.val = 0,
.opts = kmem_opts
};
static struct trace_array *kmemtrace_array;
/* Trace allocations */
static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags,
int node)
{
struct trace_array *tr = kmemtrace_array;
struct kmemtrace_alloc_entry *entry;
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
if (!event)
return;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_KMEM_ALLOC;
entry->type_id = type_id;
entry->call_site = call_site;
entry->ptr = ptr;
entry->bytes_req = bytes_req;
entry->bytes_alloc = bytes_alloc;
entry->gfp_flags = gfp_flags;
entry->node = node;
ring_buffer_unlock_commit(tr->buffer, event);
trace_wake_up();
}
static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
unsigned long call_site,
const void *ptr)
{
struct trace_array *tr = kmemtrace_array;
struct kmemtrace_free_entry *entry;
struct ring_buffer_event *event;
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
if (!event)
return;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_KMEM_FREE;
entry->type_id = type_id;
entry->call_site = call_site;
entry->ptr = ptr;
ring_buffer_unlock_commit(tr->buffer, event);
trace_wake_up();
}
static void kmemtrace_kmalloc(unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags)
{
kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
bytes_req, bytes_alloc, gfp_flags, -1);
}
static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags)
{
kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
bytes_req, bytes_alloc, gfp_flags, -1);
}
static void kmemtrace_kmalloc_node(unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags,
int node)
{
kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
bytes_req, bytes_alloc, gfp_flags, node);
}
static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags,
int node)
{
kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
bytes_req, bytes_alloc, gfp_flags, node);
}
static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
{
kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
}
static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
{
kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
}
static int kmemtrace_start_probes(void)
{
int err;
err = register_trace_kmalloc(kmemtrace_kmalloc);
if (err)
return err;
err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
if (err)
return err;
err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
if (err)
return err;
err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
if (err)
return err;
err = register_trace_kfree(kmemtrace_kfree);
if (err)
return err;
err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
return err;
}
static void kmemtrace_stop_probes(void)
{
unregister_trace_kmalloc(kmemtrace_kmalloc);
unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
unregister_trace_kfree(kmemtrace_kfree);
unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
}
static int kmem_trace_init(struct trace_array *tr)
{
int cpu;
kmemtrace_array = tr;
for_each_cpu_mask(cpu, cpu_possible_map)
tracing_reset(tr, cpu);
kmemtrace_start_probes();
return 0;
}
static void kmem_trace_reset(struct trace_array *tr)
{
kmemtrace_stop_probes();
}
static void kmemtrace_headers(struct seq_file *s)
{
/* Don't need headers for the original kmemtrace output */
if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
return;
seq_printf(s, "#\n");
seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
" POINTER NODE CALLER\n");
seq_printf(s, "# FREE | | | | "
" | | | |\n");
seq_printf(s, "# |\n\n");
}
/*
* The following functions give the original output from kmemtrace,
* plus the origin CPU, since reordering occurs in-kernel now.
*/
#define KMEMTRACE_USER_ALLOC 0
#define KMEMTRACE_USER_FREE 1
struct kmemtrace_user_event {
u8 event_id;
u8 type_id;
u16 event_size;
u32 cpu;
u64 timestamp;
unsigned long call_site;
unsigned long ptr;
};
struct kmemtrace_user_event_alloc {
size_t bytes_req;
size_t bytes_alloc;
unsigned gfp_flags;
int node;
};
static enum print_line_t
kmemtrace_print_alloc_user(struct trace_iterator *iter,
struct kmemtrace_alloc_entry *entry)
{
struct kmemtrace_user_event_alloc *ev_alloc;
struct trace_seq *s = &iter->seq;
struct kmemtrace_user_event *ev;
ev = trace_seq_reserve(s, sizeof(*ev));
if (!ev)
return TRACE_TYPE_PARTIAL_LINE;
ev->event_id = KMEMTRACE_USER_ALLOC;
ev->type_id = entry->type_id;
ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
ev->cpu = iter->cpu;
ev->timestamp = iter->ts;
ev->call_site = entry->call_site;
ev->ptr = (unsigned long)entry->ptr;
ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
if (!ev_alloc)
return TRACE_TYPE_PARTIAL_LINE;
ev_alloc->bytes_req = entry->bytes_req;
ev_alloc->bytes_alloc = entry->bytes_alloc;
ev_alloc->gfp_flags = entry->gfp_flags;
ev_alloc->node = entry->node;
return TRACE_TYPE_HANDLED;
}
static enum print_line_t
kmemtrace_print_free_user(struct trace_iterator *iter,
struct kmemtrace_free_entry *entry)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_user_event *ev;
ev = trace_seq_reserve(s, sizeof(*ev));
if (!ev)
return TRACE_TYPE_PARTIAL_LINE;
ev->event_id = KMEMTRACE_USER_FREE;
ev->type_id = entry->type_id;
ev->event_size = sizeof(*ev);
ev->cpu = iter->cpu;
ev->timestamp = iter->ts;
ev->call_site = entry->call_site;
ev->ptr = (unsigned long)entry->ptr;
return TRACE_TYPE_HANDLED;
}
/* The two other following provide a more minimalistic output */
static enum print_line_t
kmemtrace_print_alloc_compress(struct trace_iterator *iter,
struct kmemtrace_alloc_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
/* Alloc entry */
ret = trace_seq_printf(s, " + ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Type */
switch (entry->type_id) {
case KMEMTRACE_TYPE_KMALLOC:
ret = trace_seq_printf(s, "K ");
break;
case KMEMTRACE_TYPE_CACHE:
ret = trace_seq_printf(s, "C ");
break;
case KMEMTRACE_TYPE_PAGES:
ret = trace_seq_printf(s, "P ");
break;
default:
ret = trace_seq_printf(s, "? ");
}
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Requested */
ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Allocated */
ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Flags
* TODO: would be better to see the name of the GFP flag names
*/
ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Pointer to allocated */
ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Node */
ret = trace_seq_printf(s, "%4d ", entry->node);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Call site */
ret = seq_print_ip_sym(s, entry->call_site, 0);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (!trace_seq_printf(s, "\n"))
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
static enum print_line_t
kmemtrace_print_free_compress(struct trace_iterator *iter,
struct kmemtrace_free_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
/* Free entry */
ret = trace_seq_printf(s, " - ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Type */
switch (entry->type_id) {
case KMEMTRACE_TYPE_KMALLOC:
ret = trace_seq_printf(s, "K ");
break;
case KMEMTRACE_TYPE_CACHE:
ret = trace_seq_printf(s, "C ");
break;
case KMEMTRACE_TYPE_PAGES:
ret = trace_seq_printf(s, "P ");
break;
default:
ret = trace_seq_printf(s, "? ");
}
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Skip requested/allocated/flags */
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Pointer to allocated */
ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Skip node */
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Call site */
ret = seq_print_ip_sym(s, entry->call_site, 0);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (!trace_seq_printf(s, "\n"))
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
switch (entry->type) {
case TRACE_KMEM_ALLOC: {
struct kmemtrace_alloc_entry *field;
trace_assign_type(field, entry);
if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
return kmemtrace_print_alloc_compress(iter, field);
else
return kmemtrace_print_alloc_user(iter, field);
}
case TRACE_KMEM_FREE: {
struct kmemtrace_free_entry *field;
trace_assign_type(field, entry);
if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
return kmemtrace_print_free_compress(iter, field);
else
return kmemtrace_print_free_user(iter, field);
}
default:
return TRACE_TYPE_UNHANDLED;
}
}
static struct tracer kmem_tracer __read_mostly = {
.name = "kmemtrace",
.init = kmem_trace_init,
.reset = kmem_trace_reset,
.print_line = kmemtrace_print_line,
.print_header = kmemtrace_headers,
.flags = &kmem_tracer_flags
};
void kmemtrace_init(void)
{
/* earliest opportunity to start kmem tracing */
}
static int __init init_kmem_tracer(void)
{
return register_tracer(&kmem_tracer);
}
device_initcall(init_kmem_tracer);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -9,6 +9,8 @@
#include <linux/mmiotrace.h>
#include <linux/ftrace.h>
#include <trace/boot.h>
#include <trace/kmemtrace.h>
#include <trace/power.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -16,9 +18,9 @@ enum trace_type {
TRACE_FN,
TRACE_CTX,
TRACE_WAKE,
TRACE_CONT,
TRACE_STACK,
TRACE_PRINT,
TRACE_BPRINT,
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
@@ -29,9 +31,14 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_HW_BRANCHES,
TRACE_SYSCALL_ENTER,
TRACE_SYSCALL_EXIT,
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
TRACE_POWER,
TRACE_BLK,
__TRACE_LAST_TYPE
__TRACE_LAST_TYPE,
};
/*
@@ -42,7 +49,6 @@ enum trace_type {
*/
struct trace_entry {
unsigned char type;
unsigned char cpu;
unsigned char flags;
unsigned char preempt_count;
int pid;
@@ -60,13 +66,13 @@ struct ftrace_entry {
/* Function call entry */
struct ftrace_graph_ent_entry {
struct trace_entry ent;
struct trace_entry ent;
struct ftrace_graph_ent graph_ent;
};
/* Function return entry */
struct ftrace_graph_ret_entry {
struct trace_entry ent;
struct trace_entry ent;
struct ftrace_graph_ret ret;
};
extern struct tracer boot_tracer;
@@ -112,12 +118,18 @@ struct userstack_entry {
};
/*
* ftrace_printk entry:
* trace_printk entry:
*/
struct bprint_entry {
struct trace_entry ent;
unsigned long ip;
const char *fmt;
u32 buf[];
};
struct print_entry {
struct trace_entry ent;
unsigned long ip;
int depth;
char buf[];
};
@@ -170,15 +182,51 @@ struct trace_power {
struct power_trace state_data;
};
enum kmemtrace_type_id {
KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
};
struct kmemtrace_alloc_entry {
struct trace_entry ent;
enum kmemtrace_type_id type_id;
unsigned long call_site;
const void *ptr;
size_t bytes_req;
size_t bytes_alloc;
gfp_t gfp_flags;
int node;
};
struct kmemtrace_free_entry {
struct trace_entry ent;
enum kmemtrace_type_id type_id;
unsigned long call_site;
const void *ptr;
};
struct syscall_trace_enter {
struct trace_entry ent;
int nr;
unsigned long args[];
};
struct syscall_trace_exit {
struct trace_entry ent;
int nr;
unsigned long ret;
};
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
* IRQS_OFF - interrupts were disabled
* IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
* IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
* NEED_RESCED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
* CONT - multiple entries hold the trace item
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@@ -186,7 +234,6 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_CONT = 0x20,
};
#define TRACE_BUF_SIZE 1024
@@ -198,6 +245,7 @@ enum trace_flag_type {
*/
struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
/* these fields get copied into max-trace: */
unsigned long trace_idx;
@@ -262,10 +310,10 @@ extern void __ftrace_bad_type(void);
do { \
IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
@@ -279,7 +327,15 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
TRACE_KMEM_FREE); \
IF_ASSIGN(var, ent, struct syscall_trace_enter, \
TRACE_SYSCALL_ENTER); \
IF_ASSIGN(var, ent, struct syscall_trace_exit, \
TRACE_SYSCALL_EXIT); \
__ftrace_bad_type(); \
} while (0)
@@ -287,7 +343,8 @@ extern void __ftrace_bad_type(void);
enum print_line_t {
TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
TRACE_TYPE_HANDLED = 1,
TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
};
@@ -297,8 +354,8 @@ enum print_line_t {
* flags value in struct tracer_flags.
*/
struct tracer_opt {
const char *name; /* Will appear on the trace_options file */
u32 bit; /* Mask assigned in val field in tracer_flags */
const char *name; /* Will appear on the trace_options file */
u32 bit; /* Mask assigned in val field in tracer_flags */
};
/*
@@ -307,28 +364,51 @@ struct tracer_opt {
*/
struct tracer_flags {
u32 val;
struct tracer_opt *opts;
struct tracer_opt *opts;
};
/* Makes more easy to define a tracer opt */
#define TRACER_OPT(s, b) .name = #s, .bit = b
/*
* A specific tracer, represented by methods that operate on a trace array:
/**
* struct tracer - a specific tracer and its callbacks to interact with debugfs
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
* @start: called when tracing is unpaused (echo 1 > tracing_enabled)
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
* @wait_pipe: override how the user waits for traces on trace_pipe
* @close: called when the trace file is released
* @read: override the default read callback on trace_pipe
* @splice_read: override the default splice_read callback on trace_pipe
* @selftest: selftest to run on boot (see trace_selftest.c)
* @print_headers: override the first lines that describe your columns
* @print_line: callback that prints a trace
* @set_flag: signals one of your private flags changed (trace_options file)
* @flags: your private flags
*/
struct tracer {
const char *name;
/* Your tracer should raise a warning if init fails */
int (*init)(struct trace_array *tr);
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos);
ssize_t (*splice_read)(struct trace_iterator *iter,
struct file *filp,
loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len,
unsigned int flags);
#ifdef CONFIG_FTRACE_STARTUP_TEST
int (*selftest)(struct tracer *trace,
struct trace_array *tr);
@@ -339,7 +419,8 @@ struct tracer {
int (*set_flag)(u32 old_flags, u32 bit, int set);
struct tracer *next;
int print_max;
struct tracer_flags *flags;
struct tracer_flags *flags;
struct tracer_stat *stats;
};
struct trace_seq {
@@ -348,6 +429,16 @@ struct trace_seq {
unsigned int readpos;
};
static inline void
trace_seq_init(struct trace_seq *s)
{
s->len = 0;
s->readpos = 0;
}
#define TRACE_PIPE_ALL_CPU -1
/*
* Trace iterator - used by printout routines who present trace
* results to users and which routines might sleep, etc:
@@ -356,6 +447,8 @@ struct trace_iterator {
struct trace_array *tr;
struct tracer *trace;
void *private;
int cpu_file;
struct mutex mutex;
struct ring_buffer_iter *buffer_iter[NR_CPUS];
/* The below is zeroed out in pipe_read */
@@ -371,6 +464,7 @@ struct trace_iterator {
cpumask_var_t started;
};
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void trace_wake_up(void);
void tracing_reset(struct trace_array *tr, int cpu);
@@ -379,26 +473,50 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *tracing_init_dentry(void);
void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
struct ring_buffer_event;
struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
unsigned char type,
unsigned long len,
unsigned long flags,
int pc);
void trace_buffer_unlock_commit(struct trace_array *tr,
struct ring_buffer_event *event,
unsigned long flags, int pc);
struct ring_buffer_event *
trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
unsigned long flags, int pc);
void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
unsigned long flags, int pc);
void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
unsigned long flags, int pc);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
void tracing_generic_entry_update(struct trace_entry *entry,
unsigned long flags,
int pc);
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
void ftrace(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
void tracing_sched_switch_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
unsigned long flags, int pc);
void tracing_record_cmdline(struct task_struct *tsk);
void tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *cur,
unsigned long flags, int pc);
@@ -408,14 +526,12 @@ void trace_special(struct trace_array *tr,
unsigned long arg2,
unsigned long arg3, int pc);
void trace_function(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -434,15 +550,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
extern cycle_t ftrace_now(int cpu);
void __trace_stack(struct trace_array *tr,
unsigned long flags,
int skip, int pc);
#ifdef CONFIG_FUNCTION_TRACER
void tracing_start_function_trace(void);
void tracing_stop_function_trace(void);
#else
# define tracing_start_function_trace() do { } while (0)
# define tracing_stop_function_trace() do { } while (0)
#endif
extern cycle_t ftrace_now(int cpu);
#ifdef CONFIG_CONTEXT_SWITCH_TRACER
typedef void
@@ -456,10 +568,10 @@ struct tracer_switch_ops {
void *private;
struct tracer_switch_ops *next;
};
char *trace_find_cmdline(int pid);
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
extern void trace_find_cmdline(int pid, char comm[]);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
@@ -469,6 +581,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
#ifdef CONFIG_FTRACE_STARTUP_TEST
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_function_graph(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_irqsoff(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_preemptoff(struct tracer *trace,
@@ -488,18 +602,11 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
extern void trace_seq_print_cont(struct trace_seq *s,
struct trace_iterator *iter);
extern unsigned long long ns2usecs(cycle_t nsec);
extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
size_t cnt);
extern long ns2usecs(cycle_t nsec);
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern unsigned long trace_flags;
@@ -580,7 +687,11 @@ enum trace_iterator_flags {
TRACE_ITER_ANNOTATE = 0x2000,
TRACE_ITER_USERSTACKTRACE = 0x4000,
TRACE_ITER_SYM_USEROBJ = 0x8000,
TRACE_ITER_PRINTK_MSGONLY = 0x10000
TRACE_ITER_PRINTK_MSGONLY = 0x10000,
TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
TRACE_ITER_LATENCY_FMT = 0x40000,
TRACE_ITER_GLOBAL_CLK = 0x80000,
TRACE_ITER_SLEEP_TIME = 0x100000,
};
/*
@@ -601,12 +712,12 @@ extern struct tracer nop_trace;
* preempt_enable (after a disable), a schedule might take place
* causing an infinite recursion.
*
* To prevent this, we read the need_recshed flag before
* To prevent this, we read the need_resched flag before
* disabling preemption. When we want to enable preemption we
* check the flag, if it is set, then we call preempt_enable_no_resched.
* Otherwise, we call preempt_enable.
*
* The rational for doing the above is that if need resched is set
* The rational for doing the above is that if need_resched is set
* and we have yet to reschedule, we are either in an atomic location
* (where we do not need to check for scheduling) or we are inside
* the scheduler and do not want to resched.
@@ -627,7 +738,7 @@ static inline int ftrace_preempt_disable(void)
*
* This is a scheduler safe way to enable preemption and not miss
* any preemption checks. The disabled saved the state of preemption.
* If resched is set, then we were either inside an atomic or
* If resched is set, then we are either inside an atomic or
* are inside the scheduler (we would have already scheduled
* otherwise). In this case, we do not want to call normal
* preempt_enable, but preempt_enable_no_resched instead.
@@ -664,4 +775,118 @@ static inline void trace_branch_disable(void)
}
#endif /* CONFIG_BRANCH_TRACER */
/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);
/* trace event type bit fields, not numeric */
enum {
TRACE_EVENT_TYPE_PRINTF = 1,
TRACE_EVENT_TYPE_RAW = 2,
};
struct ftrace_event_field {
struct list_head link;
char *name;
char *type;
int offset;
int size;
};
struct ftrace_event_call {
char *name;
char *system;
struct dentry *dir;
int enabled;
int (*regfunc)(void);
void (*unregfunc)(void);
int id;
int (*raw_init)(void);
int (*show_format)(struct trace_seq *s);
int (*define_fields)(void);
struct list_head fields;
struct filter_pred **preds;
#ifdef CONFIG_EVENT_PROFILE
atomic_t profile_count;
int (*profile_enable)(struct ftrace_event_call *);
void (*profile_disable)(struct ftrace_event_call *);
#endif
};
struct event_subsystem {
struct list_head list;
const char *name;
struct dentry *entry;
struct filter_pred **preds;
};
#define events_for_each(event) \
for (event = __start_ftrace_events; \
(unsigned long)event < (unsigned long)__stop_ftrace_events; \
event++)
#define MAX_FILTER_PRED 8
struct filter_pred;
typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
struct filter_pred {
filter_pred_fn_t fn;
u64 val;
char *str_val;
int str_len;
char *field_name;
int offset;
int not;
int or;
int compound;
int clear;
};
int trace_define_field(struct ftrace_event_call *call, char *type,
char *name, int offset, int size);
extern void filter_free_pred(struct filter_pred *pred);
extern void filter_print_preds(struct filter_pred **preds,
struct trace_seq *s);
extern int filter_parse(char **pbuf, struct filter_pred *pred);
extern int filter_add_pred(struct ftrace_event_call *call,
struct filter_pred *pred);
extern void filter_free_preds(struct ftrace_event_call *call);
extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
extern void filter_free_subsystem_preds(struct event_subsystem *system);
extern int filter_add_subsystem_pred(struct event_subsystem *system,
struct filter_pred *pred);
void event_trace_printk(unsigned long ip, const char *fmt, ...);
extern struct ftrace_event_call __start_ftrace_events[];
extern struct ftrace_event_call __stop_ftrace_events[];
#define for_each_event(event) \
for (event = __start_ftrace_events; \
(unsigned long)event < (unsigned long)__stop_ftrace_events; \
event++)
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
/*
* The double __builtin_constant_p is because gcc will give us an error
* if we try to allocate the static variable to fmt if it is not a
* constant. Even with the outer if statement optimizing out.
*/
#define event_trace_printk(ip, fmt, args...) \
do { \
__trace_printk_check_format(fmt, ##args); \
tracing_record_cmdline(current); \
if (__builtin_constant_p(fmt)) { \
static const char *trace_printk_fmt \
__attribute__((section("__trace_printk_fmt"))) = \
__builtin_constant_p(fmt) ? fmt : NULL; \
\
__trace_bprintk(ip, trace_printk_fmt, ##args); \
} else \
__trace_printk(ip, fmt, ##args); \
} while (0)
#endif /* _LINUX_KERNEL_TRACE_H */

View File

@@ -11,6 +11,7 @@
#include <linux/kallsyms.h>
#include "trace.h"
#include "trace_output.h"
static struct trace_array *boot_trace;
static bool pre_initcalls_finished;
@@ -27,13 +28,13 @@ void start_boot_trace(void)
void enable_boot_trace(void)
{
if (pre_initcalls_finished)
if (boot_trace && pre_initcalls_finished)
tracing_start_sched_switch_record();
}
void disable_boot_trace(void)
{
if (pre_initcalls_finished)
if (boot_trace && pre_initcalls_finished)
tracing_stop_sched_switch_record();
}
@@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
int cpu;
boot_trace = tr;
if (!tr)
return 0;
for_each_cpu(cpu, cpu_possible_mask)
tracing_reset(tr, cpu);
@@ -128,10 +132,9 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
{
struct ring_buffer_event *event;
struct trace_boot_call *entry;
unsigned long irq_flags;
struct trace_array *tr = boot_trace;
if (!pre_initcalls_finished)
if (!tr || !pre_initcalls_finished)
return;
/* Get its name now since this function could
@@ -140,18 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_BOOT_CALL;
entry->boot_call = *bt;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}
@@ -160,27 +158,21 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
{
struct ring_buffer_event *event;
struct trace_boot_ret *entry;
unsigned long irq_flags;
struct trace_array *tr = boot_trace;
if (!pre_initcalls_finished)
if (!tr || !pre_initcalls_finished)
return;
sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_BOOT_RET;
entry->boot_ret = *bt;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}

View File

@@ -14,12 +14,17 @@
#include <linux/hash.h>
#include <linux/fs.h>
#include <asm/local.h>
#include "trace.h"
#include "trace_stat.h"
#include "trace_output.h"
#ifdef CONFIG_BRANCH_TRACER
static struct tracer branch_trace;
static int branch_tracing_enabled __read_mostly;
static DEFINE_MUTEX(branch_tracing_mutex);
static struct trace_array *branch_tracer;
static void
@@ -28,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
struct trace_array *tr = branch_tracer;
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags, irq_flags;
unsigned long flags;
int cpu, pc;
const char *p;
@@ -47,15 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
goto out;
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
pc = preempt_count();
event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
goto out;
pc = preempt_count();
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, flags, pc);
entry->ent.type = TRACE_BRANCH;
/* Strip off the path, only save the file */
p = f->file + strlen(f->file);
@@ -70,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
ring_buffer_unlock_commit(tr->buffer, event);
out:
atomic_dec(&tr->data[cpu]->disabled);
@@ -88,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
int enable_branch_tracing(struct trace_array *tr)
{
int ret = 0;
mutex_lock(&branch_tracing_mutex);
branch_tracer = tr;
/*
@@ -100,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
branch_tracing_enabled++;
mutex_unlock(&branch_tracing_mutex);
return ret;
return 0;
}
void disable_branch_tracing(void)
@@ -128,11 +129,6 @@ static void stop_branch_trace(struct trace_array *tr)
static int branch_trace_init(struct trace_array *tr)
{
int cpu;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
start_branch_trace(tr);
return 0;
}
@@ -142,22 +138,53 @@ static void branch_trace_reset(struct trace_array *tr)
stop_branch_trace(tr);
}
struct tracer branch_trace __read_mostly =
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
int flags)
{
struct trace_branch *field;
trace_assign_type(field, iter->ent);
if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
field->correct ? " ok " : " MISS ",
field->func,
field->file,
field->line))
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
static struct trace_event trace_branch_event = {
.type = TRACE_BRANCH,
.trace = trace_branch_print,
};
static struct tracer branch_trace __read_mostly =
{
.name = "branch",
.init = branch_trace_init,
.reset = branch_trace_reset,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_branch,
#endif
#endif /* CONFIG_FTRACE_SELFTEST */
};
__init static int init_branch_trace(void)
__init static int init_branch_tracer(void)
{
int ret;
ret = register_ftrace_event(&trace_branch_event);
if (!ret) {
printk(KERN_WARNING "Warning: could not register "
"branch events\n");
return 1;
}
return register_tracer(&branch_trace);
}
device_initcall(init_branch_tracer);
device_initcall(init_branch_trace);
#else
static inline
void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -183,65 +210,38 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
}
EXPORT_SYMBOL(ftrace_likely_update);
struct ftrace_pointer {
void *start;
void *stop;
int hit;
};
extern unsigned long __start_annotated_branch_profile[];
extern unsigned long __stop_annotated_branch_profile[];
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
static int annotated_branch_stat_headers(struct seq_file *m)
{
const struct ftrace_pointer *f = m->private;
struct ftrace_branch_data *p = v;
(*pos)++;
if (v == (void *)1)
return f->start;
++p;
if ((void *)p >= (void *)f->stop)
return NULL;
return p;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
void *t = (void *)1;
loff_t l = 0;
for (; t && l < *pos; t = t_next(m, t, &l))
;
return t;
}
static void t_stop(struct seq_file *m, void *p)
{
}
static int t_show(struct seq_file *m, void *v)
{
const struct ftrace_pointer *fp = m->private;
struct ftrace_branch_data *p = v;
const char *f;
long percent;
if (v == (void *)1) {
if (fp->hit)
seq_printf(m, " miss hit %% ");
else
seq_printf(m, " correct incorrect %% ");
seq_printf(m, " Function "
seq_printf(m, " correct incorrect %% ");
seq_printf(m, " Function "
" File Line\n"
" ------- --------- - "
" -------- "
" ---- ----\n");
return 0;
}
return 0;
}
static inline long get_incorrect_percent(struct ftrace_branch_data *p)
{
long percent;
if (p->correct) {
percent = p->incorrect * 100;
percent /= p->correct + p->incorrect;
} else
percent = p->incorrect ? 100 : -1;
return percent;
}
static int branch_stat_show(struct seq_file *m, void *v)
{
struct ftrace_branch_data *p = v;
const char *f;
long percent;
/* Only print the file, not the path */
f = p->file + strlen(p->file);
@@ -252,11 +252,7 @@ static int t_show(struct seq_file *m, void *v)
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
if (p->correct) {
percent = p->incorrect * 100;
percent /= p->correct + p->incorrect;
} else
percent = p->incorrect ? 100 : -1;
percent = get_incorrect_percent(p);
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
@@ -267,76 +263,118 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations tracing_likely_seq_ops = {
.start = t_start,
.next = t_next,
.stop = t_stop,
.show = t_show,
static void *annotated_branch_stat_start(void)
{
return __start_annotated_branch_profile;
}
static void *
annotated_branch_stat_next(void *v, int idx)
{
struct ftrace_branch_data *p = v;
++p;
if ((void *)p >= (void *)__stop_annotated_branch_profile)
return NULL;
return p;
}
static int annotated_branch_stat_cmp(void *p1, void *p2)
{
struct ftrace_branch_data *a = p1;
struct ftrace_branch_data *b = p2;
long percent_a, percent_b;
percent_a = get_incorrect_percent(a);
percent_b = get_incorrect_percent(b);
if (percent_a < percent_b)
return -1;
if (percent_a > percent_b)
return 1;
else
return 0;
}
static struct tracer_stat annotated_branch_stats = {
.name = "branch_annotated",
.stat_start = annotated_branch_stat_start,
.stat_next = annotated_branch_stat_next,
.stat_cmp = annotated_branch_stat_cmp,
.stat_headers = annotated_branch_stat_headers,
.stat_show = branch_stat_show
};
static int tracing_branch_open(struct inode *inode, struct file *file)
__init static int init_annotated_branch_stats(void)
{
int ret;
ret = seq_open(file, &tracing_likely_seq_ops);
ret = register_stat_tracer(&annotated_branch_stats);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = (void *)inode->i_private;
printk(KERN_WARNING "Warning: could not register "
"annotated branches stats\n");
return 1;
}
return ret;
return 0;
}
static const struct file_operations tracing_branch_fops = {
.open = tracing_branch_open,
.read = seq_read,
.llseek = seq_lseek,
};
fs_initcall(init_annotated_branch_stats);
#ifdef CONFIG_PROFILE_ALL_BRANCHES
extern unsigned long __start_branch_profile[];
extern unsigned long __stop_branch_profile[];
static const struct ftrace_pointer ftrace_branch_pos = {
.start = __start_branch_profile,
.stop = __stop_branch_profile,
.hit = 1,
};
#endif /* CONFIG_PROFILE_ALL_BRANCHES */
extern unsigned long __start_annotated_branch_profile[];
extern unsigned long __stop_annotated_branch_profile[];
static const struct ftrace_pointer ftrace_annotated_branch_pos = {
.start = __start_annotated_branch_profile,
.stop = __stop_annotated_branch_profile,
};
static __init int ftrace_branch_init(void)
static int all_branch_stat_headers(struct seq_file *m)
{
struct dentry *d_tracer;
struct dentry *entry;
d_tracer = tracing_init_dentry();
entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
(void *)&ftrace_annotated_branch_pos,
&tracing_branch_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'profile_annotatet_branch' entry\n");
#ifdef CONFIG_PROFILE_ALL_BRANCHES
entry = debugfs_create_file("profile_branch", 0444, d_tracer,
(void *)&ftrace_branch_pos,
&tracing_branch_fops);
if (!entry)
pr_warning("Could not create debugfs"
" 'profile_branch' entry\n");
#endif
seq_printf(m, " miss hit %% ");
seq_printf(m, " Function "
" File Line\n"
" ------- --------- - "
" -------- "
" ---- ----\n");
return 0;
}
device_initcall(ftrace_branch_init);
static void *all_branch_stat_start(void)
{
return __start_branch_profile;
}
static void *
all_branch_stat_next(void *v, int idx)
{
struct ftrace_branch_data *p = v;
++p;
if ((void *)p >= (void *)__stop_branch_profile)
return NULL;
return p;
}
static struct tracer_stat all_branch_stats = {
.name = "branch_all",
.stat_start = all_branch_stat_start,
.stat_next = all_branch_stat_next,
.stat_headers = all_branch_stat_headers,
.stat_show = branch_stat_show
};
__init static int all_annotated_branch_stats(void)
{
int ret;
ret = register_stat_tracer(&all_branch_stats);
if (!ret) {
printk(KERN_WARNING "Warning: could not register "
"all branches stats\n");
return 1;
}
return 0;
}
fs_initcall(all_annotated_branch_stats);
#endif /* CONFIG_PROFILE_ALL_BRANCHES */

109
kernel/trace/trace_clock.c Normal file
View File

@@ -0,0 +1,109 @@
/*
* tracing clocks
*
* Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Implements 3 trace clock variants, with differing scalability/precision
* tradeoffs:
*
* - local: CPU-local trace clock
* - medium: scalable global clock with some jitter
* - global: globally monotonic, serialized clock
*
* Tracer plugins will chose a default from these clocks.
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/ktime.h>
#include <linux/trace_clock.h>
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
* Useful for tracing that does not cross to other CPUs nor
* does it go through idle events.
*/
u64 notrace trace_clock_local(void)
{
unsigned long flags;
u64 clock;
/*
* sched_clock() is an architecture implemented, fast, scalable,
* lockless clock. It is not guaranteed to be coherent across
* CPUs, nor across CPU idle events.
*/
raw_local_irq_save(flags);
clock = sched_clock();
raw_local_irq_restore(flags);
return clock;
}
/*
* trace_clock(): 'inbetween' trace clock. Not completely serialized,
* but not completely incorrect when crossing CPUs either.
*
* This is based on cpu_clock(), which will allow at most ~1 jiffy of
* jitter between CPUs. So it's a pretty scalable clock, but there
* can be offsets in the trace data.
*/
u64 notrace trace_clock(void)
{
return cpu_clock(raw_smp_processor_id());
}
/*
* trace_clock_global(): special globally coherent trace clock
*
* It has higher overhead than the other trace clocks but is still
* an order of magnitude faster than GTOD derived hardware clocks.
*
* Used by plugins that need globally coherent timestamps.
*/
static u64 prev_trace_clock_time;
static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
u64 now;
raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
now = cpu_clock(this_cpu);
/*
* If in an NMI context then dont risk lockups and return the
* cpu_clock() time:
*/
if (unlikely(in_nmi()))
goto out;
__raw_spin_lock(&trace_clock_lock);
/*
* TODO: if this happens often then maybe we should reset
* my_scd->clock to prev_trace_clock_time+1, to make sure
* we start ticking with the local clock from now on?
*/
if ((s64)(now - prev_trace_clock_time) < 0)
now = prev_trace_clock_time + 1;
prev_trace_clock_time = now;
__raw_spin_unlock(&trace_clock_lock);
out:
raw_local_irq_restore(flags);
return now;
}

View File

@@ -0,0 +1,31 @@
/*
* trace event based perf counter profiling
*
* Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
*
*/
#include "trace.h"
int ftrace_profile_enable(int event_id)
{
struct ftrace_event_call *event;
for_each_event(event) {
if (event->id == event_id)
return event->profile_enable(event);
}
return -EINVAL;
}
void ftrace_profile_disable(int event_id)
{
struct ftrace_event_call *event;
for_each_event(event) {
if (event->id == event_id)
return event->profile_disable(event);
}
}

View File

@@ -0,0 +1,173 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ftrace
/*
* We cheat and use the proto type field as the ID
* and args as the entry type (minus 'struct')
*/
TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD(unsigned long, parent_ip, parent_ip)
),
TP_RAW_FMT(" %lx <-- %lx")
);
TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
ftrace_graph_ent_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, graph_ent.func, func)
TRACE_FIELD(int, graph_ent.depth, depth)
),
TP_RAW_FMT("--> %lx (%d)")
);
TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
ftrace_graph_ret_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ret.func, func)
TRACE_FIELD(int, ret.depth, depth)
),
TP_RAW_FMT("<-- %lx (%d)")
);
TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, prev_pid, prev_pid)
TRACE_FIELD(unsigned char, prev_prio, prev_prio)
TRACE_FIELD(unsigned char, prev_state, prev_state)
TRACE_FIELD(unsigned int, next_pid, next_pid)
TRACE_FIELD(unsigned char, next_prio, next_prio)
TRACE_FIELD(unsigned char, next_state, next_state)
TRACE_FIELD(unsigned int, next_cpu, next_cpu)
),
TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
);
TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, prev_pid, prev_pid)
TRACE_FIELD(unsigned char, prev_prio, prev_prio)
TRACE_FIELD(unsigned char, prev_state, prev_state)
TRACE_FIELD(unsigned int, next_pid, next_pid)
TRACE_FIELD(unsigned char, next_prio, next_prio)
TRACE_FIELD(unsigned char, next_state, next_state)
TRACE_FIELD(unsigned int, next_cpu, next_cpu)
),
TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
);
TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, arg1, arg1)
TRACE_FIELD(unsigned long, arg2, arg2)
TRACE_FIELD(unsigned long, arg3, arg3)
),
TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
);
/*
* Stack-trace entry:
*/
/* #define FTRACE_STACK_ENTRIES 8 */
TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, caller[0], stack0)
TRACE_FIELD(unsigned long, caller[1], stack1)
TRACE_FIELD(unsigned long, caller[2], stack2)
TRACE_FIELD(unsigned long, caller[3], stack3)
TRACE_FIELD(unsigned long, caller[4], stack4)
TRACE_FIELD(unsigned long, caller[5], stack5)
TRACE_FIELD(unsigned long, caller[6], stack6)
TRACE_FIELD(unsigned long, caller[7], stack7)
),
TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
"\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
);
TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, caller[0], stack0)
TRACE_FIELD(unsigned long, caller[1], stack1)
TRACE_FIELD(unsigned long, caller[2], stack2)
TRACE_FIELD(unsigned long, caller[3], stack3)
TRACE_FIELD(unsigned long, caller[4], stack4)
TRACE_FIELD(unsigned long, caller[5], stack5)
TRACE_FIELD(unsigned long, caller[6], stack6)
TRACE_FIELD(unsigned long, caller[7], stack7)
),
TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
"\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
);
TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD(char *, fmt, fmt)
TRACE_FIELD_ZERO_CHAR(buf)
),
TP_RAW_FMT("%08lx (%d) fmt:%p %s")
);
TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD_ZERO_CHAR(buf)
),
TP_RAW_FMT("%08lx (%d) fmt:%p %s")
);
TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, line, line)
TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
TRACE_FIELD(char, correct, correct)
),
TP_RAW_FMT("%u:%s:%s (%u)")
);
TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(u64, from, from)
TRACE_FIELD(u64, to, to)
),
TP_RAW_FMT("from: %llx to: %llx")
);
TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
TRACE_STRUCT(
TRACE_FIELD(ktime_t, state_data.stamp, stamp)
TRACE_FIELD(ktime_t, state_data.end, end)
TRACE_FIELD(int, state_data.type, type)
TRACE_FIELD(int, state_data.state, state)
),
TP_RAW_FMT("%llx->%llx type:%u state:%u")
);
TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
TRACE_FIELD(unsigned long, call_site, call_site)
TRACE_FIELD(const void *, ptr, ptr)
TRACE_FIELD(size_t, bytes_req, bytes_req)
TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
TRACE_FIELD(int, node, node)
),
TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
" flags:%x node:%d")
);
TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
TRACE_FIELD(unsigned long, call_site, call_site)
TRACE_FIELD(const void *, ptr, ptr)
),
TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
);
#undef TRACE_SYSTEM

824
kernel/trace/trace_events.c Normal file
View File

@@ -0,0 +1,824 @@
/*
* event tracer
*
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
*
* - Added format output of fields of the trace point.
* This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
*
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include "trace_output.h"
#define TRACE_SYSTEM "TRACE_SYSTEM"
static DEFINE_MUTEX(event_mutex);
int trace_define_field(struct ftrace_event_call *call, char *type,
char *name, int offset, int size)
{
struct ftrace_event_field *field;
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
goto err;
field->name = kstrdup(name, GFP_KERNEL);
if (!field->name)
goto err;
field->type = kstrdup(type, GFP_KERNEL);
if (!field->type)
goto err;
field->offset = offset;
field->size = size;
list_add(&field->link, &call->fields);
return 0;
err:
if (field) {
kfree(field->name);
kfree(field->type);
}
kfree(field);
return -ENOMEM;
}
static void ftrace_clear_events(void)
{
struct ftrace_event_call *call = (void *)__start_ftrace_events;
while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
if (call->enabled) {
call->enabled = 0;
call->unregfunc();
}
call++;
}
}
static void ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
switch (enable) {
case 0:
if (call->enabled) {
call->enabled = 0;
call->unregfunc();
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
call->regfunc();
}
break;
}
}
static int ftrace_set_clr_event(char *buf, int set)
{
struct ftrace_event_call *call = __start_ftrace_events;
char *event = NULL, *sub = NULL, *match;
int ret = -EINVAL;
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
* :<event-name> is the same.
*
* <subsystem>:* means all events in that subsystem
* <subsystem>: means the same.
*
* <name> (no ':') means all events in a subsystem with
* the name <name> or any event that matches <name>
*/
match = strsep(&buf, ":");
if (buf) {
sub = match;
event = buf;
match = NULL;
if (!strlen(sub) || strcmp(sub, "*") == 0)
sub = NULL;
if (!strlen(event) || strcmp(event, "*") == 0)
event = NULL;
}
mutex_lock(&event_mutex);
for_each_event(call) {
if (!call->name || !call->regfunc)
continue;
if (match &&
strcmp(match, call->name) != 0 &&
strcmp(match, call->system) != 0)
continue;
if (sub && strcmp(sub, call->system) != 0)
continue;
if (event && strcmp(event, call->name) != 0)
continue;
ftrace_event_enable_disable(call, set);
ret = 0;
}
mutex_unlock(&event_mutex);
return ret;
}
/* 128 should be much more than enough */
#define EVENT_BUF_SIZE 127
static ssize_t
ftrace_event_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
size_t read = 0;
int i, set = 1;
ssize_t ret;
char *buf;
char ch;
if (!cnt || cnt < 0)
return 0;
ret = tracing_update_buffers();
if (ret < 0)
return ret;
ret = get_user(ch, ubuf++);
if (ret)
return ret;
read++;
cnt--;
/* skip white space */
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
return ret;
read++;
cnt--;
}
/* Only white space found? */
if (isspace(ch)) {
file->f_pos += read;
ret = read;
return ret;
}
buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (cnt > EVENT_BUF_SIZE)
cnt = EVENT_BUF_SIZE;
i = 0;
while (cnt && !isspace(ch)) {
if (!i && ch == '!')
set = 0;
else
buf[i++] = ch;
ret = get_user(ch, ubuf++);
if (ret)
goto out_free;
read++;
cnt--;
}
buf[i] = 0;
file->f_pos += read;
ret = ftrace_set_clr_event(buf, set);
if (ret)
goto out_free;
ret = read;
out_free:
kfree(buf);
return ret;
}
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_event_call *call = m->private;
struct ftrace_event_call *next = call;
(*pos)++;
for (;;) {
if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
return NULL;
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
if (call->regfunc)
break;
call++;
next = call;
}
m->private = ++next;
return call;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
return t_next(m, NULL, pos);
}
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_event_call *call = m->private;
struct ftrace_event_call *next;
(*pos)++;
retry:
if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
return NULL;
if (!call->enabled) {
call++;
goto retry;
}
next = call;
m->private = ++next;
return call;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
return s_next(m, NULL, pos);
}
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_event_call *call = v;
if (strcmp(call->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->system);
seq_printf(m, "%s\n", call->name);
return 0;
}
static void t_stop(struct seq_file *m, void *p)
{
}
static int
ftrace_event_seq_open(struct inode *inode, struct file *file)
{
int ret;
const struct seq_operations *seq_ops;
if ((file->f_mode & FMODE_WRITE) &&
!(file->f_flags & O_APPEND))
ftrace_clear_events();
seq_ops = inode->i_private;
ret = seq_open(file, seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = __start_ftrace_events;
}
return ret;
}
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
char *buf;
if (call->enabled)
buf = "1\n";
else
buf = "0\n";
return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
}
static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
char buf[64];
unsigned long val;
int ret;
if (cnt >= sizeof(buf))
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
ret = strict_strtoul(buf, 10, &val);
if (ret < 0)
return ret;
ret = tracing_update_buffers();
if (ret < 0)
return ret;
switch (val) {
case 0:
case 1:
mutex_lock(&event_mutex);
ftrace_event_enable_disable(call, val);
mutex_unlock(&event_mutex);
break;
default:
return -EINVAL;
}
*ppos += cnt;
return cnt;
}
#undef FIELD
#define FIELD(type, name) \
#type, "common_" #name, offsetof(typeof(field), name), \
sizeof(field.name)
static int trace_write_header(struct trace_seq *s)
{
struct trace_entry field;
/* struct trace_entry */
return trace_seq_printf(s,
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\n",
FIELD(unsigned char, type),
FIELD(unsigned char, flags),
FIELD(unsigned char, preempt_count),
FIELD(int, pid),
FIELD(int, tgid));
}
static ssize_t
event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
struct trace_seq *s;
char *buf;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
/* If any of the first writes fail, so will the show_format. */
trace_seq_printf(s, "name: %s\n", call->name);
trace_seq_printf(s, "ID: %d\n", call->id);
trace_seq_printf(s, "format:\n");
trace_write_header(s);
r = call->show_format(s);
if (!r) {
/*
* ug! The format output is bigger than a PAGE!!
*/
buf = "FORMAT TOO BIG\n";
r = simple_read_from_buffer(ubuf, cnt, ppos,
buf, strlen(buf));
goto out;
}
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
out:
kfree(s);
return r;
}
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
struct trace_seq *s;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
trace_seq_printf(s, "%d\n", call->id);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
kfree(s);
return r;
}
static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
struct trace_seq *s;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
filter_print_preds(call->preds, s);
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
return r;
}
static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
char buf[64], *pbuf = buf;
struct filter_pred *pred;
int err;
if (cnt >= sizeof(buf))
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
pred = kzalloc(sizeof(*pred), GFP_KERNEL);
if (!pred)
return -ENOMEM;
err = filter_parse(&pbuf, pred);
if (err < 0) {
filter_free_pred(pred);
return err;
}
if (pred->clear) {
filter_free_preds(call);
filter_free_pred(pred);
return cnt;
}
if (filter_add_pred(call, pred)) {
filter_free_pred(pred);
return -EINVAL;
}
*ppos += cnt;
return cnt;
}
static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct event_subsystem *system = filp->private_data;
struct trace_seq *s;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
filter_print_preds(system->preds, s);
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
return r;
}
static ssize_t
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct event_subsystem *system = filp->private_data;
char buf[64], *pbuf = buf;
struct filter_pred *pred;
int err;
if (cnt >= sizeof(buf))
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
pred = kzalloc(sizeof(*pred), GFP_KERNEL);
if (!pred)
return -ENOMEM;
err = filter_parse(&pbuf, pred);
if (err < 0) {
filter_free_pred(pred);
return err;
}
if (pred->clear) {
filter_free_subsystem_preds(system);
filter_free_pred(pred);
return cnt;
}
if (filter_add_subsystem_pred(system, pred)) {
filter_free_subsystem_preds(system);
filter_free_pred(pred);
return -EINVAL;
}
*ppos += cnt;
return cnt;
}
static const struct seq_operations show_event_seq_ops = {
.start = t_start,
.next = t_next,
.show = t_show,
.stop = t_stop,
};
static const struct seq_operations show_set_event_seq_ops = {
.start = s_start,
.next = s_next,
.show = t_show,
.stop = t_stop,
};
static const struct file_operations ftrace_avail_fops = {
.open = ftrace_event_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct file_operations ftrace_set_event_fops = {
.open = ftrace_event_seq_open,
.read = seq_read,
.write = ftrace_event_write,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
};
static const struct file_operations ftrace_event_format_fops = {
.open = tracing_open_generic,
.read = event_format_read,
};
static const struct file_operations ftrace_event_id_fops = {
.open = tracing_open_generic,
.read = event_id_read,
};
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_generic,
.read = event_filter_read,
.write = event_filter_write,
};
static const struct file_operations ftrace_subsystem_filter_fops = {
.open = tracing_open_generic,
.read = subsystem_filter_read,
.write = subsystem_filter_write,
};
static struct dentry *event_trace_events_dir(void)
{
static struct dentry *d_tracer;
static struct dentry *d_events;
if (d_events)
return d_events;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return NULL;
d_events = debugfs_create_dir("events", d_tracer);
if (!d_events)
pr_warning("Could not create debugfs "
"'events' directory\n");
return d_events;
}
static LIST_HEAD(event_subsystems);
static struct dentry *
event_subsystem_dir(const char *name, struct dentry *d_events)
{
struct event_subsystem *system;
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
if (strcmp(system->name, name) == 0)
return system->entry;
}
/* need to create new entry */
system = kmalloc(sizeof(*system), GFP_KERNEL);
if (!system) {
pr_warning("No memory to create event subsystem %s\n",
name);
return d_events;
}
system->entry = debugfs_create_dir(name, d_events);
if (!system->entry) {
pr_warning("Could not create event subsystem %s\n",
name);
kfree(system);
return d_events;
}
system->name = name;
list_add(&system->list, &event_subsystems);
system->preds = NULL;
return system->entry;
}
static int
event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
{
struct dentry *entry;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
if (strcmp(call->system, "TRACE_SYSTEM") != 0)
d_events = event_subsystem_dir(call->system, d_events);
if (call->raw_init) {
ret = call->raw_init();
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
return ret;
}
}
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
pr_warning("Could not create debugfs "
"'%s' directory\n", call->name);
return -1;
}
if (call->regfunc) {
entry = debugfs_create_file("enable", 0644, call->dir, call,
&ftrace_enable_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'%s/enable' entry\n", call->name);
}
if (call->id) {
entry = debugfs_create_file("id", 0444, call->dir, call,
&ftrace_event_id_fops);
if (!entry)
pr_warning("Could not create debugfs '%s/id' entry\n",
call->name);
}
if (call->define_fields) {
ret = call->define_fields();
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
return ret;
}
entry = debugfs_create_file("filter", 0644, call->dir, call,
&ftrace_event_filter_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'%s/filter' entry\n", call->name);
}
/* A trace may not want to export its format */
if (!call->show_format)
return 0;
entry = debugfs_create_file("format", 0444, call->dir, call,
&ftrace_event_format_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'%s/format' entry\n", call->name);
return 0;
}
static __init int event_trace_init(void)
{
struct ftrace_event_call *call = __start_ftrace_events;
struct dentry *d_tracer;
struct dentry *entry;
struct dentry *d_events;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return 0;
entry = debugfs_create_file("available_events", 0444, d_tracer,
(void *)&show_event_seq_ops,
&ftrace_avail_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'available_events' entry\n");
entry = debugfs_create_file("set_event", 0644, d_tracer,
(void *)&show_set_event_seq_ops,
&ftrace_set_event_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'set_event' entry\n");
d_events = event_trace_events_dir();
if (!d_events)
return 0;
for_each_event(call) {
/* The linker may leave blanks */
if (!call->name)
continue;
event_create_dir(call, d_events);
}
return 0;
}
fs_initcall(event_trace_init);

View File

@@ -0,0 +1,427 @@
/*
* trace_events_filter - generic event filtering
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include "trace.h"
#include "trace_output.h"
static int filter_pred_64(struct filter_pred *pred, void *event)
{
u64 *addr = (u64 *)(event + pred->offset);
u64 val = (u64)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_32(struct filter_pred *pred, void *event)
{
u32 *addr = (u32 *)(event + pred->offset);
u32 val = (u32)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_16(struct filter_pred *pred, void *event)
{
u16 *addr = (u16 *)(event + pred->offset);
u16 val = (u16)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_8(struct filter_pred *pred, void *event)
{
u8 *addr = (u8 *)(event + pred->offset);
u8 val = (u8)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_string(struct filter_pred *pred, void *event)
{
char *addr = (char *)(event + pred->offset);
int cmp, match;
cmp = strncmp(addr, pred->str_val, pred->str_len);
match = (!cmp) ^ pred->not;
return match;
}
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct ftrace_event_call *call, void *rec)
{
int i, matched, and_failed = 0;
struct filter_pred *pred;
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (call->preds[i]) {
pred = call->preds[i];
if (and_failed && !pred->or)
continue;
matched = pred->fn(pred, rec);
if (!matched && !pred->or) {
and_failed = 1;
continue;
} else if (matched && pred->or)
return 1;
} else
break;
}
if (and_failed)
return 0;
return 1;
}
void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
{
char *field_name;
struct filter_pred *pred;
int i;
if (!preds) {
trace_seq_printf(s, "none\n");
return;
}
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (preds[i]) {
pred = preds[i];
field_name = pred->field_name;
if (i)
trace_seq_printf(s, pred->or ? "|| " : "&& ");
trace_seq_printf(s, "%s ", field_name);
trace_seq_printf(s, pred->not ? "!= " : "== ");
if (pred->str_val)
trace_seq_printf(s, "%s\n", pred->str_val);
else
trace_seq_printf(s, "%llu\n", pred->val);
} else
break;
}
}
static struct ftrace_event_field *
find_event_field(struct ftrace_event_call *call, char *name)
{
struct ftrace_event_field *field;
list_for_each_entry(field, &call->fields, link) {
if (!strcmp(field->name, name))
return field;
}
return NULL;
}
void filter_free_pred(struct filter_pred *pred)
{
if (!pred)
return;
kfree(pred->field_name);
kfree(pred->str_val);
kfree(pred);
}
void filter_free_preds(struct ftrace_event_call *call)
{
int i;
if (call->preds) {
for (i = 0; i < MAX_FILTER_PRED; i++)
filter_free_pred(call->preds[i]);
kfree(call->preds);
call->preds = NULL;
}
}
void filter_free_subsystem_preds(struct event_subsystem *system)
{
struct ftrace_event_call *call = __start_ftrace_events;
int i;
if (system->preds) {
for (i = 0; i < MAX_FILTER_PRED; i++)
filter_free_pred(system->preds[i]);
kfree(system->preds);
system->preds = NULL;
}
events_for_each(call) {
if (!call->name || !call->regfunc)
continue;
if (!strcmp(call->system, system->name))
filter_free_preds(call);
}
}
static int __filter_add_pred(struct ftrace_event_call *call,
struct filter_pred *pred)
{
int i;
if (call->preds && !pred->compound)
filter_free_preds(call);
if (!call->preds) {
call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
GFP_KERNEL);
if (!call->preds)
return -ENOMEM;
}
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (!call->preds[i]) {
call->preds[i] = pred;
return 0;
}
}
return -ENOMEM;
}
static int is_string_field(const char *type)
{
if (strchr(type, '[') && strstr(type, "char"))
return 1;
return 0;
}
int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
{
struct ftrace_event_field *field;
field = find_event_field(call, pred->field_name);
if (!field)
return -EINVAL;
pred->offset = field->offset;
if (is_string_field(field->type)) {
if (!pred->str_val)
return -EINVAL;
pred->fn = filter_pred_string;
pred->str_len = field->size;
return __filter_add_pred(call, pred);
} else {
if (pred->str_val)
return -EINVAL;
}
switch (field->size) {
case 8:
pred->fn = filter_pred_64;
break;
case 4:
pred->fn = filter_pred_32;
break;
case 2:
pred->fn = filter_pred_16;
break;
case 1:
pred->fn = filter_pred_8;
break;
default:
return -EINVAL;
}
return __filter_add_pred(call, pred);
}
static struct filter_pred *copy_pred(struct filter_pred *pred)
{
struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
if (!new_pred)
return NULL;
memcpy(new_pred, pred, sizeof(*pred));
if (pred->field_name) {
new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
if (!new_pred->field_name) {
kfree(new_pred);
return NULL;
}
}
if (pred->str_val) {
new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
if (!new_pred->str_val) {
filter_free_pred(new_pred);
return NULL;
}
}
return new_pred;
}
int filter_add_subsystem_pred(struct event_subsystem *system,
struct filter_pred *pred)
{
struct ftrace_event_call *call = __start_ftrace_events;
struct filter_pred *event_pred;
int i;
if (system->preds && !pred->compound)
filter_free_subsystem_preds(system);
if (!system->preds) {
system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
GFP_KERNEL);
if (!system->preds)
return -ENOMEM;
}
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (!system->preds[i]) {
system->preds[i] = pred;
break;
}
}
if (i == MAX_FILTER_PRED)
return -EINVAL;
events_for_each(call) {
int err;
if (!call->name || !call->regfunc)
continue;
if (strcmp(call->system, system->name))
continue;
if (!find_event_field(call, pred->field_name))
continue;
event_pred = copy_pred(pred);
if (!event_pred)
goto oom;
err = filter_add_pred(call, event_pred);
if (err)
filter_free_pred(event_pred);
if (err == -ENOMEM)
goto oom;
}
return 0;
oom:
system->preds[i] = NULL;
return -ENOMEM;
}
int filter_parse(char **pbuf, struct filter_pred *pred)
{
char *tmp, *tok, *val_str = NULL;
int tok_n = 0;
/* field ==/!= number, or/and field ==/!= number, number */
while ((tok = strsep(pbuf, " \n"))) {
if (tok_n == 0) {
if (!strcmp(tok, "0")) {
pred->clear = 1;
return 0;
} else if (!strcmp(tok, "&&")) {
pred->or = 0;
pred->compound = 1;
} else if (!strcmp(tok, "||")) {
pred->or = 1;
pred->compound = 1;
} else
pred->field_name = tok;
tok_n = 1;
continue;
}
if (tok_n == 1) {
if (!pred->field_name)
pred->field_name = tok;
else if (!strcmp(tok, "!="))
pred->not = 1;
else if (!strcmp(tok, "=="))
pred->not = 0;
else {
pred->field_name = NULL;
return -EINVAL;
}
tok_n = 2;
continue;
}
if (tok_n == 2) {
if (pred->compound) {
if (!strcmp(tok, "!="))
pred->not = 1;
else if (!strcmp(tok, "=="))
pred->not = 0;
else {
pred->field_name = NULL;
return -EINVAL;
}
} else {
val_str = tok;
break; /* done */
}
tok_n = 3;
continue;
}
if (tok_n == 3) {
val_str = tok;
break; /* done */
}
}
pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
if (!pred->field_name)
return -ENOMEM;
pred->val = simple_strtoull(val_str, &tmp, 10);
if (tmp == val_str) {
pred->str_val = kstrdup(val_str, GFP_KERNEL);
if (!pred->str_val)
return -ENOMEM;
}
return 0;
}

View File

@@ -0,0 +1,39 @@
/*
* Stage 1 of the trace events.
*
* Override the macros in <trace/trace_event_types.h> to include the following:
*
* struct ftrace_raw_<call> {
* struct trace_entry ent;
* <type> <item>;
* <type2> <item2>[<len>];
* [...]
* };
*
* The <type> <item> is created by the __field(type, item) macro or
* the __array(type2, item2, len) macro.
* We simply do "type item;", and that will create the fields
* in the structure.
*/
#undef TRACE_FORMAT
#define TRACE_FORMAT(call, proto, args, fmt)
#undef __array
#define __array(type, item, len) type item[len];
#undef __field
#define __field(type, item) type item;
#undef TP_STRUCT__entry
#define TP_STRUCT__entry(args...) args
#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
struct ftrace_raw_##name { \
struct trace_entry ent; \
tstruct \
}; \
static struct ftrace_event_call event_##name
#include <trace/trace_event_types.h>

View File

@@ -0,0 +1,176 @@
/*
* Stage 2 of the trace events.
*
* Override the macros in <trace/trace_event_types.h> to include the following:
*
* enum print_line_t
* ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
* {
* struct trace_seq *s = &iter->seq;
* struct ftrace_raw_<call> *field; <-- defined in stage 1
* struct trace_entry *entry;
* int ret;
*
* entry = iter->ent;
*
* if (entry->type != event_<call>.id) {
* WARN_ON_ONCE(1);
* return TRACE_TYPE_UNHANDLED;
* }
*
* field = (typeof(field))entry;
*
* ret = trace_seq_printf(s, <TP_printk> "\n");
* if (!ret)
* return TRACE_TYPE_PARTIAL_LINE;
*
* return TRACE_TYPE_HANDLED;
* }
*
* This is the method used to print the raw event to the trace
* output format. Note, this is not needed if the data is read
* in binary.
*/
#undef __entry
#define __entry field
#undef TP_printk
#define TP_printk(fmt, args...) fmt "\n", args
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
enum print_line_t \
ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
{ \
struct trace_seq *s = &iter->seq; \
struct ftrace_raw_##call *field; \
struct trace_entry *entry; \
int ret; \
\
entry = iter->ent; \
\
if (entry->type != event_##call.id) { \
WARN_ON_ONCE(1); \
return TRACE_TYPE_UNHANDLED; \
} \
\
field = (typeof(field))entry; \
\
ret = trace_seq_printf(s, #call ": " print); \
if (!ret) \
return TRACE_TYPE_PARTIAL_LINE; \
\
return TRACE_TYPE_HANDLED; \
}
#include <trace/trace_event_types.h>
/*
* Setup the showing format of trace point.
*
* int
* ftrace_format_##call(struct trace_seq *s)
* {
* struct ftrace_raw_##call field;
* int ret;
*
* ret = trace_seq_printf(s, #type " " #item ";"
* " offset:%u; size:%u;\n",
* offsetof(struct ftrace_raw_##call, item),
* sizeof(field.type));
*
* }
*/
#undef TP_STRUCT__entry
#define TP_STRUCT__entry(args...) args
#undef __field
#define __field(type, item) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef __array
#define __array(type, item, len) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef __entry
#define __entry "REC"
#undef TP_printk
#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
#undef TP_fast_assign
#define TP_fast_assign(args...) args
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
static int \
ftrace_format_##call(struct trace_seq *s) \
{ \
struct ftrace_raw_##call field; \
int ret; \
\
tstruct; \
\
trace_seq_printf(s, "\nprint fmt: " print); \
\
return ret; \
}
#include <trace/trace_event_types.h>
#undef __field
#define __field(type, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
sizeof(field.item)); \
if (ret) \
return ret;
#undef __array
#define __array(type, item, len) \
ret = trace_define_field(event_call, #type "[" #len "]", #item, \
offsetof(typeof(field), item), \
sizeof(field.item)); \
if (ret) \
return ret;
#define __common_field(type, item) \
ret = trace_define_field(event_call, #type, "common_" #item, \
offsetof(typeof(field.ent), item), \
sizeof(field.ent.item)); \
if (ret) \
return ret;
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
int \
ftrace_define_fields_##call(void) \
{ \
struct ftrace_raw_##call field; \
struct ftrace_event_call *event_call = &event_##call; \
int ret; \
\
__common_field(unsigned char, type); \
__common_field(unsigned char, flags); \
__common_field(unsigned char, preempt_count); \
__common_field(int, pid); \
__common_field(int, tgid); \
\
tstruct; \
\
return ret; \
}
#include <trace/trace_event_types.h>

View File

@@ -0,0 +1,281 @@
/*
* Stage 3 of the trace events.
*
* Override the macros in <trace/trace_event_types.h> to include the following:
*
* static void ftrace_event_<call>(proto)
* {
* event_trace_printk(_RET_IP_, "<call>: " <fmt>);
* }
*
* static int ftrace_reg_event_<call>(void)
* {
* int ret;
*
* ret = register_trace_<call>(ftrace_event_<call>);
* if (!ret)
* pr_info("event trace: Could not activate trace point "
* "probe to <call>");
* return ret;
* }
*
* static void ftrace_unreg_event_<call>(void)
* {
* unregister_trace_<call>(ftrace_event_<call>);
* }
*
* For those macros defined with TRACE_FORMAT:
*
* static struct ftrace_event_call __used
* __attribute__((__aligned__(4)))
* __attribute__((section("_ftrace_events"))) event_<call> = {
* .name = "<call>",
* .regfunc = ftrace_reg_event_<call>,
* .unregfunc = ftrace_unreg_event_<call>,
* }
*
*
* For those macros defined with TRACE_EVENT:
*
* static struct ftrace_event_call event_<call>;
*
* static void ftrace_raw_event_<call>(proto)
* {
* struct ring_buffer_event *event;
* struct ftrace_raw_<call> *entry; <-- defined in stage 1
* unsigned long irq_flags;
* int pc;
*
* local_save_flags(irq_flags);
* pc = preempt_count();
*
* event = trace_current_buffer_lock_reserve(event_<call>.id,
* sizeof(struct ftrace_raw_<call>),
* irq_flags, pc);
* if (!event)
* return;
* entry = ring_buffer_event_data(event);
*
* <assign>; <-- Here we assign the entries by the __field and
* __array macros.
*
* trace_current_buffer_unlock_commit(event, irq_flags, pc);
* }
*
* static int ftrace_raw_reg_event_<call>(void)
* {
* int ret;
*
* ret = register_trace_<call>(ftrace_raw_event_<call>);
* if (!ret)
* pr_info("event trace: Could not activate trace point "
* "probe to <call>");
* return ret;
* }
*
* static void ftrace_unreg_event_<call>(void)
* {
* unregister_trace_<call>(ftrace_raw_event_<call>);
* }
*
* static struct trace_event ftrace_event_type_<call> = {
* .trace = ftrace_raw_output_<call>, <-- stage 2
* };
*
* static int ftrace_raw_init_event_<call>(void)
* {
* int id;
*
* id = register_ftrace_event(&ftrace_event_type_<call>);
* if (!id)
* return -ENODEV;
* event_<call>.id = id;
* return 0;
* }
*
* static struct ftrace_event_call __used
* __attribute__((__aligned__(4)))
* __attribute__((section("_ftrace_events"))) event_<call> = {
* .name = "<call>",
* .system = "<system>",
* .raw_init = ftrace_raw_init_event_<call>,
* .regfunc = ftrace_reg_event_<call>,
* .unregfunc = ftrace_unreg_event_<call>,
* .show_format = ftrace_format_<call>,
* }
*
*/
#undef TP_FMT
#define TP_FMT(fmt, args...) fmt "\n", ##args
#ifdef CONFIG_EVENT_PROFILE
#define _TRACE_PROFILE(call, proto, args) \
static void ftrace_profile_##call(proto) \
{ \
extern void perf_tpcounter_event(int); \
perf_tpcounter_event(event_##call.id); \
} \
\
static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
{ \
int ret = 0; \
\
if (!atomic_inc_return(&call->profile_count)) \
ret = register_trace_##call(ftrace_profile_##call); \
\
return ret; \
} \
\
static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
{ \
if (atomic_add_negative(-1, &call->profile_count)) \
unregister_trace_##call(ftrace_profile_##call); \
}
#define _TRACE_PROFILE_INIT(call) \
.profile_count = ATOMIC_INIT(-1), \
.profile_enable = ftrace_profile_enable_##call, \
.profile_disable = ftrace_profile_disable_##call,
#else
#define _TRACE_PROFILE(call, proto, args)
#define _TRACE_PROFILE_INIT(call)
#endif
#define _TRACE_FORMAT(call, proto, args, fmt) \
static void ftrace_event_##call(proto) \
{ \
event_trace_printk(_RET_IP_, #call ": " fmt); \
} \
\
static int ftrace_reg_event_##call(void) \
{ \
int ret; \
\
ret = register_trace_##call(ftrace_event_##call); \
if (ret) \
pr_info("event trace: Could not activate trace point " \
"probe to " #call "\n"); \
return ret; \
} \
\
static void ftrace_unreg_event_##call(void) \
{ \
unregister_trace_##call(ftrace_event_##call); \
} \
\
static struct ftrace_event_call event_##call; \
\
static int ftrace_init_event_##call(void) \
{ \
int id; \
\
id = register_ftrace_event(NULL); \
if (!id) \
return -ENODEV; \
event_##call.id = id; \
return 0; \
}
#undef TRACE_FORMAT
#define TRACE_FORMAT(call, proto, args, fmt) \
_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.system = __stringify(TRACE_SYSTEM), \
.raw_init = ftrace_init_event_##call, \
.regfunc = ftrace_reg_event_##call, \
.unregfunc = ftrace_unreg_event_##call, \
_TRACE_PROFILE_INIT(call) \
}
#undef __entry
#define __entry entry
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
\
static struct ftrace_event_call event_##call; \
\
static void ftrace_raw_event_##call(proto) \
{ \
struct ftrace_event_call *call = &event_##call; \
struct ring_buffer_event *event; \
struct ftrace_raw_##call *entry; \
unsigned long irq_flags; \
int pc; \
\
local_save_flags(irq_flags); \
pc = preempt_count(); \
\
event = trace_current_buffer_lock_reserve(event_##call.id, \
sizeof(struct ftrace_raw_##call), \
irq_flags, pc); \
if (!event) \
return; \
entry = ring_buffer_event_data(event); \
\
assign; \
\
if (call->preds && !filter_match_preds(call, entry)) \
ring_buffer_event_discard(event); \
\
trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
\
} \
\
static int ftrace_raw_reg_event_##call(void) \
{ \
int ret; \
\
ret = register_trace_##call(ftrace_raw_event_##call); \
if (ret) \
pr_info("event trace: Could not activate trace point " \
"probe to " #call "\n"); \
return ret; \
} \
\
static void ftrace_raw_unreg_event_##call(void) \
{ \
unregister_trace_##call(ftrace_raw_event_##call); \
} \
\
static struct trace_event ftrace_event_type_##call = { \
.trace = ftrace_raw_output_##call, \
}; \
\
static int ftrace_raw_init_event_##call(void) \
{ \
int id; \
\
id = register_ftrace_event(&ftrace_event_type_##call); \
if (!id) \
return -ENODEV; \
event_##call.id = id; \
INIT_LIST_HEAD(&event_##call.fields); \
return 0; \
} \
\
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.system = __stringify(TRACE_SYSTEM), \
.raw_init = ftrace_raw_init_event_##call, \
.regfunc = ftrace_raw_reg_event_##call, \
.unregfunc = ftrace_raw_unreg_event_##call, \
.show_format = ftrace_format_##call, \
.define_fields = ftrace_define_fields_##call, \
_TRACE_PROFILE_INIT(call) \
}
#include <trace/trace_event_types.h>
#undef _TRACE_PROFILE
#undef _TRACE_PROFILE_INIT

102
kernel/trace/trace_export.c Normal file
View File

@@ -0,0 +1,102 @@
/*
* trace_export.c - export basic ftrace utilities to user space
*
* Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
*/
#include <linux/stringify.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include "trace_output.h"
#undef TRACE_STRUCT
#define TRACE_STRUCT(args...) args
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef TRACE_FIELD_SPECIAL
#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef TRACE_FIELD_ZERO_CHAR
#define TRACE_FIELD_ZERO_CHAR(item) \
ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
"offset:%u;\tsize:0;\n", \
(unsigned int)offsetof(typeof(field), item)); \
if (!ret) \
return 0;
#undef TP_RAW_FMT
#define TP_RAW_FMT(args...) args
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
static int \
ftrace_format_##call(struct trace_seq *s) \
{ \
struct args field; \
int ret; \
\
tstruct; \
\
trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
\
return ret; \
}
#include "trace_event_types.h"
#undef TRACE_ZERO_CHAR
#define TRACE_ZERO_CHAR(arg)
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign)\
entry->item = assign;
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign)\
entry->item = assign;
#undef TP_CMD
#define TP_CMD(cmd...) cmd
#undef TRACE_ENTRY
#define TRACE_ENTRY entry
#undef TRACE_FIELD_SPECIAL
#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
cmd;
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
\
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.id = proto, \
.system = __stringify(TRACE_SYSTEM), \
.show_format = ftrace_format_##call, \
}
#include "trace_event_types.h"

View File

@@ -9,6 +9,7 @@
* Copyright (C) 2004-2006 Ingo Molnar
* Copyright (C) 2004 William Lee Irwin III
*/
#include <linux/ring_buffer.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
@@ -16,31 +17,29 @@
#include "trace.h"
static void start_function_trace(struct trace_array *tr)
/* function tracing enabled */
static int ftrace_function_enabled;
static struct trace_array *func_trace;
static void tracing_start_function_trace(void);
static void tracing_stop_function_trace(void);
static int function_trace_init(struct trace_array *tr)
{
func_trace = tr;
tr->cpu = get_cpu();
tracing_reset_online_cpus(tr);
put_cpu();
tracing_start_cmdline_record();
tracing_start_function_trace();
}
static void stop_function_trace(struct trace_array *tr)
{
tracing_stop_function_trace();
tracing_stop_cmdline_record();
}
static int function_trace_init(struct trace_array *tr)
{
start_function_trace(tr);
return 0;
}
static void function_trace_reset(struct trace_array *tr)
{
stop_function_trace(tr);
tracing_stop_function_trace();
tracing_stop_cmdline_record();
}
static void function_trace_start(struct trace_array *tr)
@@ -48,20 +47,358 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(tr);
}
static void
function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu, resched;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
pc = preempt_count();
resched = ftrace_preempt_disable();
local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
trace_function(tr, ip, parent_ip, flags, pc);
atomic_dec(&data->disabled);
ftrace_preempt_enable(resched);
}
static void
function_trace_call(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
/*
* Need to use raw, since this must be called before the
* recursive protection is performed.
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
/*
* Need to use raw, since this must be called before the
* recursive protection is performed.
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
/*
* skip over 5 funcs:
* __ftrace_trace_stack,
* __trace_stack,
* function_stack_trace_call
* ftrace_list_func
* ftrace_call
*/
__trace_stack(tr, flags, 5, pc);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
static struct ftrace_ops trace_ops __read_mostly =
{
.func = function_trace_call,
};
static struct ftrace_ops trace_stack_ops __read_mostly =
{
.func = function_stack_trace_call,
};
/* Our two options */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
};
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
{ } /* Always set a last empty entry */
};
static struct tracer_flags func_flags = {
.val = 0, /* By default: all flags disabled */
.opts = func_opts
};
static void tracing_start_function_trace(void)
{
ftrace_function_enabled = 0;
if (trace_flags & TRACE_ITER_PREEMPTONLY)
trace_ops.func = function_trace_call_preempt_only;
else
trace_ops.func = function_trace_call;
if (func_flags.val & TRACE_FUNC_OPT_STACK)
register_ftrace_function(&trace_stack_ops);
else
register_ftrace_function(&trace_ops);
ftrace_function_enabled = 1;
}
static void tracing_stop_function_trace(void)
{
ftrace_function_enabled = 0;
/* OK if they are not registered */
unregister_ftrace_function(&trace_stack_ops);
unregister_ftrace_function(&trace_ops);
}
static int func_set_flag(u32 old_flags, u32 bit, int set)
{
if (bit == TRACE_FUNC_OPT_STACK) {
/* do nothing if already set */
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
return 0;
if (set) {
unregister_ftrace_function(&trace_ops);
register_ftrace_function(&trace_stack_ops);
} else {
unregister_ftrace_function(&trace_stack_ops);
register_ftrace_function(&trace_ops);
}
return 0;
}
return -EINVAL;
}
static struct tracer function_trace __read_mostly =
{
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.wait_pipe = poll_wait_pipe,
.flags = &func_flags,
.set_flag = func_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
.selftest = trace_selftest_startup_function,
#endif
};
static __init int init_function_trace(void)
#ifdef CONFIG_DYNAMIC_FTRACE
static void
ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
{
return register_tracer(&function_trace);
long *count = (long *)data;
if (tracing_is_on())
return;
if (!*count)
return;
if (*count != -1)
(*count)--;
tracing_on();
}
static void
ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
{
long *count = (long *)data;
if (!tracing_is_on())
return;
if (!*count)
return;
if (*count != -1)
(*count)--;
tracing_off();
}
static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data);
static struct ftrace_probe_ops traceon_probe_ops = {
.func = ftrace_traceon,
.print = ftrace_trace_onoff_print,
};
static struct ftrace_probe_ops traceoff_probe_ops = {
.func = ftrace_traceoff,
.print = ftrace_trace_onoff_print,
};
static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
char str[KSYM_SYMBOL_LEN];
long count = (long)data;
kallsyms_lookup(ip, NULL, NULL, NULL, str);
seq_printf(m, "%s:", str);
if (ops == &traceon_probe_ops)
seq_printf(m, "traceon");
else
seq_printf(m, "traceoff");
if (count == -1)
seq_printf(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld", count);
seq_putc(m, '\n');
return 0;
}
static int
ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
{
struct ftrace_probe_ops *ops;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = &traceon_probe_ops;
else
ops = &traceoff_probe_ops;
unregister_ftrace_function_probe_func(glob, ops);
return 0;
}
static int
ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
void *count = (void *)-1;
char *number;
int ret;
/* hash funcs only work with set_ftrace_filter */
if (!enable)
return -EINVAL;
if (glob[0] == '!')
return ftrace_trace_onoff_unreg(glob+1, cmd, param);
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = &traceon_probe_ops;
else
ops = &traceoff_probe_ops;
if (!param)
goto out_reg;
number = strsep(&param, ":");
if (!strlen(number))
goto out_reg;
/*
* We use the callback data field (which is a pointer)
* as our counter.
*/
ret = strict_strtoul(number, 0, (unsigned long *)&count);
if (ret)
return ret;
out_reg:
ret = register_ftrace_function_probe(glob, ops, count);
return ret;
}
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
};
static struct ftrace_func_command ftrace_traceoff_cmd = {
.name = "traceoff",
.func = ftrace_trace_onoff_callback,
};
static int __init init_func_cmd_traceon(void)
{
int ret;
ret = register_ftrace_command(&ftrace_traceoff_cmd);
if (ret)
return ret;
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
unregister_ftrace_command(&ftrace_traceoff_cmd);
return ret;
}
#else
static inline int init_func_cmd_traceon(void)
{
return 0;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
static __init int init_function_trace(void)
{
init_func_cmd_traceon();
return register_tracer(&function_trace);
}
device_initcall(init_function_trace);

View File

@@ -1,7 +1,7 @@
/*
*
* Function graph tracer.
* Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
* Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
* Mostly borrowed from function tracer which
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
@@ -12,6 +12,12 @@
#include <linux/fs.h>
#include "trace.h"
#include "trace_output.h"
struct fgraph_data {
pid_t last_pid;
int depth;
};
#define TRACE_GRAPH_INDENT 2
@@ -20,9 +26,11 @@
#define TRACE_GRAPH_PRINT_CPU 0x2
#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
static struct tracer_opt trace_opts[] = {
/* Display overruns ? */
/* Display overruns? (for self-debug purpose) */
{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
/* Display CPU ? */
{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -30,23 +38,28 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
/* Display proc name/pid */
{ TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
/* Display duration of execution */
{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
/* Display absolute time of an entry */
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns and proc by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION,
.opts = trace_opts
};
/* pid on the last trace processed */
static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
/* Add a function return address to the trace stack on thread info.*/
int
ftrace_push_return_trace(unsigned long ret, unsigned long long time,
unsigned long func, int *depth)
ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
{
unsigned long long calltime;
int index;
if (!current->ret_stack)
@@ -58,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time,
return -EBUSY;
}
calltime = trace_clock_local();
index = ++current->curr_ret_stack;
barrier();
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = time;
current->ret_stack[index].calltime = calltime;
*depth = index;
return 0;
@@ -104,7 +119,7 @@ unsigned long ftrace_return_to_handler(void)
unsigned long ret;
ftrace_pop_return_trace(&trace, &ret);
trace.rettime = cpu_clock(raw_smp_processor_id());
trace.rettime = trace_clock_local();
ftrace_graph_return(&trace);
if (unlikely(!ret)) {
@@ -119,12 +134,7 @@ unsigned long ftrace_return_to_handler(void)
static int graph_trace_init(struct trace_array *tr)
{
int cpu, ret;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
ret = register_ftrace_graph(&trace_graph_return,
int ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
if (ret)
return ret;
@@ -187,15 +197,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
static enum print_line_t
print_graph_proc(struct trace_seq *s, pid_t pid)
{
int i;
int ret;
int len;
char comm[8];
int spaces = 0;
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
int spaces = 0;
int ret;
int len;
int i;
strncpy(comm, trace_find_cmdline(pid), 7);
trace_find_cmdline(pid, comm);
comm[7] = '\0';
sprintf(pid_str, "%d", pid);
@@ -228,17 +238,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
/* If the pid changed since the last trace, output this event */
static enum print_line_t
verif_pid(struct trace_seq *s, pid_t pid, int cpu)
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
pid_t *last_pid;
int ret;
if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
if (!data)
return TRACE_TYPE_HANDLED;
prev_pid = last_pid[cpu];
last_pid[cpu] = pid;
last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
if (*last_pid == pid)
return TRACE_TYPE_HANDLED;
prev_pid = *last_pid;
*last_pid = pid;
if (prev_pid == -1)
return TRACE_TYPE_HANDLED;
/*
* Context-switch trace line:
@@ -250,34 +268,34 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
ret = trace_seq_printf(s,
" ------------------------------------------\n");
if (!ret)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_proc(s, prev_pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " => ");
if (!ret)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s,
"\n ------------------------------------------\n\n");
if (!ret)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
return ret;
return TRACE_TYPE_HANDLED;
}
static bool
trace_branch_is_leaf(struct trace_iterator *iter,
static struct ftrace_graph_ret_entry *
get_return_for_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *curr)
{
struct ring_buffer_iter *ring_iter;
@@ -286,65 +304,123 @@ trace_branch_is_leaf(struct trace_iterator *iter,
ring_iter = iter->buffer_iter[iter->cpu];
if (!ring_iter)
return false;
event = ring_buffer_iter_peek(ring_iter, NULL);
/* First peek to compare current entry and the next one */
if (ring_iter)
event = ring_buffer_iter_peek(ring_iter, NULL);
else {
/* We need to consume the current entry to see the next one */
ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
NULL);
}
if (!event)
return false;
return NULL;
next = ring_buffer_event_data(event);
if (next->ent.type != TRACE_GRAPH_RET)
return false;
return NULL;
if (curr->ent.pid != next->ent.pid ||
curr->graph_ent.func != next->ret.func)
return false;
return NULL;
return true;
/* this is a leaf, now advance the iterator */
if (ring_iter)
ring_buffer_read(ring_iter, NULL);
return next;
}
/* Signal a overhead of time execution to the output */
static int
print_graph_overhead(unsigned long long duration, struct trace_seq *s)
{
/* If duration disappear, we don't need anything */
if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
return 1;
/* Non nested entry or return */
if (duration == -1)
return trace_seq_printf(s, " ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
/* Duration exceeded 100 msecs */
if (duration > 100000ULL)
return trace_seq_printf(s, "! ");
/* Duration exceeded 10 msecs */
if (duration > 10000ULL)
return trace_seq_printf(s, "+ ");
}
return trace_seq_printf(s, " ");
}
static int print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
usecs_rem = do_div(t, NSEC_PER_SEC);
usecs_rem /= 1000;
return trace_seq_printf(s, "%5lu.%06lu | ",
(unsigned long)t, usecs_rem);
}
static enum print_line_t
print_graph_irq(struct trace_seq *s, unsigned long addr,
enum trace_type type, int cpu, pid_t pid)
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid)
{
int ret;
struct trace_seq *s = &iter->seq;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return TRACE_TYPE_UNHANDLED;
if (type == TRACE_GRAPH_ENT) {
ret = trace_seq_printf(s, "==========> | ");
} else {
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
ret = trace_seq_printf(s, "<========== |\n");
/* Absolute time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No overhead */
ret = print_graph_overhead(-1, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (type == TRACE_GRAPH_ENT)
ret = trace_seq_printf(s, "==========>");
else
ret = trace_seq_printf(s, "<==========");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Don't close the duration column if haven't one */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
trace_seq_printf(s, " |");
ret = trace_seq_printf(s, "\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
@@ -363,7 +439,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
sprintf(msecs_str, "%lu", (unsigned long) duration);
/* Print msecs */
ret = trace_seq_printf(s, msecs_str);
ret = trace_seq_printf(s, "%s", msecs_str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -396,52 +472,47 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
}
/* Signal a overhead of time execution to the output */
static int
print_graph_overhead(unsigned long long duration, struct trace_seq *s)
{
/* Duration exceeded 100 msecs */
if (duration > 100000ULL)
return trace_seq_printf(s, "! ");
/* Duration exceeded 10 msecs */
if (duration > 10000ULL)
return trace_seq_printf(s, "+ ");
return trace_seq_printf(s, " ");
}
/* Case of a leaf function on its call entry */
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
struct ftrace_graph_ent_entry *entry,
struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
{
struct ftrace_graph_ret_entry *ret_entry;
struct fgraph_data *data = iter->private;
struct ftrace_graph_ret *graph_ret;
struct ring_buffer_event *event;
struct ftrace_graph_ent *call;
unsigned long long duration;
int ret;
int i;
event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
ret_entry = ring_buffer_event_data(event);
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
duration = graph_ret->rettime - graph_ret->calltime;
/* Overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (data) {
int cpu = iter->cpu;
int *depth = &(per_cpu_ptr(data, cpu)->depth);
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
* equal to this depth.
*/
*depth = call->depth - 1;
}
/* Duration */
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
/* Overhead */
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@@ -461,33 +532,34 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
static enum print_line_t
print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
struct trace_seq *s, pid_t pid, int cpu)
print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry,
struct trace_seq *s, int cpu)
{
int i;
int ret;
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
int ret;
int i;
/* No overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (data) {
int cpu = iter->cpu;
int *depth = &(per_cpu_ptr(data, cpu)->depth);
*depth = call->depth;
}
/* Interrupt */
ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
if (ret == TRACE_TYPE_UNHANDLED) {
/* No time */
/* No overhead */
ret = print_graph_overhead(-1, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
} else {
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@@ -503,20 +575,40 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
/*
* we already consumed the current entry to check the next one
* and see if this is a leaf.
*/
return TRACE_TYPE_NO_CONSUME;
}
static enum print_line_t
print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter, int cpu)
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
int type, unsigned long addr)
{
int ret;
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
int cpu = iter->cpu;
int ret;
/* Pid */
if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
if (type) {
/* Interrupt */
ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Absolute time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
@@ -535,54 +627,65 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
if (trace_branch_is_leaf(iter, field))
return print_graph_entry_leaf(iter, field, s);
return 0;
}
static enum print_line_t
print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter)
{
int cpu = iter->cpu;
struct ftrace_graph_ent *call = &field->graph_ent;
struct ftrace_graph_ret_entry *leaf_ret;
if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
return TRACE_TYPE_PARTIAL_LINE;
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
return print_graph_entry_leaf(iter, field, leaf_ret, s);
else
return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
return print_graph_entry_nested(iter, field, s, cpu);
}
static enum print_line_t
print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
struct trace_entry *ent, int cpu)
struct trace_entry *ent, struct trace_iterator *iter)
{
int i;
int ret;
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int ret;
int i;
/* Pid */
if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
if (data) {
int cpu = iter->cpu;
int *depth = &(per_cpu_ptr(data, cpu)->depth);
/*
* Comments display at + 1 to depth. This is the
* return from a function, we now want the comments
* to display at the same level of the bracket.
*/
*depth = trace->depth - 1;
}
if (print_graph_prologue(iter, s, 0, 0))
return TRACE_TYPE_PARTIAL_LINE;
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -603,7 +706,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -611,61 +714,73 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
static enum print_line_t
print_graph_comment(struct print_entry *trace, struct trace_seq *s,
struct trace_entry *ent, struct trace_iterator *iter)
print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
struct trace_iterator *iter)
{
int i;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct fgraph_data *data = iter->private;
struct trace_event *event;
int depth = 0;
int ret;
int i;
/* Pid */
if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
if (data)
depth = per_cpu_ptr(data, iter->cpu)->depth;
if (print_graph_prologue(iter, s, 0, 0))
return TRACE_TYPE_PARTIAL_LINE;
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, iter->cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No time */
ret = trace_seq_printf(s, " | ");
ret = print_graph_overhead(-1, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Indentation */
if (trace->depth > 0)
for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
if (depth > 0)
for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* The comment */
ret = trace_seq_printf(s, "/* %s", trace->buf);
ret = trace_seq_printf(s, "/* ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (ent->flags & TRACE_FLAG_CONT)
trace_seq_print_cont(s, iter);
switch (iter->ent->type) {
case TRACE_BPRINT:
ret = trace_print_bprintk_msg_only(iter);
if (ret != TRACE_TYPE_HANDLED)
return ret;
break;
case TRACE_PRINT:
ret = trace_print_printk_msg_only(iter);
if (ret != TRACE_TYPE_HANDLED)
return ret;
break;
default:
event = ftrace_find_event(ent->type);
if (!event)
return TRACE_TYPE_UNHANDLED;
ret = event->trace(iter, sym_flags);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
/* Strip ending newline */
if (s->buffer[s->len - 1] == '\n') {
s->buffer[s->len - 1] = '\0';
s->len--;
}
ret = trace_seq_printf(s, " */\n");
if (!ret)
@@ -678,62 +793,91 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
switch (entry->type) {
case TRACE_GRAPH_ENT: {
struct ftrace_graph_ent_entry *field;
trace_assign_type(field, entry);
return print_graph_entry(field, s, iter,
iter->cpu);
return print_graph_entry(field, s, iter);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
return print_graph_return(&field->ret, s, entry, iter->cpu);
}
case TRACE_PRINT: {
struct print_entry *field;
trace_assign_type(field, entry);
return print_graph_comment(field, s, entry, iter);
return print_graph_return(&field->ret, s, entry, iter);
}
default:
return TRACE_TYPE_UNHANDLED;
return print_graph_comment(s, entry, iter);
}
return TRACE_TYPE_HANDLED;
}
static void print_graph_headers(struct seq_file *s)
{
/* 1st line */
seq_printf(s, "# ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " TIME ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
seq_printf(s, "CPU ");
seq_printf(s, "CPU");
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, "TASK/PID ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
seq_printf(s, "OVERHEAD/");
seq_printf(s, "DURATION FUNCTION CALLS\n");
seq_printf(s, " TASK/PID ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " DURATION ");
seq_printf(s, " FUNCTION CALLS\n");
/* 2nd line */
seq_printf(s, "# ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
seq_printf(s, "| ");
seq_printf(s, "| ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, "| | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
seq_printf(s, "| ");
seq_printf(s, "| | | | |\n");
} else
seq_printf(s, " | | | | |\n");
seq_printf(s, " | | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " | | ");
seq_printf(s, " | | | |\n");
}
static void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
struct fgraph_data *data = alloc_percpu(struct fgraph_data);
int cpu;
if (!data)
pr_warning("function graph tracer: not enough memory\n");
else
for_each_possible_cpu(cpu) {
pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
int *depth = &(per_cpu_ptr(data, cpu)->depth);
*pid = -1;
*depth = 0;
}
iter->private = data;
}
static void graph_trace_close(struct trace_iterator *iter)
{
free_percpu(iter->private);
}
static struct tracer graph_trace __read_mostly = {
.name = "function_graph",
.init = graph_trace_init,
.reset = graph_trace_reset,
.name = "function_graph",
.open = graph_trace_open,
.close = graph_trace_close,
.wait_pipe = poll_wait_pipe,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
};
static __init int init_graph_trace(void)

View File

@@ -1,30 +1,53 @@
/*
* h/w branch tracer for x86 based on bts
*
* Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
*
* Copyright (C) 2008-2009 Intel Corporation.
* Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <asm/ds.h>
#include "trace.h"
#include "trace_output.h"
#define SIZEOF_BTS (1 << 13)
/*
* The tracer lock protects the below per-cpu tracer array.
* It needs to be held to:
* - start tracing on all cpus
* - stop tracing on all cpus
* - start tracing on a single hotplug cpu
* - stop tracing on a single hotplug cpu
* - read the trace from all cpus
* - read the trace from a single cpu
*/
static DEFINE_SPINLOCK(bts_tracer_lock);
static DEFINE_PER_CPU(struct bts_tracer *, tracer);
static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
#define this_tracer per_cpu(tracer, smp_processor_id())
#define this_buffer per_cpu(buffer, smp_processor_id())
static int __read_mostly trace_hw_branches_enabled;
static struct trace_array *hw_branch_trace __read_mostly;
/*
* Start tracing on the current cpu.
* The argument is ignored.
*
* pre: bts_tracer_lock must be locked.
*/
static void bts_trace_start_cpu(void *arg)
{
if (this_tracer)
@@ -42,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
static void bts_trace_start(struct trace_array *tr)
{
int cpu;
spin_lock(&bts_tracer_lock);
tracing_reset_online_cpus(tr);
on_each_cpu(bts_trace_start_cpu, NULL, 1);
trace_hw_branches_enabled = 1;
for_each_cpu(cpu, cpu_possible_mask)
smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
spin_unlock(&bts_tracer_lock);
}
/*
* Stop tracing on the current cpu.
* The argument is ignored.
*
* pre: bts_tracer_lock must be locked.
*/
static void bts_trace_stop_cpu(void *arg)
{
if (this_tracer) {
@@ -60,26 +89,60 @@ static void bts_trace_stop_cpu(void *arg)
static void bts_trace_stop(struct trace_array *tr)
{
int cpu;
spin_lock(&bts_tracer_lock);
for_each_cpu(cpu, cpu_possible_mask)
smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
trace_hw_branches_enabled = 0;
on_each_cpu(bts_trace_stop_cpu, NULL, 1);
spin_unlock(&bts_tracer_lock);
}
static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
spin_lock(&bts_tracer_lock);
if (!trace_hw_branches_enabled)
goto out;
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
break;
}
out:
spin_unlock(&bts_tracer_lock);
return NOTIFY_DONE;
}
static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
.notifier_call = bts_hotcpu_handler
};
static int bts_trace_init(struct trace_array *tr)
{
tracing_reset_online_cpus(tr);
hw_branch_trace = tr;
bts_trace_start(tr);
return 0;
}
static void bts_trace_reset(struct trace_array *tr)
{
bts_trace_stop(tr);
}
static void bts_trace_print_header(struct seq_file *m)
{
seq_puts(m,
"# CPU# FROM TO FUNCTION\n");
seq_puts(m,
"# | | | |\n");
seq_puts(m, "# CPU# TO <- FROM\n");
}
static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -87,15 +150,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
struct trace_entry *entry = iter->ent;
struct trace_seq *seq = &iter->seq;
struct hw_branch_entry *it;
unsigned long symflags = TRACE_ITER_SYM_OFFSET;
trace_assign_type(it, entry);
if (entry->type == TRACE_HW_BRANCHES) {
if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
it->from, it->to) &&
(!it->from ||
seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
seq_print_ip_sym(seq, it->to, symflags) &&
trace_seq_printf(seq, "\t <- ") &&
seq_print_ip_sym(seq, it->from, symflags) &&
trace_seq_printf(seq, "\n"))
return TRACE_TYPE_HANDLED;
return TRACE_TYPE_PARTIAL_LINE;;
@@ -103,26 +166,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
return TRACE_TYPE_UNHANDLED;
}
void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
void trace_hw_branch(u64 from, u64 to)
{
struct trace_array *tr = hw_branch_trace;
struct ring_buffer_event *event;
struct hw_branch_entry *entry;
unsigned long irq;
unsigned long irq1;
int cpu;
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
if (!event)
if (unlikely(!tr))
return;
if (unlikely(!trace_hw_branches_enabled))
return;
local_irq_save(irq1);
cpu = raw_smp_processor_id();
if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
goto out;
event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, from);
entry->ent.type = TRACE_HW_BRANCHES;
entry->ent.cpu = smp_processor_id();
entry->from = from;
entry->to = to;
ring_buffer_unlock_commit(tr->buffer, event, irq);
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
atomic_dec(&tr->data[cpu]->disabled);
local_irq_restore(irq1);
}
static void trace_bts_at(struct trace_array *tr,
const struct bts_trace *trace, void *at)
static void trace_bts_at(const struct bts_trace *trace, void *at)
{
struct bts_struct bts;
int err = 0;
@@ -137,18 +216,29 @@ static void trace_bts_at(struct trace_array *tr,
switch (bts.qualifier) {
case BTS_BRANCH:
trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
break;
}
}
/*
* Collect the trace on the current cpu and write it into the ftrace buffer.
*
* pre: bts_tracer_lock must be locked
*/
static void trace_bts_cpu(void *arg)
{
struct trace_array *tr = (struct trace_array *) arg;
const struct bts_trace *trace;
unsigned char *at;
if (!this_tracer)
if (unlikely(!tr))
return;
if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
return;
if (unlikely(!this_tracer))
return;
ds_suspend_bts(this_tracer);
@@ -158,11 +248,11 @@ static void trace_bts_cpu(void *arg)
for (at = trace->ds.top; (void *)at < trace->ds.end;
at += trace->ds.size)
trace_bts_at(tr, trace, at);
trace_bts_at(trace, at);
for (at = trace->ds.begin; (void *)at < trace->ds.top;
at += trace->ds.size)
trace_bts_at(tr, trace, at);
trace_bts_at(trace, at);
out:
ds_resume_bts(this_tracer);
@@ -170,26 +260,43 @@ out:
static void trace_bts_prepare(struct trace_iterator *iter)
{
int cpu;
spin_lock(&bts_tracer_lock);
for_each_cpu(cpu, cpu_possible_mask)
smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
on_each_cpu(trace_bts_cpu, iter->tr, 1);
spin_unlock(&bts_tracer_lock);
}
static void trace_bts_close(struct trace_iterator *iter)
{
tracing_reset_online_cpus(iter->tr);
}
void trace_hw_branch_oops(void)
{
spin_lock(&bts_tracer_lock);
trace_bts_cpu(hw_branch_trace);
spin_unlock(&bts_tracer_lock);
}
struct tracer bts_tracer __read_mostly =
{
.name = "hw-branch-tracer",
.init = bts_trace_init,
.reset = bts_trace_stop,
.reset = bts_trace_reset,
.print_header = bts_trace_print_header,
.print_line = bts_trace_print_line,
.start = bts_trace_start,
.stop = bts_trace_stop,
.open = trace_bts_prepare
.open = trace_bts_prepare,
.close = trace_bts_close
};
__init static int init_bts_trace(void)
{
register_hotcpu_notifier(&bts_hotcpu_notifier);
return register_tracer(&bts_tracer);
}
device_initcall(init_bts_trace);

View File

@@ -1,5 +1,5 @@
/*
* trace irqs off criticall timings
* trace irqs off critical timings
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
* Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
@@ -32,6 +32,8 @@ enum {
static int trace_type __read_mostly;
static int save_lat_flag;
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@@ -95,7 +97,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
trace_function(tr, data, ip, parent_ip, flags, preempt_count());
trace_function(tr, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
@@ -153,7 +155,7 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(delta))
goto out_unlock;
trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
latency = nsecs_to_usecs(delta);
@@ -177,7 +179,7 @@ out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
tracing_reset(tr, cpu);
trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
static inline void
@@ -210,7 +212,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
local_save_flags(flags);
trace_function(tr, data, ip, parent_ip, flags, preempt_count());
trace_function(tr, ip, parent_ip, flags, preempt_count());
per_cpu(tracing_cpu, cpu) = 1;
@@ -244,7 +246,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
atomic_inc(&data->disabled);
local_save_flags(flags);
trace_function(tr, data, ip, parent_ip, flags, preempt_count());
trace_function(tr, ip, parent_ip, flags, preempt_count());
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -353,33 +355,26 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
/*
* save_tracer_enabled is used to save the state of the tracer_enabled
* variable when we disable it when we open a trace output file.
*/
static int save_tracer_enabled;
static void start_irqsoff_tracer(struct trace_array *tr)
{
register_ftrace_function(&trace_ops);
if (tracing_is_enabled()) {
if (tracing_is_enabled())
tracer_enabled = 1;
save_tracer_enabled = 1;
} else {
else
tracer_enabled = 0;
save_tracer_enabled = 0;
}
}
static void stop_irqsoff_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
}
static void __irqsoff_tracer_init(struct trace_array *tr)
{
save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
trace_flags |= TRACE_ITER_LATENCY_FMT;
tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
@@ -390,30 +385,19 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
static void irqsoff_tracer_reset(struct trace_array *tr)
{
stop_irqsoff_tracer(tr);
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
}
static void irqsoff_tracer_start(struct trace_array *tr)
{
tracer_enabled = 1;
save_tracer_enabled = 1;
}
static void irqsoff_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
save_tracer_enabled = 0;
}
static void irqsoff_tracer_open(struct trace_iterator *iter)
{
/* stop the trace while dumping */
tracer_enabled = 0;
}
static void irqsoff_tracer_close(struct trace_iterator *iter)
{
/* restart tracing */
tracer_enabled = save_tracer_enabled;
}
#ifdef CONFIG_IRQSOFF_TRACER
@@ -431,8 +415,6 @@ static struct tracer irqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
@@ -459,8 +441,6 @@ static struct tracer preemptoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
@@ -489,8 +469,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,

View File

@@ -12,6 +12,7 @@
#include <asm/atomic.h>
#include "trace.h"
#include "trace_output.h"
struct header_iter {
struct pci_dev *dev;
@@ -183,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
switch (rw->opcode) {
case MMIO_READ:
ret = trace_seq_printf(s,
"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
ret = trace_seq_printf(s,
"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
ret = trace_seq_printf(s,
"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
"%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -229,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
switch (m->opcode) {
case MMIO_PROBE:
ret = trace_seq_printf(s,
"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
ret = trace_seq_printf(s,
"UNMAP %lu.%06lu %d 0x%lx %d\n",
"UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
@@ -255,18 +257,15 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
const char *msg = print->buf;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret;
/* The trailing newline must be in the message. */
ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (entry->flags & TRACE_FLAG_CONT)
trace_seq_print_cont(s, iter);
return TRACE_TYPE_HANDLED;
}
@@ -308,21 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
{
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
unsigned long irq_flags;
int pc = preempt_count();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
sizeof(*entry), 0, pc);
if (!event) {
atomic_inc(&dropped_count);
return;
}
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, preempt_count());
entry->ent.type = TRACE_MMIO_RW;
entry->rw = *rw;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, pc);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -338,21 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
{
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
unsigned long irq_flags;
int pc = preempt_count();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
sizeof(*entry), 0, pc);
if (!event) {
atomic_inc(&dropped_count);
return;
}
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, preempt_count());
entry->ent.type = TRACE_MMIO_MAP;
entry->map = *map;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, pc);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
@@ -368,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
int mmio_trace_printk(const char *fmt, va_list args)
{
return trace_vprintk(0, -1, fmt, args);
return trace_vprintk(0, fmt, args);
}

View File

@@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_array *tr)
static int nop_trace_init(struct trace_array *tr)
{
int cpu;
ctx_trace = tr;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
start_nop_trace(tr);
return 0;
}
@@ -96,6 +91,7 @@ struct tracer nop_trace __read_mostly =
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
.wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif

1017
kernel/trace/trace_output.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,71 @@
#ifndef __TRACE_EVENTS_H
#define __TRACE_EVENTS_H
#include "trace.h"
typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
int flags);
struct trace_event {
struct hlist_node node;
int type;
trace_print_func trace;
trace_print_func raw;
trace_print_func hex;
trace_print_func binary;
};
extern enum print_line_t
trace_print_bprintk_msg_only(struct trace_iterator *iter);
extern enum print_line_t
trace_print_printk_msg_only(struct trace_iterator *iter);
extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
extern int
trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
size_t cnt);
extern int trace_seq_puts(struct trace_seq *s, const char *str);
extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
size_t len);
extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
extern int trace_seq_path(struct trace_seq *s, struct path *path);
extern int seq_print_userip_objs(const struct userstack_entry *entry,
struct trace_seq *s, unsigned long sym_flags);
extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
unsigned long ip, unsigned long sym_flags);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
extern struct trace_event *ftrace_find_event(int type);
extern int register_ftrace_event(struct trace_event *event);
extern int unregister_ftrace_event(struct trace_event *event);
extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
int flags);
#define MAX_MEMHEX_BYTES 8
#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
#define SEQ_PUT_FIELD_RET(s, x) \
do { \
if (!trace_seq_putmem(s, &(x), sizeof(x))) \
return TRACE_TYPE_PARTIAL_LINE; \
} while (0)
#define SEQ_PUT_HEX_FIELD_RET(s, x) \
do { \
BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
return TRACE_TYPE_PARTIAL_LINE; \
} while (0)
#endif

View File

@@ -11,15 +11,113 @@
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <trace/power.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include "trace.h"
#include "trace_output.h"
static struct trace_array *power_trace;
static int __read_mostly trace_power_enabled;
static void probe_power_start(struct power_trace *it, unsigned int type,
unsigned int level)
{
if (!trace_power_enabled)
return;
memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
}
static void probe_power_end(struct power_trace *it)
{
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
struct trace_array *tr = power_trace;
if (!trace_power_enabled)
return;
preempt_disable();
it->end = ktime_get();
data = tr->data[smp_processor_id()];
event = trace_buffer_lock_reserve(tr, TRACE_POWER,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
entry->state_data = *it;
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}
static void probe_power_mark(struct power_trace *it, unsigned int type,
unsigned int level)
{
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
struct trace_array *tr = power_trace;
if (!trace_power_enabled)
return;
memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
preempt_disable();
it->end = it->stamp;
data = tr->data[smp_processor_id()];
event = trace_buffer_lock_reserve(tr, TRACE_POWER,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
entry->state_data = *it;
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}
static int tracing_power_register(void)
{
int ret;
ret = register_trace_power_start(probe_power_start);
if (ret) {
pr_info("power trace: Couldn't activate tracepoint"
" probe to trace_power_start\n");
return ret;
}
ret = register_trace_power_end(probe_power_end);
if (ret) {
pr_info("power trace: Couldn't activate tracepoint"
" probe to trace_power_end\n");
goto fail_start;
}
ret = register_trace_power_mark(probe_power_mark);
if (ret) {
pr_info("power trace: Couldn't activate tracepoint"
" probe to trace_power_mark\n");
goto fail_end;
}
return ret;
fail_end:
unregister_trace_power_end(probe_power_end);
fail_start:
unregister_trace_power_start(probe_power_start);
return ret;
}
static void start_power_trace(struct trace_array *tr)
{
@@ -31,6 +129,14 @@ static void stop_power_trace(struct trace_array *tr)
trace_power_enabled = 0;
}
static void power_trace_reset(struct trace_array *tr)
{
trace_power_enabled = 0;
unregister_trace_power_start(probe_power_start);
unregister_trace_power_end(probe_power_end);
unregister_trace_power_mark(probe_power_mark);
}
static int power_trace_init(struct trace_array *tr)
{
@@ -38,6 +144,7 @@ static int power_trace_init(struct trace_array *tr)
power_trace = tr;
trace_power_enabled = 1;
tracing_power_register();
for_each_cpu(cpu, cpu_possible_mask)
tracing_reset(tr, cpu);
@@ -85,7 +192,7 @@ static struct tracer power_tracer __read_mostly =
.init = power_trace_init,
.start = start_power_trace,
.stop = stop_power_trace,
.reset = stop_power_trace,
.reset = power_trace_reset,
.print_line = power_print_line,
};
@@ -94,86 +201,3 @@ static int init_power_trace(void)
return register_tracer(&power_tracer);
}
device_initcall(init_power_trace);
void trace_power_start(struct power_trace *it, unsigned int type,
unsigned int level)
{
if (!trace_power_enabled)
return;
memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
}
EXPORT_SYMBOL_GPL(trace_power_start);
void trace_power_end(struct power_trace *it)
{
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
unsigned long irq_flags;
struct trace_array *tr = power_trace;
if (!trace_power_enabled)
return;
preempt_disable();
it->end = ktime_get();
data = tr->data[smp_processor_id()];
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_POWER;
entry->state_data = *it;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
out:
preempt_enable();
}
EXPORT_SYMBOL_GPL(trace_power_end);
void trace_power_mark(struct power_trace *it, unsigned int type,
unsigned int level)
{
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
unsigned long irq_flags;
struct trace_array *tr = power_trace;
if (!trace_power_enabled)
return;
memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
preempt_disable();
it->end = it->stamp;
data = tr->data[smp_processor_id()];
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_POWER;
entry->state_data = *it;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
out:
preempt_enable();
}
EXPORT_SYMBOL_GPL(trace_power_mark);

270
kernel/trace/trace_printk.c Normal file
View File

@@ -0,0 +1,270 @@
/*
* trace binary printk
*
* Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
*
*/
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include "trace.h"
#ifdef CONFIG_MODULES
/*
* modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
* which are queued on trace_bprintk_fmt_list.
*/
static LIST_HEAD(trace_bprintk_fmt_list);
/* serialize accesses to trace_bprintk_fmt_list */
static DEFINE_MUTEX(btrace_mutex);
struct trace_bprintk_fmt {
struct list_head list;
char fmt[0];
};
static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
{
struct trace_bprintk_fmt *pos;
list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
if (!strcmp(pos->fmt, fmt))
return pos;
}
return NULL;
}
static
void hold_module_trace_bprintk_format(const char **start, const char **end)
{
const char **iter;
mutex_lock(&btrace_mutex);
for (iter = start; iter < end; iter++) {
struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
if (tb_fmt) {
*iter = tb_fmt->fmt;
continue;
}
tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
+ strlen(*iter) + 1, GFP_KERNEL);
if (tb_fmt) {
list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
strcpy(tb_fmt->fmt, *iter);
*iter = tb_fmt->fmt;
} else
*iter = NULL;
}
mutex_unlock(&btrace_mutex);
}
static int module_trace_bprintk_format_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct module *mod = data;
if (mod->num_trace_bprintk_fmt) {
const char **start = mod->trace_bprintk_fmt_start;
const char **end = start + mod->num_trace_bprintk_fmt;
if (val == MODULE_STATE_COMING)
hold_module_trace_bprintk_format(start, end);
}
return 0;
}
#else /* !CONFIG_MODULES */
__init static int
module_trace_bprintk_format_notify(struct notifier_block *self,
unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
__initdata_or_module static
struct notifier_block module_trace_bprintk_format_nb = {
.notifier_call = module_trace_bprintk_format_notify,
};
int __trace_bprintk(unsigned long ip, const char *fmt, ...)
{
int ret;
va_list ap;
if (unlikely(!fmt))
return 0;
if (!(trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
ret = trace_vbprintk(ip, fmt, ap);
va_end(ap);
return ret;
}
EXPORT_SYMBOL_GPL(__trace_bprintk);
int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
{
if (unlikely(!fmt))
return 0;
if (!(trace_flags & TRACE_ITER_PRINTK))
return 0;
return trace_vbprintk(ip, fmt, ap);
}
EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
int __trace_printk(unsigned long ip, const char *fmt, ...)
{
int ret;
va_list ap;
if (!(trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
ret = trace_vprintk(ip, fmt, ap);
va_end(ap);
return ret;
}
EXPORT_SYMBOL_GPL(__trace_printk);
int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
{
if (!(trace_flags & TRACE_ITER_PRINTK))
return 0;
return trace_vprintk(ip, fmt, ap);
}
EXPORT_SYMBOL_GPL(__ftrace_vprintk);
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
const char **fmt = m->private;
const char **next = fmt;
(*pos)++;
if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
return NULL;
next = fmt;
m->private = ++next;
return fmt;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
return t_next(m, NULL, pos);
}
static int t_show(struct seq_file *m, void *v)
{
const char **fmt = v;
const char *str = *fmt;
int i;
seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
/*
* Tabs and new lines need to be converted.
*/
for (i = 0; str[i]; i++) {
switch (str[i]) {
case '\n':
seq_puts(m, "\\n");
break;
case '\t':
seq_puts(m, "\\t");
break;
case '\\':
seq_puts(m, "\\");
break;
case '"':
seq_puts(m, "\\\"");
break;
default:
seq_putc(m, str[i]);
}
}
seq_puts(m, "\"\n");
return 0;
}
static void t_stop(struct seq_file *m, void *p)
{
}
static const struct seq_operations show_format_seq_ops = {
.start = t_start,
.next = t_next,
.show = t_show,
.stop = t_stop,
};
static int
ftrace_formats_open(struct inode *inode, struct file *file)
{
int ret;
ret = seq_open(file, &show_format_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = __start___trace_bprintk_fmt;
}
return ret;
}
static const struct file_operations ftrace_formats_fops = {
.open = ftrace_formats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static __init int init_trace_printk_function_export(void)
{
struct dentry *d_tracer;
struct dentry *entry;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return 0;
entry = debugfs_create_file("printk_formats", 0444, d_tracer,
NULL, &ftrace_formats_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'printk_formats' entry\n");
return 0;
}
fs_initcall(init_trace_printk_function_export);
static __init int init_trace_printk(void)
{
return register_module_notifier(&module_trace_bprintk_format_nb);
}
early_initcall(init_trace_printk);

View File

@@ -18,6 +18,7 @@ static struct trace_array *ctx_trace;
static int __read_mostly tracer_enabled;
static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
static int sched_stopped;
static void
probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
int cpu;
int pc;
if (!sched_ref)
if (!sched_ref || sched_stopped)
return;
tracing_record_cmdline(prev);
@@ -43,7 +44,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
data = ctx_trace->data[cpu];
if (likely(!atomic_read(&data->disabled)))
tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
local_irq_restore(flags);
}
@@ -61,12 +62,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
pc = preempt_count();
tracing_record_cmdline(current);
if (sched_stopped)
return;
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = ctx_trace->data[cpu];
if (likely(!atomic_read(&data->disabled)))
tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
tracing_sched_wakeup_trace(ctx_trace, wakee, current,
flags, pc);
local_irq_restore(flags);
@@ -93,7 +97,7 @@ static int tracing_sched_register(void)
ret = register_trace_sched_switch(probe_sched_switch);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_schedule\n");
" probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
@@ -185,12 +189,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
ctx_trace = tr;
}
static void start_sched_trace(struct trace_array *tr)
{
tracing_reset_online_cpus(tr);
tracing_start_sched_switch_record();
}
static void stop_sched_trace(struct trace_array *tr)
{
tracing_stop_sched_switch_record();
@@ -199,7 +197,8 @@ static void stop_sched_trace(struct trace_array *tr)
static int sched_switch_trace_init(struct trace_array *tr)
{
ctx_trace = tr;
start_sched_trace(tr);
tracing_reset_online_cpus(tr);
tracing_start_sched_switch_record();
return 0;
}
@@ -211,13 +210,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
static void sched_switch_trace_start(struct trace_array *tr)
{
tracing_reset_online_cpus(tr);
tracing_start_sched_switch();
sched_stopped = 0;
}
static void sched_switch_trace_stop(struct trace_array *tr)
{
tracing_stop_sched_switch();
sched_stopped = 1;
}
static struct tracer sched_switch_trace __read_mostly =
@@ -227,6 +225,7 @@ static struct tracer sched_switch_trace __read_mostly =
.reset = sched_switch_trace_reset,
.start = sched_switch_trace_start,
.stop = sched_switch_trace_stop,
.wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_sched_switch,
#endif

View File

@@ -25,12 +25,15 @@ static int __read_mostly tracer_enabled;
static struct task_struct *wakeup_task;
static int wakeup_cpu;
static unsigned wakeup_prio = -1;
static int wakeup_rt;
static raw_spinlock_t wakeup_lock =
(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
static void __wakeup_reset(struct trace_array *tr);
static int save_lat_flag;
#ifdef CONFIG_FUNCTION_TRACER
/*
* irqsoff uses its own tracer function to keep the overhead down:
@@ -71,7 +74,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
if (task_cpu(wakeup_task) != cpu)
goto unlock;
trace_function(tr, data, ip, parent_ip, flags, pc);
trace_function(tr, ip, parent_ip, flags, pc);
unlock:
__raw_spin_unlock(&wakeup_lock);
@@ -151,7 +154,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
if (unlikely(!tracer_enabled || next != wakeup_task))
goto out_unlock;
trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
/*
* usecs conversion is slow so we try to delay the conversion
@@ -182,13 +186,10 @@ out:
static void __wakeup_reset(struct trace_array *tr)
{
struct trace_array_cpu *data;
int cpu;
for_each_possible_cpu(cpu) {
data = tr->data[cpu];
for_each_possible_cpu(cpu)
tracing_reset(tr, cpu);
}
wakeup_cpu = -1;
wakeup_prio = -1;
@@ -213,6 +214,7 @@ static void wakeup_reset(struct trace_array *tr)
static void
probe_wakeup(struct rq *rq, struct task_struct *p, int success)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
unsigned long flags;
long disabled;
@@ -224,7 +226,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
tracing_record_cmdline(p);
tracing_record_cmdline(current);
if (likely(!rt_task(p)) ||
if ((wakeup_rt && !rt_task(p)) ||
p->prio >= wakeup_prio ||
p->prio >= current->prio)
return;
@@ -252,9 +254,16 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
local_save_flags(flags);
wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
CALLER_ADDR1, CALLER_ADDR2, flags, pc);
data = wakeup_trace->data[wakeup_cpu];
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
/*
* We must be careful in using CALLER_ADDR2. But since wake_up
* is not called by an assembly function (where as schedule is)
* it should be safe to use it here.
*/
trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
out_locked:
__raw_spin_unlock(&wakeup_lock);
@@ -262,12 +271,6 @@ out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
/*
* save_tracer_enabled is used to save the state of the tracer_enabled
* variable when we disable it when we open a trace output file.
*/
static int save_tracer_enabled;
static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
@@ -289,7 +292,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
ret = register_trace_sched_switch(probe_wakeup_sched_switch);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_schedule\n");
" probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
@@ -306,13 +309,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
register_ftrace_function(&trace_ops);
if (tracing_is_enabled()) {
if (tracing_is_enabled())
tracer_enabled = 1;
save_tracer_enabled = 1;
} else {
else
tracer_enabled = 0;
save_tracer_enabled = 0;
}
return;
fail_deprobe_wake_new:
@@ -324,54 +324,54 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
unregister_trace_sched_switch(probe_wakeup_sched_switch);
unregister_trace_sched_wakeup_new(probe_wakeup);
unregister_trace_sched_wakeup(probe_wakeup);
}
static int wakeup_tracer_init(struct trace_array *tr)
static int __wakeup_tracer_init(struct trace_array *tr)
{
save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
trace_flags |= TRACE_ITER_LATENCY_FMT;
tracing_max_latency = 0;
wakeup_trace = tr;
start_wakeup_tracer(tr);
return 0;
}
static int wakeup_tracer_init(struct trace_array *tr)
{
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
}
static int wakeup_rt_tracer_init(struct trace_array *tr)
{
wakeup_rt = 1;
return __wakeup_tracer_init(tr);
}
static void wakeup_tracer_reset(struct trace_array *tr)
{
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
}
static void wakeup_tracer_start(struct trace_array *tr)
{
wakeup_reset(tr);
tracer_enabled = 1;
save_tracer_enabled = 1;
}
static void wakeup_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
save_tracer_enabled = 0;
}
static void wakeup_tracer_open(struct trace_iterator *iter)
{
/* stop the trace while dumping */
tracer_enabled = 0;
}
static void wakeup_tracer_close(struct trace_iterator *iter)
{
/* forget about any processes we were recording */
if (save_tracer_enabled) {
wakeup_reset(iter->tr);
tracer_enabled = 1;
}
}
static struct tracer wakeup_tracer __read_mostly =
@@ -381,8 +381,20 @@ static struct tracer wakeup_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.open = wakeup_tracer_open,
.close = wakeup_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
};
static struct tracer wakeup_rt_tracer __read_mostly =
{
.name = "wakeup_rt",
.init = wakeup_rt_tracer_init,
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -397,6 +409,10 @@ __init static int init_wakeup_tracer(void)
if (ret)
return ret;
ret = register_tracer(&wakeup_rt_tracer);
if (ret)
return ret;
return 0;
}
device_initcall(init_wakeup_tracer);

View File

@@ -1,5 +1,6 @@
/* Include in trace.c */
#include <linux/stringify.h>
#include <linux/kthread.h>
#include <linux/delay.h>
@@ -9,11 +10,12 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_FN:
case TRACE_CTX:
case TRACE_WAKE:
case TRACE_CONT:
case TRACE_STACK:
case TRACE_PRINT:
case TRACE_SPECIAL:
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
return 1;
}
return 0;
@@ -99,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
#ifdef CONFIG_DYNAMIC_FTRACE
#define __STR(x) #x
#define STR(x) __STR(x)
/* Test dynamic code modification and ftrace filters */
int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
struct trace_array *tr,
@@ -125,17 +124,17 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
func();
/*
* Some archs *cough*PowerPC*cough* add charachters to the
* Some archs *cough*PowerPC*cough* add characters to the
* start of the function names. We simply put a '*' to
* accomodate them.
* accommodate them.
*/
func_name = "*" STR(DYN_FTRACE_TEST_NAME);
func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
/* filter only on our function */
ftrace_set_filter(func_name, strlen(func_name), 1);
/* enable tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
@@ -209,7 +208,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ftrace_enabled = 1;
tracer_enabled = 1;
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
@@ -247,6 +246,90 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
static void __ftrace_dump(bool disable_tracing);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
if (ftrace_dump_on_oops)
__ftrace_dump(false);
return 0;
}
return trace_graph_entry(trace);
}
/*
* Pretty much the same than for the function tracer from which the selftest
* has been borrowed.
*/
int
trace_selftest_startup_function_graph(struct tracer *trace,
struct trace_array *tr)
{
int ret;
unsigned long count;
/*
* Simulate the init() callback but we attach a watchdog callback
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(tr);
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry_watchdog);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
}
tracing_start_cmdline_record();
/* Sleep for a 1/10 of a second */
msleep(100);
/* Have we just recovered from a hang? */
if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
tracing_selftest_disabled = true;
ret = -1;
goto out;
}
tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
ret = -1;
goto out;
}
/* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
if (ret)
ftrace_graph_stop();
return ret;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#ifdef CONFIG_IRQSOFF_TRACER
int
trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
@@ -256,7 +339,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -268,6 +351,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
local_irq_disable();
udelay(100);
local_irq_enable();
/*
* Stop the tracer to avoid a warning subsequent
* to buffer flipping failure because tracing_stop()
* disables the tr and max buffers, making flipping impossible
* in case of parallels max irqs off latencies.
*/
trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
@@ -310,7 +401,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
}
/* start the tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -322,6 +413,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
preempt_disable();
udelay(100);
preempt_enable();
/*
* Stop the tracer to avoid a warning subsequent
* to buffer flipping failure because tracing_stop()
* disables the tr and max buffers, making flipping impossible
* in case of parallels max preempt off latencies.
*/
trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
@@ -364,10 +463,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* start the tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
goto out_no_start;
}
/* reset the max latency */
@@ -381,31 +480,35 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* reverse the order of preempt vs irqs */
local_irq_enable();
/*
* Stop the tracer to avoid a warning subsequent
* to buffer flipping failure because tracing_stop()
* disables the tr and max buffers, making flipping impossible
* in case of parallels max irqs/preempt off latencies.
*/
trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
if (ret) {
tracing_start();
if (ret)
goto out;
}
ret = trace_test_buffer(&max_tr, &count);
if (ret) {
tracing_start();
if (ret)
goto out;
}
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
ret = -1;
tracing_start();
goto out;
}
/* do the test by disabling interrupts first this time */
tracing_max_latency = 0;
tracing_start();
trace->start(tr);
preempt_disable();
local_irq_disable();
udelay(100);
@@ -413,6 +516,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* reverse the order of preempt vs irqs */
local_irq_enable();
trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
@@ -428,9 +532,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
goto out;
}
out:
trace->reset(tr);
out:
tracing_start();
out_no_start:
trace->reset(tr);
tracing_max_latency = save_max;
return ret;
@@ -496,7 +601,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
wait_for_completion(&isrt);
/* start the tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -557,7 +662,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
int ret;
/* start the tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -589,34 +694,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
ret = trace->init(tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return 0;
}
/* Sleep for a 1/10 of a second */
msleep(100);
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
tracing_start();
return ret;
}
#endif /* CONFIG_SYSPROF_TRACER */
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
{
unsigned long count;
int ret;
/* start the tracing */
ret = trace->init(tr);
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -631,6 +709,43 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
trace->reset(tr);
tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
ret = -1;
}
return ret;
}
#endif /* CONFIG_SYSPROF_TRACER */
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
{
unsigned long count;
int ret;
/* start the tracing */
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
}
/* Sleep for a 1/10 of a second */
msleep(100);
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
tracing_start();
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
ret = -1;
}
return ret;
}
#endif /* CONFIG_BRANCH_TRACER */

View File

@@ -245,16 +245,31 @@ static int trace_lookup_stack(struct seq_file *m, long i)
#endif
}
static void print_disabled(struct seq_file *m)
{
seq_puts(m, "#\n"
"# Stack tracer disabled\n"
"#\n"
"# To enable the stack tracer, either add 'stacktrace' to the\n"
"# kernel command line\n"
"# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
"#\n");
}
static int t_show(struct seq_file *m, void *v)
{
long i;
int size;
if (v == SEQ_START_TOKEN) {
seq_printf(m, " Depth Size Location"
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
" ----- ---- --------\n",
max_stack_trace.nr_entries);
if (!stack_tracer_enabled && !max_stack_size)
print_disabled(m);
return 0;
}

326
kernel/trace/trace_stat.c Normal file
View File

@@ -0,0 +1,326 @@
/*
* Infrastructure for statistic tracing (histogram output).
*
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*
* Based on the code from trace_branch.c which is
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*
*/
#include <linux/list.h>
#include <linux/debugfs.h>
#include "trace_stat.h"
#include "trace.h"
/* List of stat entries from a tracer */
struct trace_stat_list {
struct list_head list;
void *stat;
};
/* A stat session is the stats output in one file */
struct tracer_stat_session {
struct list_head session_list;
struct tracer_stat *ts;
struct list_head stat_list;
struct mutex stat_mutex;
struct dentry *file;
};
/* All of the sessions currently in use. Each stat file embed one session */
static LIST_HEAD(all_stat_sessions);
static DEFINE_MUTEX(all_stat_sessions_mutex);
/* The root directory for all stat files */
static struct dentry *stat_dir;
static void reset_stat_session(struct tracer_stat_session *session)
{
struct trace_stat_list *node, *next;
list_for_each_entry_safe(node, next, &session->stat_list, list)
kfree(node);
INIT_LIST_HEAD(&session->stat_list);
}
static void destroy_session(struct tracer_stat_session *session)
{
debugfs_remove(session->file);
reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
}
/*
* For tracers that don't provide a stat_cmp callback.
* This one will force an immediate insertion on tail of
* the list.
*/
static int dummy_cmp(void *p1, void *p2)
{
return 1;
}
/*
* Initialize the stat list at each trace_stat file opening.
* All of these copies and sorting are required on all opening
* since the stats could have changed between two file sessions.
*/
static int stat_seq_init(struct tracer_stat_session *session)
{
struct trace_stat_list *iter_entry, *new_entry;
struct tracer_stat *ts = session->ts;
void *stat;
int ret = 0;
int i;
mutex_lock(&session->stat_mutex);
reset_stat_session(session);
if (!ts->stat_cmp)
ts->stat_cmp = dummy_cmp;
stat = ts->stat_start();
if (!stat)
goto exit;
/*
* The first entry. Actually this is the second, but the first
* one (the stat_list head) is pointless.
*/
new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
if (!new_entry) {
ret = -ENOMEM;
goto exit;
}
INIT_LIST_HEAD(&new_entry->list);
list_add(&new_entry->list, &session->stat_list);
new_entry->stat = stat;
/*
* Iterate over the tracer stat entries and store them in a sorted
* list.
*/
for (i = 1; ; i++) {
stat = ts->stat_next(stat, i);
/* End of insertion */
if (!stat)
break;
new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
if (!new_entry) {
ret = -ENOMEM;
goto exit_free_list;
}
INIT_LIST_HEAD(&new_entry->list);
new_entry->stat = stat;
list_for_each_entry_reverse(iter_entry, &session->stat_list,
list) {
/* Insertion with a descendent sorting */
if (ts->stat_cmp(iter_entry->stat,
new_entry->stat) >= 0) {
list_add(&new_entry->list, &iter_entry->list);
break;
}
}
/* The current larger value */
if (list_empty(&new_entry->list))
list_add(&new_entry->list, &session->stat_list);
}
exit:
mutex_unlock(&session->stat_mutex);
return ret;
exit_free_list:
reset_stat_session(session);
mutex_unlock(&session->stat_mutex);
return ret;
}
static void *stat_seq_start(struct seq_file *s, loff_t *pos)
{
struct tracer_stat_session *session = s->private;
/* Prevent from tracer switch or stat_list modification */
mutex_lock(&session->stat_mutex);
/* If we are in the beginning of the file, print the headers */
if (!*pos && session->ts->stat_headers)
return SEQ_START_TOKEN;
return seq_list_start(&session->stat_list, *pos);
}
static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
{
struct tracer_stat_session *session = s->private;
if (p == SEQ_START_TOKEN)
return seq_list_start(&session->stat_list, *pos);
return seq_list_next(p, &session->stat_list, pos);
}
static void stat_seq_stop(struct seq_file *s, void *p)
{
struct tracer_stat_session *session = s->private;
mutex_unlock(&session->stat_mutex);
}
static int stat_seq_show(struct seq_file *s, void *v)
{
struct tracer_stat_session *session = s->private;
struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
if (v == SEQ_START_TOKEN)
return session->ts->stat_headers(s);
return session->ts->stat_show(s, l->stat);
}
static const struct seq_operations trace_stat_seq_ops = {
.start = stat_seq_start,
.next = stat_seq_next,
.stop = stat_seq_stop,
.show = stat_seq_show
};
/* The session stat is refilled and resorted at each stat file opening */
static int tracing_stat_open(struct inode *inode, struct file *file)
{
int ret;
struct tracer_stat_session *session = inode->i_private;
ret = seq_open(file, &trace_stat_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = session;
ret = stat_seq_init(session);
}
return ret;
}
/*
* Avoid consuming memory with our now useless list.
*/
static int tracing_stat_release(struct inode *i, struct file *f)
{
struct tracer_stat_session *session = i->i_private;
mutex_lock(&session->stat_mutex);
reset_stat_session(session);
mutex_unlock(&session->stat_mutex);
return 0;
}
static const struct file_operations tracing_stat_fops = {
.open = tracing_stat_open,
.read = seq_read,
.llseek = seq_lseek,
.release = tracing_stat_release
};
static int tracing_stat_init(void)
{
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
stat_dir = debugfs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
pr_warning("Could not create debugfs "
"'trace_stat' entry\n");
return 0;
}
static int init_stat_file(struct tracer_stat_session *session)
{
if (!stat_dir && tracing_stat_init())
return -ENODEV;
session->file = debugfs_create_file(session->ts->name, 0644,
stat_dir,
session, &tracing_stat_fops);
if (!session->file)
return -ENOMEM;
return 0;
}
int register_stat_tracer(struct tracer_stat *trace)
{
struct tracer_stat_session *session, *node, *tmp;
int ret;
if (!trace)
return -EINVAL;
if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
return -EINVAL;
/* Already registered? */
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
if (node->ts == trace) {
mutex_unlock(&all_stat_sessions_mutex);
return -EINVAL;
}
}
mutex_unlock(&all_stat_sessions_mutex);
/* Init the session */
session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
if (!session)
return -ENOMEM;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
INIT_LIST_HEAD(&session->stat_list);
mutex_init(&session->stat_mutex);
session->file = NULL;
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
return ret;
}
/* Register */
mutex_lock(&all_stat_sessions_mutex);
list_add_tail(&session->session_list, &all_stat_sessions);
mutex_unlock(&all_stat_sessions_mutex);
return 0;
}
void unregister_stat_tracer(struct tracer_stat *trace)
{
struct tracer_stat_session *node, *tmp;
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
if (node->ts == trace) {
list_del(&node->session_list);
destroy_session(node);
break;
}
}
mutex_unlock(&all_stat_sessions_mutex);
}

31
kernel/trace/trace_stat.h Normal file
View File

@@ -0,0 +1,31 @@
#ifndef __TRACE_STAT_H
#define __TRACE_STAT_H
#include <linux/seq_file.h>
/*
* If you want to provide a stat file (one-shot statistics), fill
* an iterator with stat_start/stat_next and a stat_show callbacks.
* The others callbacks are optional.
*/
struct tracer_stat {
/* The name of your stat file */
const char *name;
/* Iteration over statistic entries */
void *(*stat_start)(void);
void *(*stat_next)(void *prev, int idx);
/* Compare two entries for stats sorting */
int (*stat_cmp)(void *p1, void *p2);
/* Print a stat entry */
int (*stat_show)(struct seq_file *s, void *p);
/* Print the headers of your stat entries */
int (*stat_headers)(struct seq_file *s);
};
/*
* Destroy or create a stat file
*/
extern int register_stat_tracer(struct tracer_stat *trace);
extern void unregister_stat_tracer(struct tracer_stat *trace);
#endif /* __TRACE_STAT_H */

View File

@@ -0,0 +1,250 @@
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <asm/syscall.h>
#include "trace_output.h"
#include "trace.h"
/* Keep a counter of the syscall tracing users */
static int refcount;
/* Prevent from races on thread flags toggling */
static DEFINE_MUTEX(syscall_trace_lock);
/* Option to display the parameters types */
enum {
TRACE_SYSCALLS_OPT_TYPES = 0x1,
};
static struct tracer_opt syscalls_opts[] = {
{ TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
{ }
};
static struct tracer_flags syscalls_flags = {
.val = 0, /* By default: no parameters types */
.opts = syscalls_opts
};
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
int i, ret, syscall;
trace_assign_type(trace, ent);
syscall = trace->nr;
entry = syscall_nr_to_meta(syscall);
if (!entry)
goto end;
ret = trace_seq_printf(s, "%s(", entry->name);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
for (i = 0; i < entry->nb_args; i++) {
/* parameter types */
if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
ret = trace_seq_printf(s, "%s ", entry->types[i]);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* parameter values */
ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
trace->args[i],
i == entry->nb_args - 1 ? ")" : ",");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
end:
trace_seq_printf(s, "\n");
return TRACE_TYPE_HANDLED;
}
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
struct syscall_trace_exit *trace;
int syscall;
struct syscall_metadata *entry;
int ret;
trace_assign_type(trace, ent);
syscall = trace->nr;
entry = syscall_nr_to_meta(syscall);
if (!entry) {
trace_seq_printf(s, "\n");
return TRACE_TYPE_HANDLED;
}
ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
void start_ftrace_syscalls(void)
{
unsigned long flags;
struct task_struct *g, *t;
mutex_lock(&syscall_trace_lock);
/* Don't enable the flag on the tasks twice */
if (++refcount != 1)
goto unlock;
arch_init_ftrace_syscalls();
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, t) {
set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
} while_each_thread(g, t);
read_unlock_irqrestore(&tasklist_lock, flags);
unlock:
mutex_unlock(&syscall_trace_lock);
}
void stop_ftrace_syscalls(void)
{
unsigned long flags;
struct task_struct *g, *t;
mutex_lock(&syscall_trace_lock);
/* There are perhaps still some users */
if (--refcount)
goto unlock;
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
} while_each_thread(g, t);
read_unlock_irqrestore(&tasklist_lock, flags);
unlock:
mutex_unlock(&syscall_trace_lock);
}
void ftrace_syscall_enter(struct pt_regs *regs)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
int size;
int syscall_nr;
syscall_nr = syscall_get_nr(current, regs);
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
0, 0);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
trace_current_buffer_unlock_commit(event, 0, 0);
trace_wake_up();
}
void ftrace_syscall_exit(struct pt_regs *regs)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
int syscall_nr;
syscall_nr = syscall_get_nr(current, regs);
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
sizeof(*entry), 0, 0);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
trace_current_buffer_unlock_commit(event, 0, 0);
trace_wake_up();
}
static int init_syscall_tracer(struct trace_array *tr)
{
start_ftrace_syscalls();
return 0;
}
static void reset_syscall_tracer(struct trace_array *tr)
{
stop_ftrace_syscalls();
tracing_reset_online_cpus(tr);
}
static struct trace_event syscall_enter_event = {
.type = TRACE_SYSCALL_ENTER,
.trace = print_syscall_enter,
};
static struct trace_event syscall_exit_event = {
.type = TRACE_SYSCALL_EXIT,
.trace = print_syscall_exit,
};
static struct tracer syscall_tracer __read_mostly = {
.name = "syscall",
.init = init_syscall_tracer,
.reset = reset_syscall_tracer,
.flags = &syscalls_flags,
};
__init int register_ftrace_syscalls(void)
{
int ret;
ret = register_ftrace_event(&syscall_enter_event);
if (!ret) {
printk(KERN_WARNING "event %d failed to register\n",
syscall_enter_event.type);
WARN_ON_ONCE(1);
}
ret = register_ftrace_event(&syscall_exit_event);
if (!ret) {
printk(KERN_WARNING "event %d failed to register\n",
syscall_exit_event.type);
WARN_ON_ONCE(1);
}
return register_tracer(&syscall_tracer);
}
device_initcall(register_ftrace_syscalls);

View File

@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
}
const static struct stacktrace_ops backtrace_ops = {
static const struct stacktrace_ops backtrace_ops = {
.warning = backtrace_warning,
.warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
@@ -226,15 +226,6 @@ static void stop_stack_timers(void)
stop_stack_timer(cpu);
}
static void start_stack_trace(struct trace_array *tr)
{
mutex_lock(&sample_timer_lock);
tracing_reset_online_cpus(tr);
start_stack_timers();
tracer_enabled = 1;
mutex_unlock(&sample_timer_lock);
}
static void stop_stack_trace(struct trace_array *tr)
{
mutex_lock(&sample_timer_lock);
@@ -247,12 +238,18 @@ static int stack_trace_init(struct trace_array *tr)
{
sysprof_trace = tr;
start_stack_trace(tr);
tracing_start_cmdline_record();
mutex_lock(&sample_timer_lock);
start_stack_timers();
tracer_enabled = 1;
mutex_unlock(&sample_timer_lock);
return 0;
}
static void stack_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
stop_stack_trace(tr);
}
@@ -317,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
return cnt;
}
static struct file_operations sysprof_sample_fops = {
static const struct file_operations sysprof_sample_fops = {
.read = sysprof_sample_read,
.write = sysprof_sample_write,
};
@@ -330,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
d_tracer, NULL, &sysprof_sample_fops);
if (entry)
return;
pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
}

View File

@@ -0,0 +1,288 @@
/*
* Workqueue statistical tracer.
*
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*
*/
#include <trace/workqueue.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include "trace_stat.h"
#include "trace.h"
/* A cpu workqueue thread */
struct cpu_workqueue_stats {
struct list_head list;
/* Useful to know if we print the cpu headers */
bool first_entry;
int cpu;
pid_t pid;
/* Can be inserted from interrupt or user context, need to be atomic */
atomic_t inserted;
/*
* Don't need to be atomic, works are serialized in a single workqueue thread
* on a single CPU.
*/
unsigned int executed;
};
/* List of workqueue threads on one cpu */
struct workqueue_global_stats {
struct list_head list;
spinlock_t lock;
};
/* Don't need a global lock because allocated before the workqueues, and
* never freed.
*/
static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
/* Insertion of a work */
static void
probe_workqueue_insertion(struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
struct cpu_workqueue_stats *node, *next;
unsigned long flags;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
list) {
if (node->pid == wq_thread->pid) {
atomic_inc(&node->inserted);
goto found;
}
}
pr_debug("trace_workqueue: entry not found\n");
found:
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
}
/* Execution of a work */
static void
probe_workqueue_execution(struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
struct cpu_workqueue_stats *node, *next;
unsigned long flags;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
list) {
if (node->pid == wq_thread->pid) {
node->executed++;
goto found;
}
}
pr_debug("trace_workqueue: entry not found\n");
found:
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
}
/* Creation of a cpu workqueue thread */
static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
{
struct cpu_workqueue_stats *cws;
unsigned long flags;
WARN_ON(cpu < 0);
/* Workqueues are sometimes created in atomic context */
cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
if (!cws) {
pr_warning("trace_workqueue: not enough memory\n");
return;
}
INIT_LIST_HEAD(&cws->list);
cws->cpu = cpu;
cws->pid = wq_thread->pid;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
if (list_empty(&workqueue_cpu_stat(cpu)->list))
cws->first_entry = true;
list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
}
/* Destruction of a cpu workqueue thread */
static void probe_workqueue_destruction(struct task_struct *wq_thread)
{
/* Workqueue only execute on one cpu */
int cpu = cpumask_first(&wq_thread->cpus_allowed);
struct cpu_workqueue_stats *node, *next;
unsigned long flags;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
list) {
if (node->pid == wq_thread->pid) {
list_del(&node->list);
kfree(node);
goto found;
}
}
pr_debug("trace_workqueue: don't find workqueue to destroy\n");
found:
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
}
static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
{
unsigned long flags;
struct cpu_workqueue_stats *ret = NULL;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
if (!list_empty(&workqueue_cpu_stat(cpu)->list))
ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
struct cpu_workqueue_stats, list);
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
return ret;
}
static void *workqueue_stat_start(void)
{
int cpu;
void *ret = NULL;
for_each_possible_cpu(cpu) {
ret = workqueue_stat_start_cpu(cpu);
if (ret)
return ret;
}
return NULL;
}
static void *workqueue_stat_next(void *prev, int idx)
{
struct cpu_workqueue_stats *prev_cws = prev;
int cpu = prev_cws->cpu;
unsigned long flags;
void *ret = NULL;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
do {
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
return NULL;
} while (!(ret = workqueue_stat_start_cpu(cpu)));
return ret;
}
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
list);
}
static int workqueue_stat_show(struct seq_file *s, void *p)
{
struct cpu_workqueue_stats *cws = p;
unsigned long flags;
int cpu = cws->cpu;
struct pid *pid;
struct task_struct *tsk;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
seq_printf(s, "\n");
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
pid = find_get_pid(cws->pid);
if (pid) {
tsk = get_pid_task(pid, PIDTYPE_PID);
if (tsk) {
seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
atomic_read(&cws->inserted), cws->executed,
tsk->comm);
put_task_struct(tsk);
}
put_pid(pid);
}
return 0;
}
static int workqueue_stat_headers(struct seq_file *s)
{
seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
seq_printf(s, "# | | | |\n");
return 0;
}
struct tracer_stat workqueue_stats __read_mostly = {
.name = "workqueues",
.stat_start = workqueue_stat_start,
.stat_next = workqueue_stat_next,
.stat_show = workqueue_stat_show,
.stat_headers = workqueue_stat_headers
};
int __init stat_workqueue_init(void)
{
if (register_stat_tracer(&workqueue_stats)) {
pr_warning("Unable to register workqueue stat tracer\n");
return 1;
}
return 0;
}
fs_initcall(stat_workqueue_init);
/*
* Workqueues are created very early, just after pre-smp initcalls.
* So we must register our tracepoints at this stage.
*/
int __init trace_workqueue_early_init(void)
{
int ret, cpu;
ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
if (ret)
goto out;
ret = register_trace_workqueue_execution(probe_workqueue_execution);
if (ret)
goto no_insertion;
ret = register_trace_workqueue_creation(probe_workqueue_creation);
if (ret)
goto no_execution;
ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
if (ret)
goto no_creation;
for_each_possible_cpu(cpu) {
spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
}
return 0;
no_creation:
unregister_trace_workqueue_creation(probe_workqueue_creation);
no_execution:
unregister_trace_workqueue_execution(probe_workqueue_execution);
no_insertion:
unregister_trace_workqueue_insertion(probe_workqueue_insertion);
out:
pr_warning("trace_workqueue: unable to trace workqueues\n");
return 1;
}
early_initcall(trace_workqueue_early_init);

View File

@@ -272,12 +272,15 @@ static void disable_tracepoint(struct tracepoint *elem)
*
* Updates the probe callback corresponding to a range of tracepoints.
*/
void tracepoint_update_probe_range(struct tracepoint *begin,
struct tracepoint *end)
void
tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
{
struct tracepoint *iter;
struct tracepoint_entry *mark_entry;
if (!begin)
return;
mutex_lock(&tracepoints_mutex);
for (iter = begin; iter < end; iter++) {
mark_entry = get_tracepoint(iter->name);

View File

@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
up_write(&uts_sem);
}
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_PROC_SYSCTL
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????

View File

@@ -33,6 +33,7 @@
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <trace/workqueue.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
@@ -48,8 +49,6 @@ struct cpu_workqueue_struct {
struct workqueue_struct *wq;
struct task_struct *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
} ____cacheline_aligned;
/*
@@ -125,9 +124,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
}
DEFINE_TRACE(workqueue_insertion);
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, struct list_head *head)
{
trace_workqueue_insertion(cwq->thread, work);
set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
@@ -259,16 +262,11 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
DEFINE_TRACE(workqueue_execution);
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
spin_lock_irq(&cwq->lock);
cwq->run_depth++;
if (cwq->run_depth > 3) {
/* morton gets to eat his hat */
printk("%s: recursion depth exceeded: %d\n",
__func__, cwq->run_depth);
dump_stack();
}
while (!list_empty(&cwq->worklist)) {
struct work_struct *work = list_entry(cwq->worklist.next,
struct work_struct, entry);
@@ -284,7 +282,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
*/
struct lockdep_map lockdep_map = work->lockdep_map;
#endif
trace_workqueue_execution(cwq->thread, work);
cwq->current_work = work;
list_del_init(cwq->worklist.next);
spin_unlock_irq(&cwq->lock);
@@ -311,7 +309,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
spin_lock_irq(&cwq->lock);
cwq->current_work = NULL;
}
cwq->run_depth--;
spin_unlock_irq(&cwq->lock);
}
@@ -368,29 +365,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
int active;
int active = 0;
struct wq_barrier barr;
if (cwq->thread == current) {
/*
* Probably keventd trying to flush its own queue. So simply run
* it by hand rather than deadlocking.
*/
run_workqueue(cwq);
WARN_ON(cwq->thread == current);
spin_lock_irq(&cwq->lock);
if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
insert_wq_barrier(cwq, &barr, &cwq->worklist);
active = 1;
} else {
struct wq_barrier barr;
active = 0;
spin_lock_irq(&cwq->lock);
if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
insert_wq_barrier(cwq, &barr, &cwq->worklist);
active = 1;
}
spin_unlock_irq(&cwq->lock);
if (active)
wait_for_completion(&barr.done);
}
spin_unlock_irq(&cwq->lock);
if (active)
wait_for_completion(&barr.done);
return active;
}
@@ -765,6 +753,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
return cwq;
}
DEFINE_TRACE(workqueue_creation);
static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -787,6 +777,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
cwq->thread = p;
trace_workqueue_creation(cwq->thread, cpu);
return 0;
}
@@ -868,6 +860,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
}
EXPORT_SYMBOL_GPL(__create_workqueue_key);
DEFINE_TRACE(workqueue_destruction);
static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
{
/*
@@ -891,6 +885,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
* checks list_empty(), and a "normal" queue_work() can't use
* a dead CPU.
*/
trace_workqueue_destruction(cwq->thread);
kthread_stop(cwq->thread);
cwq->thread = NULL;
}