Merge tag 'v3.16-rc5' into timers/core

Reason: Bring in upstream modifications, so the pending changes which
depend on them can be queued.
This commit is contained in:
Thomas Gleixner
2014-07-16 21:57:38 +02:00
4678 changed files with 184249 additions and 90401 deletions

View File

@@ -223,3 +223,10 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
config ARCH_USE_QUEUE_RWLOCK
bool
config QUEUE_RWLOCK
def_bool y if ARCH_USE_QUEUE_RWLOCK
depends on SMP

View File

@@ -423,6 +423,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
consume_skb(skb);
}
/*
* kauditd_send_multicast_skb - send the skb to multicast userspace listeners
*
* This function doesn't consume an skb as might be expected since it has to
* copy it anyways.
*/
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
struct sk_buff *copy;
struct audit_net *aunet = net_generic(&init_net, audit_net_id);
struct sock *sock = aunet->nlsk;
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
/*
* The seemingly wasteful skb_copy() rather than bumping the refcount
* using skb_get() is necessary because non-standard mods are made to
* the skb by the original kaudit unicast socket send routine. The
* existing auditd daemon assumes this breakage. Fixing this would
* require co-ordinating a change in the established protocol between
* the kaudit kernel subsystem and the auditd userspace code. There is
* no reason for new multicast clients to continue with this
* non-compliance.
*/
copy = skb_copy(skb, GFP_KERNEL);
if (!copy)
return;
nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}
/*
* flush_hold_queue - empty the hold queue if auditd appears
*
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
/*
 * audit_bind - bind callback installed on the audit netlink socket
 * @group: multicast group named in the request (not inspected here)
 *
 * Runs on netlink socket group connect or bind requests.  The request is
 * allowed only when capable(CAP_AUDIT_READ) holds for the caller.
 *
 * Returns 0 when permitted, -EPERM otherwise.
 */
static int audit_bind(int group)
{
	return capable(CAP_AUDIT_READ) ? 0 : -EPERM;
}
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
.bind = audit_bind,
.flags = NL_CFG_F_NONROOT_RECV,
.groups = AUDIT_NLGRP_MAX,
};
struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1901,10 +1945,10 @@ out:
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
* The netlink_* functions cannot be called inside an irq context, so
* the audit buffer is placed on a queue and a tasklet is scheduled to
* remove them from the queue outside the irq context. May be called in
* any context.
* netlink_unicast() cannot be called inside an irq context because it blocks
* (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
* on a queue and a tasklet is scheduled to remove them from the queue outside
* the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
kauditd_send_multicast_skb(ab->skb);
/*
* The original kaudit unicast socket sends up messages with
* nlmsg_len set to the payload length rather than the entire
* message length. This breaks the standard set by netlink.
* The existing auditd daemon assumes this breakage. Fixing
* this would require co-ordinating a change in the established
* protocol between the kaudit kernel subsystem and the auditd
* userspace code.
*/
nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {

View File

@@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
return AUDIT_BUILD_CONTEXT;
}
/*
 * audit_in_mask - look a value up in a rule's mask[] array
 * @rule: rule whose mask[] words are consulted
 * @val: value to test
 *
 * Returns 0 when @val is larger than 0xffffffff or when AUDIT_WORD(@val)
 * falls outside mask[] (>= AUDIT_BITMASK_SIZE).  Otherwise returns the
 * selected mask word ANDed with AUDIT_BIT(@val).
 */
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
{
	int idx, flag;

	if (val <= 0xffffffff) {
		idx = AUDIT_WORD(val);
		if (idx < AUDIT_BITMASK_SIZE) {
			flag = AUDIT_BIT(val);
			return rule->mask[idx] & flag;
		}
	}

	/* Out of range: never index past the end of mask[]. */
	return 0;
}
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
@@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
int word = AUDIT_WORD(ctx->major);
int bit = AUDIT_BIT(ctx->major);
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
rcu_read_unlock();
@@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
struct audit_context *ctx) {
int word, bit;
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
struct audit_entry *e;
enum audit_state state;
word = AUDIT_WORD(ctx->major);
bit = AUDIT_BIT(ctx->major);
if (list_empty(list))
return 0;
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
ctx->current_state = state;
return 1;

View File

@@ -424,23 +424,19 @@ bool capable(int cap)
EXPORT_SYMBOL(capable);
/**
* inode_capable - Check superior capability over inode
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
*
* Return true if the current task has the given superior capability
* targeted at its own user namespace and that the given inode is owned
* by the current user namespace or a child namespace.
*
* Currently we check to see if an inode is owned by the current
* user namespace by seeing if the inode's owner maps into the
* current user namespace.
*
* Return true if the current task has the given capability targeted at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool inode_capable(const struct inode *inode, int cap)
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
kgid_has_mapping(ns, inode->i_gid);
}
EXPORT_SYMBOL(inode_capable);
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

File diff suppressed because it is too large Load Diff

View File

@@ -59,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)
static struct freezer *parent_freezer(struct freezer *freezer)
{
return css_freezer(css_parent(&freezer->css));
return css_freezer(freezer->css.parent);
}
bool cgroup_freezing(struct task_struct *task)
@@ -73,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
return ret;
}
/*
* cgroups_write_string() limits the size of freezer state strings to
* CGROUP_LOCAL_BUFFER_SIZE
*/
static const char *freezer_state_strs(unsigned int state)
{
if (state & CGROUP_FROZEN)
@@ -304,7 +300,7 @@ static int freezer_read(struct seq_file *m, void *v)
/* update states bottom-up */
css_for_each_descendant_post(pos, css) {
if (!css_tryget(pos))
if (!css_tryget_online(pos))
continue;
rcu_read_unlock();
@@ -404,7 +400,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
if (!css_tryget(pos))
if (!css_tryget_online(pos))
continue;
rcu_read_unlock();
@@ -423,20 +419,22 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
mutex_unlock(&freezer_mutex);
}
static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
char *buffer)
static ssize_t freezer_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
bool freeze;
if (strcmp(buffer, freezer_state_strs(0)) == 0)
buf = strstrip(buf);
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
freeze = true;
else
return -EINVAL;
freezer_change_state(css_freezer(css), freeze);
return 0;
freezer_change_state(css_freezer(of_css(of)), freeze);
return nbytes;
}
static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -460,7 +458,7 @@ static struct cftype files[] = {
.name = "state",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = freezer_read,
.write_string = freezer_write,
.write = freezer_write,
},
{
.name = "self_freezing",

View File

@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
}
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
#ifdef CONFIG_PREEMPT
/**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
}
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_user_exit);
/**
* __context_tracking_task_switch - context switch the syscall callbacks

View File

@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <trace/events/power.h>
#include "smpboot.h"
@@ -520,7 +521,9 @@ int disable_nonboot_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
@@ -563,7 +566,9 @@ void __ref enable_nonboot_cpus(void)
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
continue;

View File

@@ -119,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
return css_cs(css_parent(&cs->css));
return css_cs(cs->css.parent);
}
#ifdef CONFIG_NUMA
@@ -691,11 +691,8 @@ restart:
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
printk(KERN_WARNING
"rebuild_sched_domains confused:"
" nslot %d, ndoms %d, csn %d, i %d,"
" apn %d\n",
nslot, ndoms, csn, i, apn);
pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
@@ -870,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
continue;
}
}
if (!css_tryget(&cp->css))
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
@@ -885,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1105,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
continue;
}
}
if (!css_tryget(&cp->css))
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
@@ -1183,7 +1181,13 @@ done:
int current_cpuset_is_being_rebound(void)
{
return task_cs(current) == cpuset_being_rebound;
int ret;
rcu_read_lock();
ret = task_cs(current) == cpuset_being_rebound;
rcu_read_unlock();
return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1600,13 +1604,15 @@ out_unlock:
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
static int cpuset_write_resmask(struct cgroup_subsys_state *css,
struct cftype *cft, char *buf)
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(css);
struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
int retval = -ENODEV;
buf = strstrip(buf);
/*
* CPU or memory hotunplug may leave @cs w/o any execution
* resources, in which case the hotplug code asynchronously updates
@@ -1617,7 +1623,17 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
* resources, wait for the previously scheduled operations before
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
*
* cpuset_hotplug_work calls back into cgroup core via
* cgroup_transfer_tasks() and waiting for it from a cgroupfs
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
* grabbing cpuset_mutex anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
mutex_lock(&cpuset_mutex);
@@ -1630,7 +1646,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
goto out_unlock;
}
switch (cft->private) {
switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
@@ -1645,7 +1661,9 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
return retval;
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
return retval ?: nbytes;
}
/*
@@ -1747,7 +1765,7 @@ static struct cftype files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
.write_string = cpuset_write_resmask,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
@@ -1755,7 +1773,7 @@ static struct cftype files[] = {
{
.name = "mems",
.seq_show = cpuset_common_seq_show,
.write_string = cpuset_write_resmask,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
@@ -2011,7 +2029,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
parent = parent_cs(parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
pr_cont_cgroup_name(cs->css.cgroup);
pr_cont("\n");
}
@@ -2149,7 +2167,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (cs == &top_cpuset || !css_tryget(&cs->css))
if (cs == &top_cpuset || !css_tryget_online(&cs->css))
continue;
rcu_read_unlock();
@@ -2530,7 +2548,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @task: pointer to task_struct of some task.
* @tsk: pointer to task_struct of some task.
*
* Description: Prints @tsk's name, cpuset name, and cached copy of its
* mems_allowed to the kernel log.
@@ -2548,7 +2566,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
cgrp = task_cs(tsk)->css.cgroup;
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=", tsk->comm);
pr_info("%s cpuset=", tsk->comm);
pr_cont_cgroup_name(cgrp);
pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
@@ -2640,10 +2658,10 @@ out:
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Mems_allowed:\t");
seq_puts(m, "Mems_allowed:\t");
seq_nodemask(m, &task->mems_allowed);
seq_printf(m, "\n");
seq_printf(m, "Mems_allowed_list:\t");
seq_puts(m, "\n");
seq_puts(m, "Mems_allowed_list:\t");
seq_nodemask_list(m, &task->mems_allowed);
seq_printf(m, "\n");
seq_puts(m, "\n");
}

View File

@@ -40,6 +40,7 @@
#include <linux/mm_types.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/mman.h>
#include "internal.h"
@@ -608,7 +609,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
css = css_tryget_online_from_dir(f.file->f_dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -2973,6 +2975,22 @@ out:
local_irq_restore(flags);
}
/*
 * perf_event_exec - run perf_event_enable_on_exec() for the current task
 *
 * Walks every context slot of current->perf_event_ctxp[] inside an RCU
 * read-side section and hands each non-NULL context to
 * perf_event_enable_on_exec().
 */
void perf_event_exec(void)
{
	int ctxn;

	rcu_read_lock();
	for_each_task_context_nr(ctxn) {
		struct perf_event_context *ctx = current->perf_event_ctxp[ctxn];

		/* Empty slots are skipped. */
		if (ctx)
			perf_event_enable_on_exec(ctx);
	}
	rcu_read_unlock();
}
/*
* Cross CPU call to read the hardware event
*/
@@ -5074,21 +5092,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
NULL);
}
void perf_event_comm(struct task_struct *task)
void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
struct perf_event_context *ctx;
int ctxn;
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
continue;
perf_event_enable_on_exec(ctx);
}
rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -5100,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
.misc = 0,
.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
@@ -5123,6 +5129,7 @@ struct perf_mmap_event {
int maj, min;
u64 ino;
u64 ino_generation;
u32 prot, flags;
struct {
struct perf_event_header header;
@@ -5164,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.header.size += sizeof(mmap_event->min);
mmap_event->event_id.header.size += sizeof(mmap_event->ino);
mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
mmap_event->event_id.header.size += sizeof(mmap_event->prot);
mmap_event->event_id.header.size += sizeof(mmap_event->flags);
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5182,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_put(&handle, mmap_event->min);
perf_output_put(&handle, mmap_event->ino);
perf_output_put(&handle, mmap_event->ino_generation);
perf_output_put(&handle, mmap_event->prot);
perf_output_put(&handle, mmap_event->flags);
}
__output_copy(&handle, mmap_event->file_name,
@@ -5200,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
struct file *file = vma->vm_file;
int maj = 0, min = 0;
u64 ino = 0, gen = 0;
u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
@@ -5230,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
if (vma->vm_flags & VM_WRITE)
prot |= PROT_WRITE;
if (vma->vm_flags & VM_EXEC)
prot |= PROT_EXEC;
if (vma->vm_flags & VM_MAYSHARE)
flags = MAP_SHARED;
else
flags = MAP_PRIVATE;
if (vma->vm_flags & VM_DENYWRITE)
flags |= MAP_DENYWRITE;
if (vma->vm_flags & VM_MAYEXEC)
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
if (vma->vm_flags & VM_HUGETLB)
flags |= MAP_HUGETLB;
goto got_name;
} else {
name = (char *)arch_vma_name(vma);
@@ -5270,6 +5304,8 @@ got_name:
mmap_event->min = min;
mmap_event->ino = ino;
mmap_event->ino_generation = gen;
mmap_event->prot = prot;
mmap_event->flags = flags;
if (!(vma->vm_flags & VM_EXEC))
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5310,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .min (attr_mmap2 only) */
/* .ino (attr_mmap2 only) */
/* .ino_generation (attr_mmap2 only) */
/* .prot (attr_mmap2 only) */
/* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
@@ -6892,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
/* disabled for now */
if (attr->mmap2)
return -EINVAL;
if (attr->__reserved_1)
return -EINVAL;
@@ -7121,6 +7155,13 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
err = -ENOTSUPP;
goto err_alloc;
}
}
account_event(event);
/*
@@ -7432,7 +7473,7 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event *child_event;
struct perf_event *child_event, *next;
struct perf_event_context *child_ctx;
unsigned long flags;
@@ -7486,7 +7527,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
mutex_lock(&child_ctx->mutex);
list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry)
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);

View File

@@ -36,6 +36,7 @@
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/uprobes.h>
@@ -127,7 +128,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
if (is_register)
flags |= VM_WRITE;
@@ -279,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* supported by that architecture then we need to modify is_trap_at_addr and
* uprobe_write_opcode accordingly. This would never be a problem for archs
* that have fixed length instructions.
*/
/*
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
* Called with mm->mmap_sem held (for read and with a reference to
* mm).
*
* For mm @mm, write the opcode at @vaddr.
* Called with mm->mmap_sem held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
@@ -310,21 +306,25 @@ retry:
if (ret <= 0)
goto put_old;
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
goto put_old;
__SetPageUptodate(new_page);
if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
goto put_new;
__SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = anon_vma_prepare(vma);
if (ret)
goto put_new;
ret = __replace_page(vma, vaddr, old_page, new_page);
if (ret)
mem_cgroup_uncharge_page(new_page);
put_new:
page_cache_release(new_page);
@@ -537,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
if (!mapping->a_ops->readpage)
return -EIO;
/*
* Ensure that the page that has the original instruction is
* populated and in page-cache.
* Ensure that the page that has the original instruction is populated
* and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
if (mapping->a_ops->readpage)
page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -845,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
{
int err;
if (!consumer_del(uprobe, uc)) /* WARN? */
if (WARN_ON(!consumer_del(uprobe, uc)))
return;
err = register_for_each_vma(uprobe, NULL);
@@ -880,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
@@ -923,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
if (!uprobe)
if (WARN_ON(!uprobe))
return ret;
down_write(&uprobe->register_rwsem);
@@ -948,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset);
if (!uprobe)
if (WARN_ON(!uprobe))
return;
down_write(&uprobe->register_rwsem);
@@ -1361,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
if (unlikely(utask && utask->active_uprobe))
return utask->vaddr;
return instruction_pointer(regs);
}
/*
* Called with no locks held.
* Called in the context of an exiting or exec-ing thread.

View File

@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)

View File

@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_ior);
/*
 * __gcov_merge_time_profile - exported no-op merge callback
 * @counters: counter array (ignored)
 * @n_counters: number of entries in @counters (ignored)
 *
 * The body is intentionally empty; only the exported symbol matters.
 * NOTE(review): presumably exists so that objects instrumented by a gcc
 * which references this symbol can link — confirm against the gcc version
 * checks in the gcov code.
 */
void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
{
	/* Unused. */
}
EXPORT_SYMBOL(__gcov_merge_time_profile);
/**
* gcov_enable_events - enable event reporting through gcov_event()
*

View File

@@ -18,7 +18,12 @@
#include <linux/vmalloc.h>
#include "gcov.h"
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
#else
#define GCOV_COUNTERS 8
#endif
#define GCOV_TAG_FUNCTION_LENGTH 3
static struct gcov_info *gcov_info_head;

View File

@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
*/
void irq_free_hwirqs(unsigned int from, int cnt)
{
int i;
int i, j;
for (i = from; cnt > 0; i++, cnt--) {
for (i = from, j = cnt; j > 0; i++, j--) {
irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
arch_teardown_hwirq(i);
}

View File

@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
arch_crash_save_vmcoreinfo();

View File

@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
return &(kretprobe_table_locks[hash].lock);
}
/*
* Normally, functions that we'd want to prohibit kprobes in, are marked
* __kprobes. But, there are cases where such functions already belong to
* a different section (__sched for preempt_schedule)
*
* For such cases, we now have a blacklist
*/
static struct kprobe_blackpoint kprobe_blacklist[] = {
{"preempt_schedule",},
{"native_get_debugreg",},
{"irq_entries_start",},
{"common_interrupt",},
{"mcount",}, /* mcount can be called from everywhere */
{NULL} /* Terminator */
};
/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
};
static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
@@ -214,7 +201,7 @@ out:
}
/* Return 1 if all garbages are collected, otherwise 0. */
static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
return 0;
}
static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip, *next;
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
kprobe_opcode_t *slot, int dirty)
void __free_insn_slot(struct kprobe_insn_cache *c,
kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void)
* OR
* - with preemption disabled - from arch/xxx/kernel/kprobes.c
*/
struct kprobe __kprobes *get_kprobe(void *addr)
struct kprobe *get_kprobe(void *addr)
{
struct hlist_head *head;
struct kprobe *p;
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);
static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization;
* Call all pre_handler on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
}
}
NOKPROBE_SYMBOL(opt_pre_handler);
/* Free optimized instructions and optimized_kprobe */
static __kprobes void free_aggr_kprobe(struct kprobe *p)
static void free_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
}
/* Return true(!0) if the probe is queued on (un)optimizing lists */
static int __kprobes kprobe_queued(struct kprobe *p)
static int kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
*/
static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
int i;
struct kprobe *p = NULL;
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
* Optimize (replace a breakpoint with a jump) kprobes listed on
* optimizing_list.
*/
static __kprobes void do_optimize_kprobes(void)
static void do_optimize_kprobes(void)
{
/* Optimization never be done when disarmed */
if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
* if need) kprobes listed on unoptimizing_list.
*/
static __kprobes void do_unoptimize_kprobes(void)
static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void)
}
/* Reclaim all kprobes on the free_list */
static __kprobes void do_free_cleaned_kprobes(void)
static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void)
}
/* Start optimizer after OPTIMIZE_DELAY passed */
static __kprobes void kick_kprobe_optimizer(void)
static void kick_kprobe_optimizer(void)
{
schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
static __kprobes void kprobe_optimizer(struct work_struct *work)
static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
/* Lock modules while optimizing kprobes */
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
static __kprobes void wait_for_kprobe_optimizer(void)
static void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void)
}
/* Optimize kprobe if p is ready to be optimized */
static __kprobes void optimize_kprobe(struct kprobe *p)
static void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
}
/* Short cut to direct unoptimizing */
static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
get_online_cpus();
arch_unoptimize_kprobe(op);
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
}
/* Unoptimize a kprobe if p is optimized */
static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
}
/* Remove optimized instructions */
static void __kprobes kill_optimized_kprobe(struct kprobe *p)
static void kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
}
/* Try to prepare optimized instructions */
static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
return &op->kp;
}
static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
* Prepare an optimized_kprobe and optimize it
* NOTE: p must be a normal registered kprobe
*/
static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
@@ -787,7 +776,7 @@ out:
}
#ifdef CONFIG_SYSCTL
static void __kprobes optimize_all_kprobes(void)
static void optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -810,7 +799,7 @@ out:
mutex_unlock(&kprobe_mutex);
}
static void __kprobes unoptimize_all_kprobes(void)
static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
#endif /* CONFIG_SYSCTL */
/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __kprobes __arm_kprobe(struct kprobe *p)
static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
}
/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap)
BUG_ON(kprobe_unused(ap));
}
static __kprobes void free_aggr_kprobe(struct kprobe *p)
static void free_aggr_kprobe(struct kprobe *p)
{
arch_remove_kprobe(p);
kfree(p);
}
static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
static int __kprobes prepare_kprobe(struct kprobe *p)
static int prepare_kprobe(struct kprobe *p)
{
if (!kprobe_ftrace(p))
return arch_prepare_kprobe(p);
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
static void arm_kprobe_ftrace(struct kprobe *p)
{
int ret;
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
static void disarm_kprobe_ftrace(struct kprobe *p)
{
int ret;
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
#endif
/* Arm a kprobe with text_mutex */
static void __kprobes arm_kprobe(struct kprobe *kp)
static void arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp))) {
arm_kprobe_ftrace(kp);
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
}
/* Disarm a kprobe with text_mutex */
static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
static void disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp))) {
disarm_kprobe_ftrace(kp);
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);
static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
struct kprobe *kp;
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
}
}
NOKPROBE_SYMBOL(aggr_post_handler);
static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
int trapnr)
static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
int trapnr)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
}
return 0;
}
NOKPROBE_SYMBOL(aggr_fault_handler);
static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
int ret = 0;
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
return ret;
}
NOKPROBE_SYMBOL(aggr_break_handler);
/* Walks the list and increments nmissed count for multiprobe case */
void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
if (!kprobe_aggrprobe(p)) {
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
void recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
{
struct kretprobe *rp = ri->rp;
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
/* Unregistering */
hlist_add_head(&ri->hlist, head);
}
NOKPROBE_SYMBOL(recycle_rp_inst);
void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
void kretprobe_hash_lock(struct task_struct *tsk,
struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_lock);
static void __kprobes kretprobe_table_lock(unsigned long hash,
unsigned long *flags)
static void kretprobe_table_lock(unsigned long hash,
unsigned long *flags)
__acquires(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_lock);
void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
unsigned long *flags)
void kretprobe_hash_unlock(struct task_struct *tsk,
unsigned long *flags)
__releases(hlist_lock)
{
unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1126,14 +1123,16 @@ __releases(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_unlock);
static void __kprobes kretprobe_table_unlock(unsigned long hash,
unsigned long *flags)
static void kretprobe_table_unlock(unsigned long hash,
unsigned long *flags)
__releases(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
/*
* This function is called from finish_task_switch when task tk becomes dead,
@@ -1141,7 +1140,7 @@ __releases(hlist_lock)
* with this task. These left over instances represent probed functions
* that have been called but will never return.
*/
void __kprobes kprobe_flush_task(struct task_struct *tk)
void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
kfree(ri);
}
}
NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
}
}
static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
static void cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
}
free_rp_inst(rp);
}
NOKPROBE_SYMBOL(cleanup_rp_inst);
/*
* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
/* Copy p's insn slot to ap */
copy_kprobe(p, ap);
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
struct kprobe *p)
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
int ret = 0;
struct kprobe *ap = orig_p;
@@ -1324,25 +1324,29 @@ out:
return ret;
}
static int __kprobes in_kprobes_functions(unsigned long addr)
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blackpoint *kb;
/* The __kprobes marked functions and entry code must not be probed */
return addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end;
}
if (addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end)
return -EINVAL;
static bool within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
if (arch_within_kprobe_blacklist(addr))
return true;
/*
* If there exists a kprobe_blacklist, verify and
* fail any probe registration in the prohibited area
*/
for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
if (kb->start_addr) {
if (addr >= kb->start_addr &&
addr < (kb->start_addr + kb->range))
return -EINVAL;
}
list_for_each_entry(ent, &kprobe_blacklist, list) {
if (addr >= ent->start_addr && addr < ent->end_addr)
return true;
}
return 0;
return false;
}
/*
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
kprobe_opcode_t *addr = p->addr;
@@ -1374,7 +1378,7 @@ invalid:
}
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
static __kprobes int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
int ret = 0;
unsigned long ftrace_addr;
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr)) {
ret = -EINVAL;
goto out;
@@ -1469,7 +1473,7 @@ out:
return ret;
}
int __kprobes register_kprobe(struct kprobe *p)
int register_kprobe(struct kprobe *p)
{
int ret;
struct kprobe *old_p;
@@ -1531,7 +1535,7 @@ out:
EXPORT_SYMBOL_GPL(register_kprobe);
/* Check if all probes on the aggrprobe are disabled */
static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
}
/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
/*
* Unregister a kprobe without a scheduler synchronization.
*/
static int __kprobes __unregister_kprobe_top(struct kprobe *p)
static int __unregister_kprobe_top(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1631,7 +1635,7 @@ disarmed:
return 0;
}
static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
static void __unregister_kprobe_bottom(struct kprobe *p)
{
struct kprobe *ap;
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
/* Otherwise, do nothing. */
}
int __kprobes register_kprobes(struct kprobe **kps, int num)
int register_kprobes(struct kprobe **kps, int num)
{
int i, ret = 0;
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(register_kprobes);
void __kprobes unregister_kprobe(struct kprobe *p)
void unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
void __kprobes unregister_kprobes(struct kprobe **kps, int num)
void unregister_kprobes(struct kprobe **kps, int num)
{
int i;
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
int __kprobes register_jprobes(struct jprobe **jps, int num)
int register_jprobes(struct jprobe **jps, int num)
{
struct jprobe *jp;
int ret = 0, i;
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
EXPORT_SYMBOL_GPL(register_jprobes);
int __kprobes register_jprobe(struct jprobe *jp)
int register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(register_jprobe);
void __kprobes unregister_jprobe(struct jprobe *jp)
void unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(unregister_jprobe);
void __kprobes unregister_jprobes(struct jprobe **jps, int num)
void unregister_jprobes(struct jprobe **jps, int num)
{
int i;
@@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*/
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct pt_regs *regs)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
unsigned long hash, flags = 0;
@@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
}
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
int __kprobes register_kretprobe(struct kretprobe *rp)
int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
@@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
int register_kretprobes(struct kretprobe **rps, int num)
{
int ret = 0, i;
@@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
EXPORT_SYMBOL_GPL(register_kretprobes);
void __kprobes unregister_kretprobe(struct kretprobe *rp)
void unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
void unregister_kretprobes(struct kretprobe **rps, int num)
{
int i;
@@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int __kprobes register_kretprobe(struct kretprobe *rp)
int register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
int register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
void __kprobes unregister_kretprobe(struct kretprobe *rp)
void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct pt_regs *regs)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
#endif /* CONFIG_KRETPROBES */
/* Set the kprobe gone and remove its instruction buffer. */
static void __kprobes kill_kprobe(struct kprobe *p)
static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
@@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
}
/* Disable one kprobe */
int __kprobes disable_kprobe(struct kprobe *kp)
int disable_kprobe(struct kprobe *kp)
{
int ret = 0;
@@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
EXPORT_SYMBOL_GPL(disable_kprobe);
/* Enable one kprobe */
int __kprobes enable_kprobe(struct kprobe *kp)
int enable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
@@ -2012,16 +2016,49 @@ out:
}
EXPORT_SYMBOL_GPL(enable_kprobe);
void __kprobes dump_kprobe(struct kprobe *kp)
void dump_kprobe(struct kprobe *kp)
{
printk(KERN_WARNING "Dumping kprobe:\n");
printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
kp->symbol_name, kp->addr, kp->offset);
}
NOKPROBE_SYMBOL(dump_kprobe);
/*
 * Build kprobe_blacklist from a table of function addresses.
 *
 * @start: first slot of the address table
 * @end:   one past the last slot of the address table
 *
 * Each address is resolved through kallsyms to obtain the size of the
 * symbol, and the resulting [address, address + size) range is appended
 * to kprobe_blacklist. A whole range is recorded, rather than just the
 * entry address as for the kretprobe blacklist, because a kprobe need
 * not be placed at the beginning of a function.
 *
 * Addresses that kallsyms cannot resolve are reported and skipped.
 *
 * Returns 0 on success, or -ENOMEM if an entry cannot be allocated; in
 * that case the entries already appended stay on the list.
 */
static int __init populate_kprobe_blacklist(unsigned long *start,
					     unsigned long *end)
{
	unsigned long sym_size = 0, sym_offset = 0;
	struct kprobe_blacklist_entry *entry;
	unsigned long *cursor;

	for (cursor = start; cursor < end; cursor++) {
		unsigned long addr = *cursor;

		if (kallsyms_lookup_size_offset(addr, &sym_size, &sym_offset)) {
			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
			if (!entry)
				return -ENOMEM;

			entry->start_addr = addr;
			entry->end_addr = addr + sym_size;
			INIT_LIST_HEAD(&entry->list);
			list_add_tail(&entry->list, &kprobe_blacklist);
		} else {
			pr_err("Failed to find blacklist %p\n", (void *)addr);
		}
	}

	return 0;
}
/* Module notifier call back, checking kprobes on the module */
static int __kprobes kprobes_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
static int kprobes_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
{
struct module *mod = data;
struct hlist_head *head;
@@ -2062,14 +2099,13 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
/* Markers of _kprobe_blacklist section */
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];
static int __init init_kprobes(void)
{
int i, err = 0;
unsigned long offset = 0, size = 0;
char *modname, namebuf[KSYM_NAME_LEN];
const char *symbol_name;
void *addr;
struct kprobe_blackpoint *kb;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
@@ -2079,26 +2115,11 @@ static int __init init_kprobes(void)
raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
}
/*
* Lookup and populate the kprobe_blacklist.
*
* Unlike the kretprobe blacklist, we'll need to determine
* the range of addresses that belong to the said functions,
* since a kprobe need not necessarily be at the beginning
* of a function.
*/
for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
kprobe_lookup_name(kb->name, addr);
if (!addr)
continue;
kb->start_addr = (unsigned long)addr;
symbol_name = kallsyms_lookup(kb->start_addr,
&size, &offset, &modname, namebuf);
if (!symbol_name)
kb->range = 0;
else
kb->range = size;
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
if (err) {
pr_err("kprobes: failed to populate blacklist: %d\n", err);
pr_err("Please take care of using kprobes.\n");
}
if (kretprobe_blacklist_size) {
@@ -2138,7 +2159,7 @@ static int __init init_kprobes(void)
}
#ifdef CONFIG_DEBUG_FS
static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
@@ -2167,12 +2188,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
(kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}
static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
if (*pos >= KPROBE_TABLE_SIZE)
@@ -2180,12 +2201,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
return pos;
}
static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
static void kprobe_seq_stop(struct seq_file *f, void *v)
{
/* Nothing to do */
}
static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
@@ -2216,7 +2237,7 @@ static const struct seq_operations kprobes_seq_ops = {
.show = show_kprobe_addr
};
static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
static int kprobes_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &kprobes_seq_ops);
}
@@ -2228,7 +2249,47 @@ static const struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
static void __kprobes arm_all_kprobes(void)
/* kprobes/blacklist -- shows which functions can not be probed */

/*
 * seq_file ->start callback: hand back the kprobe_blacklist element at
 * offset *pos, as computed by seq_list_start().
 */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
{
	loff_t index = *pos;

	return seq_list_start(&kprobe_blacklist, index);
}
/*
 * seq_file ->next callback: step from element @v to the following
 * kprobe_blacklist element via seq_list_next(), which is also handed
 * @pos to update.
 */
static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	void *following = seq_list_next(v, &kprobe_blacklist, pos);

	return following;
}
/*
 * seq_file ->show callback: print one blacklist entry as
 * "0x<start>-0x<end>\t<symbol>", where the symbol is looked up from the
 * start address by the %ps format. Always returns 0.
 */
static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
{
	struct kprobe_blacklist_entry *entry;
	void *range_start, *range_end;

	/* @v is the list_head embedded in the entry being shown. */
	entry = list_entry(v, struct kprobe_blacklist_entry, list);
	range_start = (void *)entry->start_addr;
	range_end = (void *)entry->end_addr;

	seq_printf(m, "0x%p-0x%p\t%ps\n", range_start, range_end, range_start);
	return 0;
}
/*
* seq_file iterator for the blacklist file: start/next walk
* kprobe_blacklist and show prints one entry per line. No per-iteration
* state needs releasing, so the no-op kprobe_seq_stop() is shared with
* the kprobe list iterator.
*/
static const struct seq_operations kprobe_blacklist_seq_ops = {
.start = kprobe_blacklist_seq_start,
.next = kprobe_blacklist_seq_next,
.stop = kprobe_seq_stop, /* Reuse void function */
.show = kprobe_blacklist_seq_show,
};
/*
 * open() handler for the blacklist file: attach the blacklist seq_file
 * iterator to @filp and return whatever seq_open() reports.
 */
static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
{
	int ret;

	ret = seq_open(filp, &kprobe_blacklist_seq_ops);
	return ret;
}
/*
* File operations for the blacklist file: open installs the blacklist
* seq_file iterator, while read, llseek and release are the generic
* seq_file helpers.
*/
static const struct file_operations debugfs_kprobe_blacklist_ops = {
.open = kprobe_blacklist_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static void arm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -2256,7 +2317,7 @@ already_enabled:
return;
}
static void __kprobes disarm_all_kprobes(void)
static void disarm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -2340,7 +2401,7 @@ static const struct file_operations fops_kp = {
.llseek = default_llseek,
};
static int __kprobes debugfs_kprobe_init(void)
static int __init debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
unsigned int value = 1;
@@ -2351,19 +2412,24 @@ static int __kprobes debugfs_kprobe_init(void)
file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
if (!file) {
debugfs_remove(dir);
return -ENOMEM;
}
if (!file)
goto error;
file = debugfs_create_file("enabled", 0600, dir,
&value, &fops_kp);
if (!file) {
debugfs_remove(dir);
return -ENOMEM;
}
if (!file)
goto error;
file = debugfs_create_file("blacklist", 0444, dir, NULL,
&debugfs_kprobe_blacklist_ops);
if (!file)
goto error;
return 0;
error:
debugfs_remove(dir);
return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);

View File

@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o

133
kernel/locking/qrwlock.c Normal file
View File

@@ -0,0 +1,133 @@
/*
* Queue read/write lock
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
*
* Authors: Waiman Long <waiman.long@hp.com>
*/
#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <asm/qrwlock.h>
/**
 * rspin_until_writer_unlock - spin until the writer releases the lock
 * @lock: Pointer to queue rwlock structure
 * @cnts: Caller's most recent snapshot of lock->cnts
 *
 * Busy-waits for as long as the writer bits of the counter read
 * _QW_LOCKED, re-reading lock->cnts with acquire semantics on every
 * iteration. This helper only waits; it never modifies the counter
 * itself.
 */
static __always_inline void
rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
	for (;;) {
		/* Done as soon as the writer bits are not _QW_LOCKED. */
		if ((cnts & _QW_WMASK) != _QW_LOCKED)
			return;

		arch_mutex_cpu_relax();
		cnts = smp_load_acquire((u32 *)&lock->cnts);
	}
}
/**
* queue_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
*
* Readers in interrupt context only spin until the writer bits are no
* longer _QW_LOCKED and then return, without taking lock->lock. All other
* readers subtract _QR_BIAS from lock->cnts, serialise on lock->lock, wait
* for the writer bits to clear, add _QR_BIAS back, spin until the writer
* bits are no longer _QW_LOCKED, and finally release lock->lock.
*/
void queue_read_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
* Readers in interrupt context will spin until the lock is
* available without waiting in the queue.
*/
cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}
/*
* NOTE(review): presumably this backs out a reader bias that the
* caller's fast path already added to lock->cnts (the fast path is not
* visible here) -- confirm. The bias is added again further down.
*/
atomic_sub(_QR_BIAS, &lock->cnts);
/*
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->lock);
/*
* At the head of the wait queue now, wait until the writer state
* goes to 0 and then try to increment the reader count and get
* the lock. It is possible that an incoming writer may steal the
* lock in the interim, so it is necessary to check the writer byte
* to make sure that the write lock isn't taken.
*/
while (atomic_read(&lock->cnts) & _QW_WMASK)
arch_mutex_cpu_relax();
/*
* cnts is the counter value from just before this reader's bias was
* added; it is the snapshot used to check for a writer that got in
* first.
*/
cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
/*
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->lock);
}
EXPORT_SYMBOL(queue_read_lock_slowpath);
/**
* queue_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*
* The writer holds lock->lock for the whole slow path. It first attempts a
* direct 0 -> _QW_LOCKED transition of lock->cnts. Failing that, it sets
* _QW_WAITING once no writer bits are set, and then converts the count
* from exactly _QW_WAITING to _QW_LOCKED.
*/
void queue_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
/* Put the writer into the wait queue */
arch_spin_lock(&lock->lock);
/* Try to acquire the lock directly if no reader is present */
if (!atomic_read(&lock->cnts) &&
(atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
/*
* Set the waiting flag to notify readers that a writer is pending,
* or wait for a previous writer to go away.
*/
for (;;) {
cnts = atomic_read(&lock->cnts);
/* Retry unless the writer bits were clear and the cmpxchg won. */
if (!(cnts & _QW_WMASK) &&
(atomic_cmpxchg(&lock->cnts, cnts,
cnts | _QW_WAITING) == cnts))
break;
arch_mutex_cpu_relax();
}
/* When no more readers, set the locked flag */
for (;;) {
cnts = atomic_read(&lock->cnts);
/* The count must be exactly _QW_WAITING: no reader bias left. */
if ((cnts == _QW_WAITING) &&
(atomic_cmpxchg(&lock->cnts, _QW_WAITING,
_QW_LOCKED) == _QW_WAITING))
break;
arch_mutex_cpu_relax();
}
unlock:
/* Leave the wait queue; lock->cnts now carries _QW_LOCKED. */
arch_spin_unlock(&lock->lock);
}
EXPORT_SYMBOL(queue_write_lock_slowpath);

View File

@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
{
return (waiter != NULL);
}
/* Forward the deadlock report for waiter @w to debug_rt_mutex_print_deadlock(). */
static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
{
debug_rt_mutex_print_deadlock(w);
}

View File

@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
owner = *p;
} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
}
/*
* Safe fastpath aware unlock:
* 1) Clear the waiters bit
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*
* Must be entered with lock->wait_lock held; it is released here.
*
* Returns the result of rt_mutex_cmpxchg(lock, owner, NULL). As the second
* scenario below shows, the cmpxchg does not match when a new waiter marked
* the lock after wait_lock was dropped.
* NOTE(review): how callers react to a failed cmpxchg is not visible here.
*/
static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
__releases(lock->wait_lock)
{
/*
* Sample the owner before the waiters bit is cleared and wait_lock is
* dropped; this is the expected old value for the cmpxchg below.
*/
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
raw_spin_unlock(&lock->wait_lock);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
*
* unlock(wait_lock);
* lock(wait_lock);
* cmpxchg(p, owner, 0) == owner
* mark_rt_mutex_waiters(lock);
* acquire(lock);
* or:
*
* unlock(wait_lock);
* lock(wait_lock);
* mark_rt_mutex_waiters(lock);
*
* cmpxchg(p, owner, 0) != owner
* enqueue_waiter();
* unlock(wait_lock);
* lock(wait_lock);
* wake waiter();
* unlock(wait_lock);
* lock(wait_lock);
* acquire(lock);
*/
return rt_mutex_cmpxchg(lock, owner, NULL);
}
#else
# define rt_mutex_cmpxchg(l,c,n) (0)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
}
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*
* Must be entered with lock->wait_lock held; it is released here. The owner
* is cleared while wait_lock is still held, so this version always returns
* true.
*/
static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
__releases(lock->wait_lock)
{
lock->owner = NULL;
raw_spin_unlock(&lock->wait_lock);
return true;
}
#endif
static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
*/
int max_lock_depth = 1024;
/*
 * Return the rt_mutex that @p is blocked on, or NULL when
 * @p->pi_blocked_on is not set.
 */
static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
{
	if (!p->pi_blocked_on)
		return NULL;

	return p->pi_blocked_on->lock;
}
/*
* Adjust the priority chain. Also used for deadlock detection.
* Decreases task's usage by one - may thus free the task.
*
* @task: the task owning the mutex (owner) for which a chain walk is probably
* needed
* @task: the task owning the mutex (owner) for which a chain walk is
* probably needed
* @deadlock_detect: do we have to carry out deadlock detection?
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
* things for a task that has just got its priority adjusted, and
* is waiting on a mutex)
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
* things for a task that has just got its priority adjusted, and
* is waiting on a mutex)
* @next_lock: the mutex on which the owner of @orig_lock was blocked before
* we dropped its pi_lock. Is never dereferenced, only used for
* comparison to detect lock chain changes.
* @orig_waiter: rt_mutex_waiter struct for the task that has just donated
* its priority to the mutex owner (can be NULL in the case
* depicted above or if the top waiter is gone away and we are
* actually deboosting the owner)
* @top_task: the current top waiter
* its priority to the mutex owner (can be NULL in the case
* depicted above or if the top waiter is gone away and we are
* actually deboosting the owner)
* @top_task: the current top waiter
*
* Returns 0 or -EDEADLK.
*/
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int deadlock_detect,
struct rt_mutex *orig_lock,
struct rt_mutex *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
put_task_struct(task);
return deadlock_detect ? -EDEADLK : 0;
return -EDEADLK;
}
retry:
/*
@@ -338,6 +399,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (orig_waiter && !rt_mutex_owner(orig_lock))
goto out_unlock_pi;
/*
* We dropped all locks after taking a refcount on @task, so
* the task might have moved on in the lock chain or even left
* the chain completely and blocks now on an unrelated lock or
* on @orig_lock.
*
* We stored the lock on which @task was blocked in @next_lock,
* so we can detect the chain change.
*/
if (next_lock != waiter->lock)
goto out_unlock_pi;
/*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
@@ -377,7 +450,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
ret = deadlock_detect ? -EDEADLK : 0;
ret = -EDEADLK;
goto out_unlock_pi;
}
@@ -422,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
__rt_mutex_adjust_prio(task);
}
/*
* Check whether the task which owns the current lock is pi
* blocked itself. If yes we store a pointer to the lock for
* the lock chain change detection above. After we dropped
* task->pi_lock next_lock cannot be dereferenced anymore.
*/
next_lock = task_blocked_on_lock(task);
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
top_waiter = rt_mutex_top_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
/*
* We reached the end of the lock chain. Stop right here. No
* point to go back just to figure that out.
*/
if (!next_lock)
goto out_put_task;
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -536,8 +624,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
unsigned long flags;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -548,7 +637,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* which is wrong, as the other waiter is not in a deadlock
* situation.
*/
if (detect_deadlock && owner == task)
if (owner == task)
return -EDEADLK;
raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -569,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (!owner)
return 0;
raw_spin_lock_irqsave(&owner->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
raw_spin_lock_irqsave(&owner->pi_lock, flags);
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
}
else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
} else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
chain_walk = 1;
}
if (!chain_walk)
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
* walk.
*/
if (!chain_walk || !next_lock)
return 0;
/*
@@ -594,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
task);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
next_lock, waiter, task);
raw_spin_lock(&lock->wait_lock);
@@ -605,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
* Remove the top waiter from the current tasks waiter list and wake it up.
* Remove the top waiter from the current tasks pi waiter list and
* wake it up.
*
* Called with lock->wait_lock held.
*/
@@ -626,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
*/
rt_mutex_dequeue_pi(current, waiter);
rt_mutex_set_owner(lock, NULL);
/*
* As we are waking up the top waiter, and the waiter stays
* queued on the lock until it gets the lock, this lock
* obviously has waiters. Just set the bit here and this has
* the added benefit of forcing all new tasks into the
* slow path making sure no task of lower priority than
* the top waiter can steal this lock.
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
/*
* It's safe to dereference waiter as it cannot go away as
* long as we hold lock->wait_lock. The waiter task needs to
* acquire it in order to dequeue the waiter.
*/
wake_up_process(waiter->task);
}
@@ -644,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock = NULL;
unsigned long flags;
int chain_walk = 0;
raw_spin_lock_irqsave(&current->pi_lock, flags);
rt_mutex_dequeue(lock, waiter);
@@ -669,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
}
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
}
if (!chain_walk)
if (!next_lock)
return;
/* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -683,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
raw_spin_lock(&lock->wait_lock);
}
@@ -696,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
void rt_mutex_adjust_pi(struct task_struct *task)
{
struct rt_mutex_waiter *waiter;
struct rt_mutex *next_lock;
unsigned long flags;
raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -706,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
next_lock = waiter->lock;
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
}
/**
@@ -763,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
struct rt_mutex_waiter *w)
{
/*
* If the result is not -EDEADLOCK or the caller requested
* deadlock detection, nothing to do here.
*/
if (res != -EDEADLOCK || detect_deadlock)
return;
/*
* Yell lowdly and stop the task right here.
*/
rt_mutex_print_deadlock(w);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
}
/*
* Slow path lock function:
*/
@@ -802,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(TASK_RUNNING);
if (unlikely(ret))
if (unlikely(ret)) {
remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
}
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -859,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
rt_mutex_deadlock_account_unlock(current);
if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
raw_spin_unlock(&lock->wait_lock);
return;
/*
* We must be careful here if the fast path is enabled. If we
* have no waiters queued we cannot set owner to NULL here
* because of:
*
* foo->lock->owner = NULL;
* rtmutex_lock(foo->lock); <- fast path
* free = atomic_dec_and_test(foo->refcnt);
* rtmutex_unlock(foo->lock); <- fast path
* if (free)
* kfree(foo);
* raw_spin_unlock(foo->lock->wait_lock);
*
* So for the fastpath enabled kernel:
*
* Nothing can set the waiters bit as long as we hold
* lock->wait_lock. So we do the following sequence:
*
* owner = rt_mutex_owner(lock);
* clear_rt_mutex_waiters(lock);
* raw_spin_unlock(&lock->wait_lock);
* if (cmpxchg(&lock->owner, owner, 0) == owner)
* return;
* goto retry;
*
* The fastpath disabled variant is simple as all access to
* lock->owner is serialized by lock->wait_lock:
*
* lock->owner = NULL;
* raw_spin_unlock(&lock->wait_lock);
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock) == true)
return;
/* Relock the rtmutex and try again */
raw_spin_lock(&lock->wait_lock);
}
/*
* The wakeup next waiter path does not suffer from the above
* race. See the comments there.
*/
wakeup_next_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -1112,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return 1;
}
ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
if (ret && !rt_mutex_owner(lock)) {
/*

View File

@@ -24,3 +24,8 @@
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
#define debug_rt_mutex_detect_deadlock(w,d) (d)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
/*
* Report a detected deadlock in this (non-debug) configuration: emits an
* unconditional WARN(). The waiter @w is not used here.
*/
static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
{
WARN(1, "rtmutex deadlock detected\n");
}

View File

@@ -5,11 +5,17 @@
*
* Writer lock-stealing by Alex Shi <alex.shi@intel.com>
* and Michel Lespinasse <walken@google.com>
*
* Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
* and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
*/
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/sched/rt.h>
#include "mcs_spinlock.h"
/*
* Guide to the rw_semaphore's count field for common values.
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
sem->osq = NULL;
#endif
}
EXPORT_SYMBOL(__init_rwsem);
@@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
}
/*
* wait for the read lock to be granted
* Wait for the read lock to be granted
*/
__visible
struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}
/*
 * Attempt to take the write lock on behalf of an already queued writer.
 *
 * @count: the caller's last observed value of sem->count
 * @sem:   the semaphore
 *
 * Nothing is attempted while @count still shows active lockers. Otherwise
 * the count is switched from RWSEM_WAITING_BIAS to RWSEM_ACTIVE_WRITE_BIAS
 * with a cmpxchg; on success RWSEM_WAITING_BIAS is added back when more
 * than one entry is on the wait list.
 *
 * Returns true if the write lock was acquired, false otherwise.
 */
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
	/* Somebody is still actively holding the lock. */
	if (count & RWSEM_ACTIVE_MASK)
		return false;

	/* Cheap check first so the cmpxchg is only issued when it can win. */
	if (sem->count != RWSEM_WAITING_BIAS)
		return false;

	if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
		    RWSEM_ACTIVE_WRITE_BIAS) != RWSEM_WAITING_BIAS)
		return false;

	/* Other waiters remain queued: keep the waiting bias in the count. */
	if (!list_is_singular(&sem->wait_list))
		rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);

	return true;
}
#ifdef CONFIG_SMP
/*
* wait until we successfully acquire the write lock
* Try to acquire write lock before the writer has been put on wait queue.
*/
/*
 * Grab the write lock without being on the wait list.
 *
 * Succeeds only while sem->count is 0 or RWSEM_WAITING_BIAS; in that case
 * RWSEM_ACTIVE_WRITE_BIAS is added with a cmpxchg, retrying with the value
 * the cmpxchg reported whenever it lost a race.
 *
 * Returns true if the write lock was acquired, false otherwise.
 */
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
	long seen = ACCESS_ONCE(sem->count);

	for (;;) {
		long prev;

		if (seen != 0 && seen != RWSEM_WAITING_BIAS)
			return false;

		prev = cmpxchg(&sem->count, seen,
			       seen + RWSEM_ACTIVE_WRITE_BIAS);
		if (prev == seen)
			return true;

		/* Lost the race; re-evaluate with the value we just saw. */
		seen = prev;
	}
}
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
bool on_cpu = true;
if (need_resched())
return 0;
rcu_read_lock();
owner = ACCESS_ONCE(sem->owner);
if (owner)
on_cpu = owner->on_cpu;
rcu_read_unlock();
/*
* If sem->owner is not set, the rwsem owner may have
* just acquired it and not set the owner yet or the rwsem
* has been released.
*/
return on_cpu;
}
/*
* Tell whether @owner is still the recorded owner of @sem and is on a CPU.
*
* Returns false as soon as sem->owner no longer matches @owner, otherwise
* returns owner->on_cpu. As the comment below explains, the dereference
* depends on the caller being inside an rcu_read_lock() section.
*/
static inline bool owner_running(struct rw_semaphore *sem,
struct task_struct *owner)
{
if (sem->owner != owner)
return false;
/*
* Ensure we emit the owner->on_cpu, dereference _after_ checking
* sem->owner still matches owner, if that fails, owner might
* point to free()d memory, if it still matches, the rcu_read_lock()
* ensures the memory stays valid.
*/
barrier();
return owner->on_cpu;
}
/*
 * Busy-wait for as long as @owner keeps owning @sem and stays on a CPU,
 * stopping early when a reschedule is pending.
 *
 * Returns true only if sem->owner is NULL once the spin has ended.
 */
static noinline
bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
{
	rcu_read_lock();
	while (owner_running(sem, owner) && !need_resched())
		arch_mutex_cpu_relax();
	rcu_read_unlock();

	/*
	 * The loop ends on need_resched() or on an owner change, the
	 * latter being a sign of heavy contention. Only an unowned
	 * semaphore counts as success.
	 */
	return !sem->owner;
}
/*
* Try to obtain the write lock by spinning rather than sleeping.
*
* Runs with preemption disabled. Gives up at once when
* rwsem_can_spin_on_owner() declines or osq_lock() fails. Otherwise it
* loops: spin on the current owner (if any), then try
* rwsem_try_write_lock_unqueued(), until the lock is taken or one of the
* bail-out conditions below is hit. The OSQ is released before returning.
*
* Returns true if the write lock was acquired, false otherwise.
*/
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
struct task_struct *owner;
bool taken = false;
preempt_disable();
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!rwsem_can_spin_on_owner(sem))
goto done;
if (!osq_lock(&sem->osq))
goto done;
while (true) {
owner = ACCESS_ONCE(sem->owner);
/* Spinning on the owner ended without the lock becoming free. */
if (owner && !rwsem_spin_on_owner(sem, owner))
break;
/* wait_lock will be acquired if write_lock is obtained */
if (rwsem_try_write_lock_unqueued(sem)) {
taken = true;
break;
}
/*
* When there's no owner, we might have preempted between the
* owner acquiring the lock and setting the owner field. If
* we're an RT task that will live-lock because we won't let
* the owner complete.
*/
if (!owner && (need_resched() || rt_task(current)))
break;
/*
* The cpu_relax() call is a compiler barrier which forces
* everything in this loop to be re-loaded. We don't need
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
arch_mutex_cpu_relax();
}
osq_unlock(&sem->osq);
done:
preempt_enable();
return taken;
}
#else
/*
* !CONFIG_SMP stub: no optimistic spinning is done, so this always
* returns false and the caller proceeds to its slow path.
*/
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
return false;
}
#endif
/*
* Wait until we successfully acquire the write lock
*/
__visible
struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
long count;
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
/* set up my own style of waitqueue */
waiter.task = tsk;
/* undo write bias from down_write operation, stop active locking */
count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
/* do optimistic spinning and steal lock if possible */
if (rwsem_optimistic_spin(sem))
return sem;
/*
* Optimistic spinning failed, proceed to the slowpath
* and block until we can acquire the sem.
*/
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
raw_spin_lock_irq(&sem->wait_lock);
/* account for this before adding a new element to the list */
if (list_empty(&sem->wait_list))
adjustment += RWSEM_WAITING_BIAS;
waiting = false;
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
count = rwsem_atomic_update(adjustment, sem);
if (waiting) {
count = ACCESS_ONCE(sem->count);
/* If there were already threads queued before us and there are no
* active writers, the lock must be read owned; so we try to wake
* any read locks that were queued ahead of us. */
if (count > RWSEM_WAITING_BIAS &&
adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
/*
* If there were already threads queued before us and there are
* no active writers, the lock must be read owned; so we try to
* wake any read locks that were queued ahead of us.
*/
if (count > RWSEM_WAITING_BIAS)
sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
} else
count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
/* wait until we successfully acquire the lock */
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
while (true) {
if (!(count & RWSEM_ACTIVE_MASK)) {
/* Try acquiring the write lock. */
count = RWSEM_ACTIVE_WRITE_BIAS;
if (!list_is_singular(&sem->wait_list))
count += RWSEM_WAITING_BIAS;
if (sem->count == RWSEM_WAITING_BIAS &&
cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
RWSEM_WAITING_BIAS)
break;
}
if (rwsem_try_write_lock(count, sem))
break;
raw_spin_unlock_irq(&sem->wait_lock);
/* Block until there are no active lockers. */
do {
schedule();
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
set_current_state(TASK_UNINTERRUPTIBLE);
} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
tsk->state = TASK_RUNNING;
return sem;
}

View File

@@ -12,6 +12,27 @@
#include <linux/atomic.h>
#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
/* Record the current task as the owner of @sem. */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
sem->owner = current;
}
/* Forget the recorded owner of @sem. */
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
sem->owner = NULL;
}
#else
/* Owner tracking is compiled out in this configuration: nothing to do. */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
}
/* Owner tracking is compiled out in this configuration: nothing to do. */
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
}
#endif
/*
* lock for reading
*/
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
if (ret == 1)
if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
rwsem_set_owner(sem);
}
return ret;
}
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
rwsem_clear_owner(sem);
__up_write(sem);
}
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
rwsem_clear_owner(sem);
__downgrade_write(sem);
}
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
rwsem_set_owner(sem);
}
EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write_nested);

View File

@@ -3020,21 +3020,6 @@ static int do_init_module(struct module *mod)
*/
current->flags &= ~PF_USED_ASYNC;
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
/* Set RO and NX regions for core */
set_section_ro_nx(mod->module_core,
mod->core_text_size,
mod->core_ro_size,
mod->core_size);
/* Set RO and NX regions for init */
set_section_ro_nx(mod->module_init,
mod->init_text_size,
mod->init_ro_size,
mod->init_size);
do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
@@ -3165,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
/* Set RO and NX regions for core */
set_section_ro_nx(mod->module_core,
mod->core_text_size,
mod->core_ro_size,
mod->core_size);
/* Set RO and NX regions for init */
set_section_ro_nx(mod->module_init,
mod->init_text_size,
mod->init_ro_size,
mod->init_size);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
return 0;
out:
mutex_unlock(&module_mutex);
@@ -3190,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
{
struct module *mod;
long err;
char *after_dashes;
err = module_sig_check(info);
if (err)
@@ -3277,10 +3280,15 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto ddebug_cleanup;
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, unknown_module_param_cb);
if (err < 0)
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto bug_cleanup;
} else if (after_dashes) {
pr_warn("%s: parameters '%s' after `--' ignored\n",
mod->name, after_dashes);
}
/* Link in to sysfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);

View File

@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int __kprobes notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
static int notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
}
return ret;
}
NOKPROBE_SYMBOL(notifier_call_chain);
/*
* Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret;
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v)
int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v)
{
return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace __kprobes notify_die(enum die_val val, const char *str,
int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
};
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{

View File

@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
int parse_args(const char *doing,
char *args,
const struct kernel_param *params,
unsigned num,
s16 min_level,
s16 max_level,
int (*unknown)(char *param, char *val, const char *doing))
char *parse_args(const char *doing,
char *args,
const struct kernel_param *params,
unsigned num,
s16 min_level,
s16 max_level,
int (*unknown)(char *param, char *val, const char *doing))
{
char *param, *val;
@@ -198,6 +198,9 @@ int parse_args(const char *doing,
int irq_was_disabled;
args = next_arg(args, &param, &val);
/* Stop at -- */
if (!val && strcmp(param, "--") == 0)
return args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
min_level, max_level, unknown);
@@ -208,22 +211,22 @@ int parse_args(const char *doing,
switch (ret) {
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
return ret;
return ERR_PTR(ret);
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
return ret;
return ERR_PTR(ret);
case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
return ret;
return ERR_PTR(ret);
}
}
/* All parsed OK. */
return 0;
return NULL;
}
/* Lazy bastard, eh? */

View File

@@ -28,12 +28,14 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
#include <trace/events/power.h>
#include "power.h"
static int nocompress;
static int noresume;
static int nohibernate;
static int resume_wait;
static unsigned int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
@@ -61,6 +63,11 @@ bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
/*
 * hibernation_available - tell whether hibernation may be used
 *
 * Returns true unless the nohibernate flag has been set.
 */
bool hibernation_available(void)
{
	return !nohibernate;
}
/**
* hibernation_set_ops - Set the global hibernate operations.
* @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -292,7 +299,9 @@ static int create_image(int platform_mode)
in_suspend = 1;
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
@@ -639,6 +648,11 @@ int hibernate(void)
{
int error;
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
return -EPERM;
}
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -731,7 +745,7 @@ static int software_resume(void)
/*
* If the user said "noresume".. bail out early.
*/
if (noresume)
if (noresume || !hibernation_available())
return 0;
/*
@@ -897,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
int i;
char *start = buf;
if (!hibernation_available())
return sprintf(buf, "[disabled]\n");
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
continue;
@@ -931,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
char *p;
int mode = HIBERNATION_INVALID;
if (!hibernation_available())
return -EPERM;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
@@ -1098,6 +1118,10 @@ static int __init hibernate_setup(char *str)
noresume = 1;
else if (!strncmp(str, "nocompress", 10))
nocompress = 1;
else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
}
return 1;
}
@@ -1122,9 +1146,23 @@ static int __init resumedelay_setup(char *str)
return 1;
}
/*
* Handler for the "nohibernate" boot parameter: sets both noresume and
* nohibernate. @str is not used. Always returns 1.
*/
static int __init nohibernate_setup(char *str)
{
noresume = 1;
nohibernate = 1;
return 1;
}
/*
* Handler for the "kaslr" boot parameter: has exactly the effect of
* nohibernate_setup(), i.e. it sets noresume and nohibernate.
*/
static int __init kaslr_nohibernate_setup(char *str)
{
return nohibernate_setup(str);
}
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
__setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
__setup("kaslr", kaslr_nohibernate_setup);

View File

@@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
s += sprintf(s,"%s ", pm_states[i].label);
#endif
#ifdef CONFIG_HIBERNATION
s += sprintf(s, "%s\n", "disk");
#else
if (hibernation_available())
s += sprintf(s, "disk ");
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
#endif
return (s - buf);
}

View File

@@ -17,6 +17,7 @@
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
/*
* Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
struct task_struct *g, *p;
struct task_struct *curr = current;
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
atomic_dec(&system_freezing_cnt);
pm_freezing = false;
@@ -201,6 +203,7 @@ void thaw_processes(void)
schedule();
printk("done.\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
void thaw_kernel_threads(void)

View File

@@ -177,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Finish;
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
trace_suspend_resume(TPS("freeze_processes"), 0, false);
if (!error)
return 0;
@@ -240,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
* all the devices are suspended.
*/
if (state == PM_SUSPEND_FREEZE) {
trace_suspend_resume(TPS("machine_suspend"), state, true);
freeze_enter();
trace_suspend_resume(TPS("machine_suspend"), state, false);
goto Platform_wake;
}
@@ -256,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (!error) {
*wakeup = pm_wakeup_pending();
if (!(suspend_test(TEST_CORE) || *wakeup)) {
trace_suspend_resume(TPS("machine_suspend"),
state, true);
error = suspend_ops->enter(state);
trace_suspend_resume(TPS("machine_suspend"),
state, false);
events_check_enabled = false;
}
syscore_resume();
@@ -294,7 +302,6 @@ int suspend_devices_and_enter(suspend_state_t state)
if (need_suspend_ops(state) && !suspend_ops)
return -ENOSYS;
trace_machine_suspend(state);
if (need_suspend_ops(state) && suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
@@ -331,7 +338,6 @@ int suspend_devices_and_enter(suspend_state_t state)
else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
freeze_ops->end();
trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
@@ -365,6 +371,7 @@ static int enter_state(suspend_state_t state)
{
int error;
trace_suspend_resume(TPS("suspend_enter"), state, true);
if (state == PM_SUSPEND_FREEZE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
@@ -382,9 +389,11 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
error = suspend_prepare(state);
@@ -394,6 +403,7 @@ static int enter_state(suspend_state_t state)
if (suspend_test(TEST_FREEZER))
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);

View File

@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
struct snapshot_data *data;
int error;
if (!hibernation_available())
return -EPERM;
lock_system_sleep();
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {

View File

@@ -1416,9 +1416,10 @@ static int have_callable_console(void)
/*
* Can we actually use the console at this time on this cpu?
*
* Console drivers may assume that per-cpu resources have been allocated. So
* unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
* call them until this CPU is officially up.
* Console drivers may assume that per-cpu resources have
* been allocated. So unless they're explicitly marked as
* being able to cope (CON_ANYTIME) don't call them until
* this CPU is officially up.
*/
static inline int can_use_console(unsigned int cpu)
{
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu)
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*/
static int console_trylock_for_printk(void)
static int console_trylock_for_printk(unsigned int cpu)
{
unsigned int cpu = smp_processor_id();
if (!console_trylock())
return 0;
/*
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
local_irq_restore(flags);
return 0;
goto out_restore_irqs;
}
zap_locks();
}
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level,
logbuf_cpu = UINT_MAX;
raw_spin_unlock(&logbuf_lock);
lockdep_on();
local_irq_restore(flags);
/* If called from the scheduler, we can not call up(). */
if (in_sched)
return printed_len;
/*
* Disable preemption to avoid being preempted while holding
* console_sem which would prevent anyone from printing to console
*/
preempt_disable();
/*
* Try to acquire and then immediately release the console semaphore.
* The release will print out buffers and wake up /dev/kmsg and syslog()
* users.
*/
if (console_trylock_for_printk())
console_unlock();
preempt_enable();
if (!in_sched) {
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
if (console_trylock_for_printk(this_cpu))
console_unlock();
}
lockdep_on();
out_restore_irqs:
local_irq_restore(flags);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);

View File

@@ -535,7 +535,7 @@ static inline void init_hrtick(void)
__old; \
})
#ifdef TIF_POLLING_NRFLAG
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -546,12 +546,44 @@ static bool set_nr_and_not_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
/*
* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
*
* If this returns true, then the idle task promises to call
* sched_ttwu_pending() and reschedule soon.
*/
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
if (old == val)
break;
val = old;
}
return true;
}
#else
/*
 * Fallback used when the polling-aware variant above is compiled out:
 * mark @p as needing a reschedule and unconditionally return true, so
 * callers that gate smp_send_reschedule() on the result always send it.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
/*
 * Fallback used when the polling-aware variant is compiled out: touches
 * no flags and always returns false, so ttwu_queue_remote() falls back
 * to smp_send_reschedule().
 */
static bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
#endif
/*
@@ -580,6 +612,8 @@ void resched_task(struct task_struct *p)
if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -642,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
/*
* This is safe, as this function is called with the timer
* wheel base lock of (cpu) held. When the CPU is on the way
* to idle and has not yet set rq->curr to idle then it will
* be serialized on the timer wheel base lock and take the new
* timer into account automatically.
*/
if (rq->curr != rq->idle)
return;
/*
* We can set TIF_RESCHED on the idle task of the other CPU
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(rq->idle))
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
@@ -888,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
@@ -1521,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
static void sched_ttwu_pending(void)
void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
unsigned long flags;
raw_spin_lock(&rq->lock);
if (!llist)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1535,7 +1556,7 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
@@ -1581,8 +1602,14 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
smp_send_reschedule(cpu);
struct rq *rq = cpu_rq(cpu);
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
if (!set_nr_if_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -2527,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
void __kprobes preempt_count_add(int val)
void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2553,8 +2580,9 @@ void __kprobes preempt_count_add(int val)
}
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
void __kprobes preempt_count_sub(int val)
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2575,6 +2603,7 @@ void __kprobes preempt_count_sub(int val)
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
#endif
@@ -2857,6 +2886,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
barrier();
} while (need_resched());
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#endif /* CONFIG_PREEMPT */
@@ -4216,7 +4246,7 @@ EXPORT_SYMBOL(yield);
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
*/
bool __sched yield_to(struct task_struct *p, bool preempt)
int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
@@ -5242,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
}
/*
* Even though we initialize ->power to something semi-sane,
* we leave power_orig unset. This allows us to detect if
* Even though we initialize ->capacity to something semi-sane,
* we leave capacity_orig unset. This allows us to detect if
* domain iteration is still funny without causing /0 traps.
*/
if (!group->sgp->power_orig) {
if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5271,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->sgp->power);
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
group->sgc->capacity);
}
group = group->next;
@@ -5331,7 +5360,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
@@ -5362,7 +5391,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
@@ -5487,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
static void free_sched_groups(struct sched_group *sg, int free_sgp)
static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5498,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
kfree(sg->sgp);
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5517,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
kfree(sd->groups->sgp);
kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5728,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
if (atomic_inc_return(&sg->sgp->ref) == 1)
sg->sgc = *per_cpu_ptr(sdd->sgc, i);
if (atomic_inc_return(&sg->sgc->ref) == 1)
build_group_mask(sd, sg);
/*
* Initialize sgp->power such that even if we mess up the
* Initialize sgc->capacity such that even if we mess up the
* domains and no possible iteration will get us here, we won't
* die on a /0 trap.
*/
sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
sg->sgp->power_orig = sg->sgp->power;
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -5776,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -5786,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
* and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -5840,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
* Initialize sched groups cpu_power.
* Initialize sched groups cpu_capacity.
*
* cpu_power indicates the capacity of sched group, which is used while
* cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
* Typically cpu_power for all the groups in a sched domain will be same unless
* there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having
* less cpu_power.
* Typically cpu_capacity for all the groups in a sched domain will be the same
* unless there are asymmetries in the topology. If there are asymmetries,
* the group having more cpu_capacity will pick up more load compared to the
* group having less cpu_capacity.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
@@ -5863,8 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
update_group_capacity(sd, cpu);
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -5955,8 +5984,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
#ifdef CONFIG_NUMA
@@ -5969,7 +5998,7 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
* SD_SHARE_CPUPOWER - describes SMT topologies
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
@@ -5978,7 +6007,7 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUPOWER | \
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
@@ -6024,7 +6053,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
@@ -6046,7 +6075,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
if (sd->flags & SD_SHARE_CPUPOWER) {
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6358,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
sdd->sgp = alloc_percpu(struct sched_group_power *);
if (!sdd->sgp)
sdd->sgc = alloc_percpu(struct sched_group_capacity *);
if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
struct sched_group_power *sgp;
struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6383,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
if (!sgc)
return -ENOMEM;
*per_cpu_ptr(sdd->sgp, j) = sgp;
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6415,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
if (sdd->sgp)
kfree(*per_cpu_ptr(sdd->sgp, j));
if (sdd->sgc)
kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
sdd->sd = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
free_percpu(sdd->sgp);
sdd->sgp = NULL;
free_percpu(sdd->sgc);
sdd->sgc = NULL;
}
}
@@ -6493,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
/* Calculate CPU power for physical packages and nodes */
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
init_sched_groups_power(i, sd);
init_sched_groups_capacity(i, sd);
}
}
@@ -6943,7 +6972,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_power = SCHED_POWER_SCALE;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7669,7 +7698,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css_parent(css));
struct task_group *parent = css_tg(css->parent);
if (parent)
sched_online_group(tg, parent);

View File

@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
return css_ca(css_parent(&ca->css));
return css_ca(ca->css.parent);
}
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);

View File

@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
dl_b->dl_runtime = runtime;
}
extern unsigned long to_ratio(u64 period, u64 runtime);
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);

View File

@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long power_of(int cpu);
static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
unsigned long load;
/* Total compute capacity of CPUs on a node */
unsigned long power;
unsigned long compute_capacity;
/* Approximate capacity in terms of runnable tasks on a node */
unsigned long capacity;
int has_capacity;
unsigned long task_capacity;
int has_free_capacity;
};
/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
ns->power += power_of(cpu);
ns->compute_capacity += capacity_of(cpu);
cpus++;
}
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
* We'll either bail at !has_capacity, or we'll detect a huge imbalance
* and bail there.
* We'll either bail at !has_free_capacity, or we'll detect a huge
* imbalance and bail there.
*/
if (!cpus)
return;
ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
ns->has_capacity = (ns->nr_running < ns->capacity);
ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
ns->task_capacity =
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1195,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
if (env->src_stats.has_capacity &&
!env->dst_stats.has_capacity)
if (env->src_stats.has_free_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
goto balance;
@@ -1213,7 +1214,7 @@ balance:
orig_dst_load = env->dst_stats.load;
orig_src_load = env->src_stats.load;
/* XXX missing power terms */
/* XXX missing capacity terms */
load = task_h_load(env->p);
dst_load = orig_dst_load + load;
src_load = orig_src_load - load;
@@ -1301,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* If the preferred nid has capacity, try to use it. */
if (env.dst_stats.has_capacity)
/* If the preferred nid has free capacity, try to use it. */
if (env.dst_stats.has_free_capacity)
task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
@@ -3225,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can determine whether this is the case by checking
* whether the global deadline has advanced.
* whether the global deadline has advanced. It is valid to compare
* cfs_b->runtime_expires without any locks since we only care about
* exact equality, so a partial write will still work.
*/
if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -3457,21 +3460,21 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
int idle = 1, throttled;
int throttled;
raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
goto out_unlock;
goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
/* idle depends on !throttled (for the case of a large deficit) */
idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
/* if we're going inactive then everything else can be deferred */
if (idle)
goto out_unlock;
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
*/
if (cfs_b->idle && !throttled)
goto out_deactivate;
/*
* if we have relooped after returning idle once, we need to update our
@@ -3485,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
goto out_unlock;
return 0;
}
/* account preceding periods in which throttling occurred */
@@ -3525,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
out_unlock:
if (idle)
cfs_b->timer_active = 0;
raw_spin_unlock(&cfs_b->lock);
return idle;
return 0;
out_deactivate:
cfs_b->timer_active = 0;
return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -3707,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3716,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -3775,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
if (!cfs_rq->runtime_enabled)
continue;
@@ -3784,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = cfs_b->quota;
cfs_rq->runtime_remaining = 1;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
@@ -4041,9 +4044,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
static unsigned long power_of(int cpu)
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_power;
return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long cpu_avg_load_per_task(int cpu)
@@ -4065,7 +4068,7 @@ static void record_wakee(struct task_struct *p)
* about the boundary, really active task won't care
* about the loss.
*/
if (jiffies > current->wakee_flip_decay_ts + HZ) {
if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
@@ -4286,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
this_eff_load *= capacity_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= power_of(this_cpu);
prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
@@ -4367,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
}
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
/* Adjust by relative CPU capacity of the group */
avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
if (local_group) {
this_load = avg_load;
@@ -4948,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
* C_i is the compute capacity of cpu i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
* To achieve this balance we define a measure of imbalance which follows
* directly from (1):
*
* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
*
* We then move tasks around to minimize the imbalance. In the continuous
* function space it is obvious this converges, in the discrete case we get
@@ -5530,13 +5533,13 @@ struct sg_lb_stats {
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_power;
unsigned long group_capacity;
unsigned int sum_nr_running; /* Nr tasks running in the group */
unsigned int group_capacity;
unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5551,7 +5554,7 @@ struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_pwr; /* Total power of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5570,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
.total_pwr = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
},
@@ -5605,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
{
return SCHED_POWER_SCALE;
return SCHED_CAPACITY_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
return default_scale_freq_power(sd, cpu);
return default_scale_capacity(sd, cpu);
}
static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -5625,12 +5628,12 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
return default_scale_smt_power(sd, cpu);
return default_scale_smt_capacity(sd, cpu);
}
static unsigned long scale_rt_power(int cpu)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
@@ -5650,71 +5653,71 @@ static unsigned long scale_rt_power(int cpu)
total = sched_avg_period() + delta;
if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
/* Ensures that capacity won't end up being negative */
available = 0;
} else {
available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
total = SCHED_POWER_SCALE;
if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
total = SCHED_CAPACITY_SCALE;
total >>= SCHED_POWER_SHIFT;
total >>= SCHED_CAPACITY_SHIFT;
return div_u64(available, total);
}
static void update_cpu_power(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long power = SCHED_POWER_SCALE;
unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
if (sched_feat(ARCH_CAPACITY))
capacity *= arch_scale_smt_capacity(sd, cpu);
else
power *= default_scale_smt_power(sd, cpu);
capacity *= default_scale_smt_capacity(sd, cpu);
power >>= SCHED_POWER_SHIFT;
capacity >>= SCHED_CAPACITY_SHIFT;
}
sdg->sgp->power_orig = power;
sdg->sgc->capacity_orig = capacity;
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
if (sched_feat(ARCH_CAPACITY))
capacity *= arch_scale_freq_capacity(sd, cpu);
else
power *= default_scale_freq_power(sd, cpu);
capacity *= default_scale_capacity(sd, cpu);
power >>= SCHED_POWER_SHIFT;
capacity >>= SCHED_CAPACITY_SHIFT;
power *= scale_rt_power(cpu);
power >>= SCHED_POWER_SHIFT;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
if (!power)
power = 1;
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_power = power;
sdg->sgp->power = power;
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
}
void update_group_power(struct sched_domain *sd, int cpu)
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long power, power_orig;
unsigned long capacity, capacity_orig;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
sdg->sgp->next_update = jiffies + interval;
sdg->sgc->next_update = jiffies + interval;
if (!child) {
update_cpu_power(sd, cpu);
update_cpu_capacity(sd, cpu);
return;
}
power_orig = power = 0;
capacity_orig = capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5723,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
struct sched_group_power *sgp;
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
* build_sched_domains() -> init_sched_groups_power()
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
* Use power_of(), which is set irrespective of domains
* in update_cpu_power().
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
* This avoids power/power_orig from being 0 and
* This avoids capacity/capacity_orig from being 0 and
* causing divide-by-zero issues on boot.
*
* Runtime updates will correct power_orig.
* Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
power_orig += power_of(cpu);
power += power_of(cpu);
capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgp = rq->sd->groups->sgp;
power_orig += sgp->power_orig;
power += sgp->power;
sgc = rq->sd->groups->sgc;
capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
/*
@@ -5757,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
power_orig += group->sgp->power_orig;
power += group->sgp->power;
capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
sdg->sgp->power_orig = power_orig;
sdg->sgp->power = power;
sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
@@ -5778,15 +5781,15 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
* Only siblings can have significantly less than SCHED_POWER_SCALE
* Only siblings can have significantly less than SCHED_CAPACITY_SCALE
*/
if (!(sd->flags & SD_SHARE_CPUPOWER))
if (!(sd->flags & SD_SHARE_CPUCAPACITY))
return 0;
/*
* If ~90% of the cpu_power is still there, we're good.
* If ~90% of the cpu_capacity is still there, we're good.
*/
if (group->sgp->power * 32 > group->sgp->power_orig * 29)
if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
return 1;
return 0;
@@ -5823,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
static inline int sg_imbalanced(struct sched_group *group)
{
return group->sgp->imbalance;
return group->sgc->imbalance;
}
/*
* Compute the group capacity.
* Compute the group capacity factor.
*
* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
* Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
* first dividing out the smt factor and computing the actual number of cores
* and limit power unit capacity with that.
* and limit unit capacity with that.
*/
static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
{
unsigned int capacity, smt, cpus;
unsigned int power, power_orig;
unsigned int capacity_factor, smt, cpus;
unsigned int capacity, capacity_orig;
power = group->sgp->power;
power_orig = group->sgp->power_orig;
capacity = group->sgc->capacity;
capacity_orig = group->sgc->capacity_orig;
cpus = group->group_weight;
/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
capacity = cpus / smt; /* cores */
/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
capacity_factor = cpus / smt; /* cores */
capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
capacity_factor = min_t(unsigned,
capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
if (!capacity_factor)
capacity_factor = fix_small_capacity(env->sd, group);
return capacity;
return capacity_factor;
}
/**
@@ -5890,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->idle_cpus++;
}
/* Adjust by relative CPU power of the group */
sgs->group_power = group->sgp->power;
sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
/* Adjust by relative CPU capacity of the group */
sgs->group_capacity = group->sgc->capacity;
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5900,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
sgs->group_imb = sg_imbalanced(group);
sgs->group_capacity = sg_capacity(env, group);
sgs->group_capacity_factor = sg_capacity_factor(env, group);
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
if (sgs->group_capacity_factor > sgs->sum_nr_running)
sgs->group_has_free_capacity = 1;
}
/**
@@ -5927,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
if (sgs->sum_nr_running > sgs->group_capacity)
if (sgs->sum_nr_running > sgs->group_capacity_factor)
return true;
if (sgs->group_imb)
@@ -6007,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs = &sds->local_stat;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgp->next_update))
update_group_power(env->sd, env->dst_cpu);
time_after_eq(jiffies, sg->sgc->next_update))
update_group_capacity(env->sd, env->dst_cpu);
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
@@ -6018,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
* first, lower the sg capacity factor to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks, i.e. nr_running < group_capacity. The
* these excess tasks, i.e. nr_running < group_capacity_factor. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
sds->local_stat.group_has_capacity)
sgs->group_capacity = min(sgs->group_capacity, 1U);
sds->local_stat.group_has_free_capacity)
sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6038,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_pwr += sgs->group_power;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -6085,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
SCHED_POWER_SCALE);
sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
SCHED_CAPACITY_SCALE);
return 1;
}
@@ -6101,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
@@ -6115,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
imbn = 1;
scaled_busy_load_per_task =
(busiest->load_per_task * SCHED_POWER_SCALE) /
busiest->group_power;
(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -6126,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU power used by
* however we may be able to increase total CPU capacity used by
* moving them.
*/
pwr_now += busiest->group_power *
capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
pwr_now += local->group_power *
capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
pwr_now /= SCHED_POWER_SCALE;
capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
if (busiest->avg_load > scaled_busy_load_per_task) {
pwr_move += busiest->group_power *
capa_move += busiest->group_capacity *
min(busiest->load_per_task,
busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
if (busiest->avg_load * busiest->group_power <
busiest->load_per_task * SCHED_POWER_SCALE) {
tmp = (busiest->avg_load * busiest->group_power) /
local->group_power;
if (busiest->avg_load * busiest->group_capacity <
busiest->load_per_task * SCHED_CAPACITY_SCALE) {
tmp = (busiest->avg_load * busiest->group_capacity) /
local->group_capacity;
} else {
tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
local->group_power;
tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
local->group_capacity;
}
pwr_move += local->group_power *
capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
@@ -6187,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
* its cpu_capacity, while calculating max_load..)
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
@@ -6202,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* have to drop below capacity to reach cpu-load equilibrium.
*/
load_above_capacity =
(busiest->sum_nr_running - busiest->group_capacity);
(busiest->sum_nr_running - busiest->group_capacity_factor);
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
load_above_capacity /= busiest->group_power;
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
load_above_capacity /= busiest->group_capacity;
}
/*
@@ -6220,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
max_pull * busiest->group_power,
(sds->avg_load - local->avg_load) * local->group_power
) / SCHED_POWER_SCALE;
max_pull * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -6276,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
@@ -6287,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
!busiest->group_has_capacity)
if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
!busiest->group_has_free_capacity)
goto force_balance;
/*
@@ -6342,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_load = 0, busiest_power = 1;
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long power, capacity, wl;
unsigned long capacity, capacity_factor, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6374,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
power = power_of(i);
capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
capacity = capacity_of(i);
capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
if (!capacity_factor)
capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
/*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu power.
* which is not scaled with the cpu capacity.
*/
if (capacity && rq->nr_running == 1 && wl > env->imbalance)
if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
* the weighted_cpuload() scaled with the cpu power, so that
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
* the weighted_cpuload() scaled with the cpu capacity, so
* that the load can be moved away from the cpu that is
* potentially running at a lower capacity.
*
* Thus we're looking for max(wl_i / power_i), crosswise
* Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
* to: wl_i * power_j > wl_j * power_i; where j is our
* previous maximum.
* to: wl_i * capacity_j > wl_j * capacity_i; where j is
* our previous maximum.
*/
if (wl * busiest_power > busiest_load * power) {
if (wl * busiest_capacity > busiest_load * capacity) {
busiest_load = wl;
busiest_power = power;
busiest_capacity = capacity;
busiest = rq;
}
}
@@ -6609,7 +6614,7 @@ more_balance:
* We failed to reach balance because of affinity.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgp->imbalance;
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
*group_imbalance = 1;
@@ -6996,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void)
goto unlock;
sd->nohz_idle = 0;
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
atomic_inc(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7013,7 +7018,7 @@ void set_cpu_sd_state_idle(void)
goto unlock;
sd->nohz_idle = 1;
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7192,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
rq = cpu_rq(balance_cpu);
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
update_idle_cpu_load(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
/*
* If time for next balance is due,
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
update_idle_cpu_load(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
@@ -7212,7 +7222,7 @@ end:
* of an idle cpu is the system.
* - This rq has more than one task.
* - At any scheduler domain level, this cpu's scheduler group has multiple
* busy cpu's exceeding the group's power.
* busy cpu's exceeding the group's capacity.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
@@ -7220,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_power *sgp;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
if (unlikely(rq->idle_balance))
@@ -7250,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
sgp = sd->groups->sgp;
nr_busy = atomic_read(&sgp->nr_busy_cpus);
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
if (nr_busy > 1)
goto need_kick_unlock;

View File

@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
/*
* Use arch dependent cpu power functions
* Use arch dependent cpu capacity functions
*/
SCHED_FEAT(ARCH_POWER, true)
SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
* Decrement CPU power based on time not spent running tasks
* Decrement CPU capacity based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_POWER, true)
SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them

View File

@@ -12,6 +12,8 @@
#include <trace/events/power.h>
#include "sched.h"
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -67,6 +69,10 @@ void __weak arch_cpu_idle(void)
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
*
* On archs that support TIF_POLLING_NRFLAG, this function is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
static void cpuidle_idle_call(void)
{
@@ -175,10 +181,22 @@ exit_idle:
/*
* Generic idle loop implementation
*
* Called with polling cleared.
*/
static void cpu_idle_loop(void)
{
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
*
* Our polling bit is clear if we're not scheduled (i.e. if
* rq->curr != rq->idle). This means that, if rq->idle has
* the polling bit set, then setting need_resched is
* guaranteed to cause the cpu to reschedule.
*/
__current_set_polling();
tick_nohz_idle_enter();
while (!need_resched()) {
@@ -218,6 +236,17 @@ static void cpu_idle_loop(void)
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
__current_clr_polling();
/*
* We promise to call sched_ttwu_pending and reschedule
* if need_resched is set while polling is set. That
* means that clearing polling needs to be visible
* before doing these things.
*/
smp_mb__after_atomic();
sched_ttwu_pending();
schedule_preempt_disabled();
}
}
@@ -239,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
__current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}

View File

@@ -918,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
@@ -943,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);

View File

@@ -567,7 +567,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
unsigned long cpu_power;
unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
extern void sched_ttwu_pending(void);
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -728,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
struct sched_group_capacity {
atomic_t ref;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU.
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned int power, power_orig;
unsigned int capacity, capacity_orig;
unsigned long next_update;
int imbalance; /* XXX unrelated to power but shared group state */
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -750,7 +752,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
struct sched_group_power *sgp;
struct sched_group_capacity *sgc;
/*
* The CPUs this group covers.
@@ -773,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
*/
static inline struct cpumask *sched_group_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgp->cpumask);
return to_cpumask(sg->sgc->cpumask);
}
/**
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
#else
static inline void sched_ttwu_pending(void) { }
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1167,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
extern void update_group_power(struct sched_domain *sd, int cpu);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);

View File

@@ -54,8 +54,7 @@
struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
unsigned short len; /* Instruction count */
struct sock_filter_int insnsi[];
struct sk_filter *prog;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
u32 k = ftest->k;
switch (code) {
case BPF_S_LD_W_ABS:
case BPF_LD | BPF_W | BPF_ABS:
ftest->code = BPF_LDX | BPF_W | BPF_ABS;
/* 32-bit aligned and not out of bounds. */
if (k >= sizeof(struct seccomp_data) || k & 3)
return -EINVAL;
continue;
case BPF_S_LD_W_LEN:
case BPF_LD | BPF_W | BPF_LEN:
ftest->code = BPF_LD | BPF_IMM;
ftest->k = sizeof(struct seccomp_data);
continue;
case BPF_S_LDX_W_LEN:
case BPF_LDX | BPF_W | BPF_LEN:
ftest->code = BPF_LDX | BPF_IMM;
ftest->k = sizeof(struct seccomp_data);
continue;
/* Explicitly include allowed calls. */
case BPF_S_RET_K:
case BPF_S_RET_A:
case BPF_S_ALU_ADD_K:
case BPF_S_ALU_ADD_X:
case BPF_S_ALU_SUB_K:
case BPF_S_ALU_SUB_X:
case BPF_S_ALU_MUL_K:
case BPF_S_ALU_MUL_X:
case BPF_S_ALU_DIV_X:
case BPF_S_ALU_AND_K:
case BPF_S_ALU_AND_X:
case BPF_S_ALU_OR_K:
case BPF_S_ALU_OR_X:
case BPF_S_ALU_XOR_K:
case BPF_S_ALU_XOR_X:
case BPF_S_ALU_LSH_K:
case BPF_S_ALU_LSH_X:
case BPF_S_ALU_RSH_K:
case BPF_S_ALU_RSH_X:
case BPF_S_ALU_NEG:
case BPF_S_LD_IMM:
case BPF_S_LDX_IMM:
case BPF_S_MISC_TAX:
case BPF_S_MISC_TXA:
case BPF_S_ALU_DIV_K:
case BPF_S_LD_MEM:
case BPF_S_LDX_MEM:
case BPF_S_ST:
case BPF_S_STX:
case BPF_S_JMP_JA:
case BPF_S_JMP_JEQ_K:
case BPF_S_JMP_JEQ_X:
case BPF_S_JMP_JGE_K:
case BPF_S_JMP_JGE_X:
case BPF_S_JMP_JGT_K:
case BPF_S_JMP_JGT_X:
case BPF_S_JMP_JSET_K:
case BPF_S_JMP_JSET_X:
sk_decode_filter(ftest, ftest);
case BPF_RET | BPF_K:
case BPF_RET | BPF_A:
case BPF_ALU | BPF_ADD | BPF_K:
case BPF_ALU | BPF_ADD | BPF_X:
case BPF_ALU | BPF_SUB | BPF_K:
case BPF_ALU | BPF_SUB | BPF_X:
case BPF_ALU | BPF_MUL | BPF_K:
case BPF_ALU | BPF_MUL | BPF_X:
case BPF_ALU | BPF_DIV | BPF_K:
case BPF_ALU | BPF_DIV | BPF_X:
case BPF_ALU | BPF_AND | BPF_K:
case BPF_ALU | BPF_AND | BPF_X:
case BPF_ALU | BPF_OR | BPF_K:
case BPF_ALU | BPF_OR | BPF_X:
case BPF_ALU | BPF_XOR | BPF_K:
case BPF_ALU | BPF_XOR | BPF_X:
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU | BPF_LSH | BPF_X:
case BPF_ALU | BPF_RSH | BPF_K:
case BPF_ALU | BPF_RSH | BPF_X:
case BPF_ALU | BPF_NEG:
case BPF_LD | BPF_IMM:
case BPF_LDX | BPF_IMM:
case BPF_MISC | BPF_TAX:
case BPF_MISC | BPF_TXA:
case BPF_LD | BPF_MEM:
case BPF_LDX | BPF_MEM:
case BPF_ST:
case BPF_STX:
case BPF_JMP | BPF_JA:
case BPF_JMP | BPF_JEQ | BPF_K:
case BPF_JMP | BPF_JEQ | BPF_X:
case BPF_JMP | BPF_JGE | BPF_K:
case BPF_JMP | BPF_JGE | BPF_X:
case BPF_JMP | BPF_JGT | BPF_K:
case BPF_JMP | BPF_JGT | BPF_X:
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP | BPF_JSET | BPF_X:
continue;
default:
return -EINVAL;
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)
* value always takes priority (ignoring the DATA).
*/
for (f = current->seccomp.filter; f; f = f->prev) {
u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);
u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
@@ -215,7 +214,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
return -EINVAL;
for (filter = current->seccomp.filter; filter; filter = filter->prev)
total_insns += filter->len + 4; /* include a 4 instr penalty */
total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
if (total_insns > MAX_INSNS_PER_PATH)
return -ENOMEM;
@@ -256,19 +255,25 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
/* Allocate a new seccomp_filter */
ret = -ENOMEM;
filter = kzalloc(sizeof(struct seccomp_filter) +
sizeof(struct sock_filter_int) * new_len,
filter = kzalloc(sizeof(struct seccomp_filter),
GFP_KERNEL|__GFP_NOWARN);
if (!filter)
goto free_prog;
ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
if (ret)
filter->prog = kzalloc(sk_filter_size(new_len),
GFP_KERNEL|__GFP_NOWARN);
if (!filter->prog)
goto free_filter;
ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
if (ret)
goto free_filter_prog;
kfree(fp);
atomic_set(&filter->usage, 1);
filter->len = new_len;
filter->prog->len = new_len;
sk_filter_select_runtime(filter->prog);
/*
* If there is an existing filter, make it the prev and don't drop its
@@ -278,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
current->seccomp.filter = filter;
return 0;
free_filter_prog:
kfree(filter->prog);
free_filter:
kfree(filter);
free_prog:
@@ -330,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
sk_filter_free(freeme->prog);
kfree(freeme);
}
}

View File

@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
/* Fall-through to the CPU_DEAD[_FROZEN] case. */
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
free_percpu(cfd->csd);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
* The IPIs for the smp-call-function callbacks queued by other
* CPUs might arrive late, either due to hardware latencies or
* because this CPU disabled interrupts (inside stop-machine)
* before the IPIs were sent. So flush out any pending callbacks
* explicitly (without waiting for the IPIs to arrive), to
* ensure that the outgoing CPU doesn't go offline with work
* still pending.
*/
flush_smp_call_function_queue(false);
break;
#endif
};
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
return 0;
}
/*
* Invoked by arch to handle an IPI for call function single. Must be
* called from the arch with interrupts disabled.
/**
* generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
*
* Invoked by arch to handle an IPI for call function single.
* Must be called with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
flush_smp_call_function_queue(true);
}
/**
* flush_smp_call_function_queue - Flush pending smp-call-function callbacks
*
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'.
*
* Flush any pending smp-call-function callbacks queued on this CPU. This is
* invoked by the generic IPI handler, as well as by a CPU about to go offline,
* to ensure that all pending IPI callbacks are run before it goes completely
* offline.
*
* Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled.
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
struct llist_head *head;
struct llist_node *entry;
struct call_single_data *csd, *csd_next;
static bool warned;
entry = llist_del_all(&__get_cpu_var(call_single_queue));
WARN_ON(!irqs_disabled());
head = &__get_cpu_var(call_single_queue);
entry = llist_del_all(head);
entry = llist_reverse_order(entry);
/*
* Shouldn't receive this interrupt on a cpu that is not yet online.
*/
if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
/* There shouldn't be any pending callbacks on an offline CPU. */
if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
!warned && !llist_empty(head))) {
warned = true;
WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

View File

@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_SPARC
#endif
#ifdef CONFIG_SPARC64
extern int sysctl_tsb_ratio;
#endif
#ifdef __hppa__
extern int pwrsw_enabled;
#endif
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
#ifdef CONFIG_SMP
{
.procname = "softlockup_all_cpu_backtrace",
.data = &sysctl_softlockup_all_cpu_backtrace,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
#endif /* CONFIG_SMP */
{
.procname = "nmi_watchdog",
.data = &watchdog_user_enabled,
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
.extra1 = &min_percpu_pagelist_fract,
.extra1 = &zero,
},
#ifdef CONFIG_MMU
{
@@ -2568,11 +2574,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
unsigned long *bitmap = (unsigned long *) table->data;
unsigned long *bitmap = *(unsigned long **) table->data;
unsigned long *tmp_bitmap = NULL;
char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
if (!bitmap_len || !left || (*ppos && !write)) {
if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
*lenp = 0;
return 0;
}

View File

@@ -535,6 +535,36 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
config TRACEPOINT_BENCHMARK
bool "Add tracepoint that benchmarks tracepoints"
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
goes into an infinite loop (calling cond_resched() to let other tasks
run), and calls the tracepoint. Each iteration will record the time
it took to write to the tracepoint and the next iteration that
data will be passed to the tracepoint itself. That is, the tracepoint
will report the time it took to do the previous tracepoint.
The string written to the tracepoint is a static string of 128 bytes
to keep the time the same. The initial string is simply a write of
"START". The second string records the cold cache time of the first
write which is not added to the rest of the calculations.
As it is a tight loop, it benchmarks as hot cache. That's fine because
we care most about hot paths that are probably in cache already.
An example of the output:
START
first=3672 [COLD CACHED]
last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
config RING_BUFFER_BENCHMARK
tristate "Ring buffer benchmark stress tester"
depends on RING_BUFFER

View File

@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -62,4 +63,6 @@ endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
libftrace-y := ftrace.o

View File

@@ -62,7 +62,7 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_REGEX_LOCK(opsname) \
@@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
@@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void)
return cnt;
}
/*
* Invoke every ftrace_ops found on ftrace_global_list for one traced call.
* The walk is bracketed by trace_test_and_set_recursion() and
* trace_clear_recursion(); when no recursion bit can be obtained (bit < 0)
* the function returns without calling any callback.
*/
static void
ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
int bit;
bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
if (bit < 0)
return;
/* The incoming @op argument is reused as the list cursor. */
do_for_each_ftrace_op(op, ftrace_global_list) {
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
trace_clear_recursion(bit);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
@@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)
return 0;
}
/*
* Recompute global_ops.func, global_ops.private and the
* FTRACE_OPS_FL_RECURSION_SAFE bit of global_ops.flags from the current
* state of ftrace_global_list and ftrace_pids.
*/
static void update_global_ops(void)
{
ftrace_func_t func = ftrace_global_list_func;
void *private = NULL;
/* The list has its own recursion protection. */
global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
/*
* If there's only one function registered, then call that
* function directly. Otherwise, we need to iterate over the
* registered callers.
*/
if (ftrace_global_list == &ftrace_list_end ||
ftrace_global_list->next == &ftrace_list_end) {
func = ftrace_global_list->func;
private = ftrace_global_list->private;
/*
* As we are calling the function directly, the
* RECURSION_SAFE flag set above only holds if that
* function provides its own recursion protection;
* clear it again when it does not.
*/
if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
}
/* If we filter on pids, update to use the pid function */
if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
}
global_ops.func = func;
global_ops.private = private;
}
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -301,8 +246,6 @@ static void update_ftrace_function(void)
{
ftrace_func_t func;
update_global_ops();
/*
* If we are at the end of the list and this ops is
* recursion safe and not dynamic and the arch supports passing ops,
@@ -314,10 +257,7 @@ static void update_ftrace_function(void)
(ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
!FTRACE_FORCE_LIST_FUNC)) {
/* Set the ftrace_ops that the arch callback uses */
if (ftrace_ops_list == &global_ops)
set_function_trace_op = ftrace_global_list;
else
set_function_trace_op = ftrace_ops_list;
set_function_trace_op = ftrace_ops_list;
func = ftrace_ops_list->func;
} else {
/* Just use the default ftrace_ops */
@@ -373,6 +313,11 @@ static void update_ftrace_function(void)
ftrace_trace_function = func;
}
/**
 * using_ftrace_ops_list_func - tell whether the list dispatcher is installed
 *
 * Returns 1 when ftrace_trace_function currently points at
 * ftrace_ops_list_func, 0 otherwise.
 */
int using_ftrace_ops_list_func(void)
{
	if (ftrace_trace_function != ftrace_ops_list_func)
		return 0;

	return 1;
}
static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
{
ops->next = *list;
@@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (ops->flags & FTRACE_OPS_FL_DELETED)
return -EINVAL;
if (FTRACE_WARN_ON(ops == &global_ops))
return -EINVAL;
if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
return -EBUSY;
/* We don't support both control and global flags set. */
if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
return -EINVAL;
#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
/*
* If the ftrace_ops specifies SAVE_REGS, then it only can be used
@@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
ops->flags |= FTRACE_OPS_FL_ENABLED;
} else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
if (ops->flags & FTRACE_OPS_FL_CONTROL) {
if (control_ops_alloc(ops))
return -ENOMEM;
add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
@@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
if (FTRACE_WARN_ON(ops == &global_ops))
return -EINVAL;
if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
ret = remove_ftrace_list_ops(&ftrace_global_list,
&global_ops, ops);
if (!ret)
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
} else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
if (ops->flags & FTRACE_OPS_FL_CONTROL) {
ret = remove_ftrace_list_ops(&ftrace_control_list,
&control_ops, ops);
} else
@@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
local_irq_save(flags);
stat = &__get_cpu_var(ftrace_profile_stats);
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
@@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
local_irq_save(flags);
stat = &__get_cpu_var(ftrace_profile_stats);
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
@@ -1178,7 +1105,7 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
static bool ftrace_hash_empty(struct ftrace_hash *hash)
static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
{
return !hash || !hash->count;
}
@@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
/*
* If filter_hash is set, we want to match all functions
* that are in the hash but not in the other hash.
*
* If filter_hash is not set, then we are decrementing.
* That means we match anything that is in the hash
* and also in the other_hash. That is, we need to turn
* off functions in the other hash because they are disabled
* by this hash.
*/
if (filter_hash && in_hash && !in_other_hash)
match = 1;
@@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
/*
* If this record is being updated from a nop, then
* return UPDATE_MAKE_CALL.
* Otherwise, if the EN flag is set, then return
* UPDATE_MODIFY_CALL_REGS to tell the caller to convert
* from the non-save regs, to a save regs function.
* Otherwise,
* return UPDATE_MODIFY_CALL to tell the caller to convert
* from the save regs, to a non-save regs function.
* from the save regs, to a non-save regs function or
* vice versa.
*/
if (flag & FTRACE_FL_ENABLED)
return FTRACE_UPDATE_MAKE_CALL;
else if (rec->flags & FTRACE_FL_REGS_EN)
return FTRACE_UPDATE_MODIFY_CALL_REGS;
else
return FTRACE_UPDATE_MODIFY_CALL;
return FTRACE_UPDATE_MODIFY_CALL;
}
if (update) {
@@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
return ftrace_check_record(rec, enable, 0);
}
/**
 * ftrace_get_addr_new - Get the call address to set to
 * @rec:  The ftrace record descriptor
 *
 * The FTRACE_FL_REGS flag describes the state the record is being moved
 * to: when it is set the record wants a callback that saves all regs,
 * when it is not set the record wants the normal callback.
 *
 * Returns the address of the trampoline to set to
 */
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
	return (rec->flags & FTRACE_FL_REGS) ?
		(unsigned long)FTRACE_REGS_ADDR :
		(unsigned long)FTRACE_ADDR;
}
/**
 * ftrace_get_addr_curr - Get the call address that is already there
 * @rec:  The ftrace record descriptor
 *
 * FTRACE_FL_REGS_EN mirrors what the call site points at right now: it is
 * set while the record calls the trampoline that saves all the regs. The
 * '_EN' flag is therefore the current state, as opposed to the requested
 * one.
 *
 * Returns the address of the trampoline that is currently being called
 */
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
	return (rec->flags & FTRACE_FL_REGS_EN) ?
		(unsigned long)FTRACE_REGS_ADDR :
		(unsigned long)FTRACE_ADDR;
}
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
@@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
unsigned long ftrace_addr;
int ret;
ret = ftrace_update_record(rec, enable);
ftrace_addr = ftrace_get_addr_new(rec);
if (rec->flags & FTRACE_FL_REGS)
ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
else
ftrace_addr = (unsigned long)FTRACE_ADDR;
/* This needs to be done before we call ftrace_update_record */
ftrace_old_addr = ftrace_get_addr_curr(rec);
ret = ftrace_update_record(rec, enable);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
case FTRACE_UPDATE_MAKE_NOP:
return ftrace_make_nop(NULL, rec, ftrace_addr);
case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
if (rec->flags & FTRACE_FL_REGS)
ftrace_old_addr = (unsigned long)FTRACE_ADDR;
else
ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command)
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
bool hash_enable = true;
int ret;
if (unlikely(ftrace_disabled))
@@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
/* ops marked global share the filter hashes */
if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
ops = &global_ops;
/* Don't update hash if global is already set */
if (global_start_up)
hash_enable = false;
global_start_up++;
}
ops->flags |= FTRACE_OPS_FL_ENABLED;
if (hash_enable)
ftrace_hash_rec_enable(ops, 1);
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
@@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
static int ftrace_shutdown(struct ftrace_ops *ops, int command)
{
bool hash_disable = true;
int ret;
if (unlikely(ftrace_disabled))
@@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
ops = &global_ops;
global_start_up--;
WARN_ON_ONCE(global_start_up < 0);
/* Don't update hash if global still has users */
if (global_start_up) {
WARN_ON_ONCE(!ftrace_start_up);
hash_disable = false;
}
}
ftrace_hash_rec_disable(ops, 1);
if (hash_disable)
ftrace_hash_rec_disable(ops, 1);
if (ops != &global_ops || !global_start_up)
if (!global_start_up)
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
command |= FTRACE_UPDATE_CALLS;
@@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
struct ftrace_hash *hash;
int ret;
/* All global ops uses the global ops filters */
if (ops->flags & FTRACE_OPS_FL_GLOBAL)
ops = &global_ops;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
}
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
/**
* ftrace_set_filter - set a function to filter on in ftrace
* @ops - the ops to set the filter with
* ftrace_set_global_filter - set a function to filter on with global tracers
* @buf - the string that holds the function filter text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
/**
* ftrace_set_notrace - set a function to not trace in ftrace
* @ops - the ops to set the notrace filter with
* ftrace_set_global_notrace - set a function to not trace with global tracers
* @buf - the string that holds the function notrace text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -4443,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
#endif /* CONFIG_DYNAMIC_FTRACE */
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
tr->ops->private = tr;
}
/*
 * Install @func as the callback of the ftrace_ops owned by @tr and store
 * @tr in the ops' private pointer. For a trace array flagged
 * TRACE_ARRAY_FL_GLOBAL the callback is routed through ftrace_pid_func
 * whenever ftrace_pids is not empty.
 */
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
{
	struct ftrace_ops *ops = tr->ops;

	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
		/* Warn (and name the leftover) if the ops were not stubbed. */
		if (WARN_ON(ops->func != ftrace_stub))
			printk("ftrace ops had %pS for function\n",
			       ops->func);

		/*
		 * Only the top level instance does pid tracing: if we
		 * filter on pids, update to use the pid function.
		 */
		if (!list_empty(&ftrace_pids)) {
			set_ftrace_pid_function(func);
			func = ftrace_pid_func;
		}
	}

	ops->func = func;
	ops->private = tr;
}
/* Point the callback of the trace array's ftrace_ops back at ftrace_stub. */
void ftrace_reset_array_ops(struct trace_array *tr)
{
tr->ops->func = ftrace_stub;
}
static void
ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
@@ -4501,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
*/
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip, regs))
if (ftrace_ops_test(op, ip, regs)) {
if (WARN_ON(!op->func)) {
function_trace_stop = 1;
printk("op=%p %pS\n", op, op);
goto out;
}
op->func(ip, parent_ip, op, regs);
}
} while_for_each_ftrace_op(op);
out:
preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -4908,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int ftrace_graph_active;
static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -5054,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
return NOTIFY_DONE;
}
/*
* Just a placeholder ftrace_ops for function graph: its callback is
* ftrace_stub and it is flagged STUB, GLOBAL and RECURSION_SAFE.
*/
static struct ftrace_ops fgraph_ops __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
FTRACE_OPS_FL_RECURSION_SAFE,
};
static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
{
if (!ftrace_ops_test(&global_ops, trace->func, NULL))
@@ -5085,6 +5043,10 @@ static void update_function_graph_func(void)
ftrace_graph_entry = ftrace_graph_entry_test;
}
/* Notifier block whose callback is ftrace_suspend_notifier_call(). */
static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -5098,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
goto out;
}
ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
register_pm_notifier(&ftrace_suspend_notifier);
ftrace_graph_active++;
@@ -5120,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_entry = ftrace_graph_entry_test;
update_function_graph_func();
ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
/* Function graph doesn't use the .func field of global_ops */
global_ops.flags |= FTRACE_OPS_FL_STUB;
ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -5138,7 +5102,8 @@ void unregister_ftrace_graph(void)
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
global_ops.flags &= ~FTRACE_OPS_FL_STUB;
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);

View File

@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
schedule();
finish_wait(&work->waiters, &wait);
return 0;
}
/**

View File

@@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
}
EXPORT_SYMBOL_GPL(call_filter_check_discard);
cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr)
return 0;
}
void free_snapshot(struct trace_array *tr)
static void free_snapshot(struct trace_array *tr)
{
/*
* We don't free the ring buffer. instead, resize it because
@@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
return cnt;
}
/*
* ftrace_max_lock is used to protect the swapping of buffers
* when taking a max snapshot. The buffers themselves are
* protected by per_cpu spinlocks. But the action of the swap
* needs its own lock.
*
* This is defined as an arch_spinlock_t in order to help
* with performance when lockdep debugging is enabled.
*
* It is also used in other places outside the update_max_tr
* so it needs to be defined outside of the
* CONFIG_TRACER_MAX_TRACE.
*/
static arch_spinlock_t ftrace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
unsigned long __read_mostly tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long __read_mostly tracing_max_latency;
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
@@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_buf->cpu = cpu;
max_buf->time_start = data->preempt_timestamp;
max_data->saved_latency = tracing_max_latency;
max_data->saved_latency = tr->max_latency;
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
@@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
}
arch_spin_lock(&ftrace_max_lock);
arch_spin_lock(&tr->max_lock);
buf = tr->trace_buffer.buffer;
tr->trace_buffer.buffer = tr->max_buffer.buffer;
tr->max_buffer.buffer = buf;
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&ftrace_max_lock);
arch_spin_unlock(&tr->max_lock);
}
/**
@@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
}
arch_spin_lock(&ftrace_max_lock);
arch_spin_lock(&tr->max_lock);
ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
@@ -1099,17 +1081,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&ftrace_max_lock);
arch_spin_unlock(&tr->max_lock);
}
#endif /* CONFIG_TRACER_MAX_TRACE */
static void default_wait_pipe(struct trace_iterator *iter)
static int wait_on_pipe(struct trace_iterator *iter)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return;
return 0;
ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type)
else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
if (!type->wait_pipe)
type->wait_pipe = default_wait_pipe;
ret = run_tracer_selftest(type);
if (ret < 0)
@@ -1305,22 +1285,71 @@ void tracing_reset_all_online_cpus(void)
}
}
#define SAVED_CMDLINES 128
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
static int cmdline_idx;
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/*
* Bookkeeping for the pid <-> comm cache. The two maps reference each
* other: a pid maps to a slot number, and a slot maps back to the pid
* stored in it.
*/
struct saved_cmdlines_buffer {
/* indexed by pid: slot number for that pid, or NO_CMDLINE_MAP */
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
/* indexed by slot: pid stored in that slot, or NO_CMDLINE_MAP */
unsigned *map_cmdline_to_pid;
/* number of slots in map_cmdline_to_pid and saved_cmdlines */
unsigned cmdline_num;
/* slot most recently assigned by trace_save_cmdline() */
int cmdline_idx;
/* cmdline_num entries of TASK_COMM_LEN bytes each, stored back to back */
char *saved_cmdlines;
};
static struct saved_cmdlines_buffer *savedcmd;
/* temporary disable recording */
static atomic_t trace_record_cmdline_disabled __read_mostly;
static void trace_init_cmdlines(void)
static inline char *get_saved_cmdlines(int idx)
{
memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
cmdline_idx = 0;
return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
}
/* Copy TASK_COMM_LEN bytes of @cmdline into saved cmdline slot @idx. */
static inline void set_cmdline(int idx, const char *cmdline)
{
	char *slot = get_saved_cmdlines(idx);

	memcpy(slot, cmdline, TASK_COMM_LEN);
}
static int allocate_cmdlines_buffer(unsigned int val,
struct saved_cmdlines_buffer *s)
{
s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
GFP_KERNEL);
if (!s->map_cmdline_to_pid)
return -ENOMEM;
s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
if (!s->saved_cmdlines) {
kfree(s->map_cmdline_to_pid);
return -ENOMEM;
}
s->cmdline_idx = 0;
s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
return 0;
}
/*
 * Create the global savedcmd buffer with SAVED_CMDLINES_DEFAULT slots.
 * Returns 0 on success; on any allocation failure savedcmd is left NULL
 * and -ENOMEM is returned.
 */
static int trace_create_savedcmd(void)
{
	savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
	if (!savedcmd)
		return -ENOMEM;

	if (allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd) >= 0)
		return 0;

	/* The slot arrays could not be set up: drop the container too. */
	kfree(savedcmd);
	savedcmd = NULL;

	return -ENOMEM;
}
int is_tracing_stopped(void)
@@ -1353,7 +1382,7 @@ void tracing_start(void)
}
/* Prevent the buffers from switching */
arch_spin_lock(&ftrace_max_lock);
arch_spin_lock(&global_trace.max_lock);
buffer = global_trace.trace_buffer.buffer;
if (buffer)
@@ -1365,9 +1394,8 @@ void tracing_start(void)
ring_buffer_record_enable(buffer);
#endif
arch_spin_unlock(&ftrace_max_lock);
arch_spin_unlock(&global_trace.max_lock);
ftrace_start();
out:
raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
}
@@ -1414,13 +1442,12 @@ void tracing_stop(void)
struct ring_buffer *buffer;
unsigned long flags;
ftrace_stop();
raw_spin_lock_irqsave(&global_trace.start_lock, flags);
if (global_trace.stop_count++)
goto out;
/* Prevent the buffers from switching */
arch_spin_lock(&ftrace_max_lock);
arch_spin_lock(&global_trace.max_lock);
buffer = global_trace.trace_buffer.buffer;
if (buffer)
@@ -1432,7 +1459,7 @@ void tracing_stop(void)
ring_buffer_record_disable(buffer);
#endif
arch_spin_unlock(&ftrace_max_lock);
arch_spin_unlock(&global_trace.max_lock);
out:
raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
@@ -1461,12 +1488,12 @@ static void tracing_stop_tr(struct trace_array *tr)
void trace_stop_cmdline_recording(void);
static void trace_save_cmdline(struct task_struct *tsk)
static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return;
return 0;
/*
* It's not the end of the world if we don't get
@@ -1475,11 +1502,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
* so if we miss here, then better luck next time.
*/
if (!arch_spin_trylock(&trace_cmdline_lock))
return;
return 0;
idx = map_pid_to_cmdline[tsk->pid];
idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
idx = (cmdline_idx + 1) % SAVED_CMDLINES;
idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
/*
* Check whether the cmdline buffer at idx has a pid
@@ -1487,22 +1514,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
* need to clear the map_pid_to_cmdline. Otherwise we
* would read the new comm for the old pid.
*/
pid = map_cmdline_to_pid[idx];
pid = savedcmd->map_cmdline_to_pid[idx];
if (pid != NO_CMDLINE_MAP)
map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
map_cmdline_to_pid[idx] = tsk->pid;
map_pid_to_cmdline[tsk->pid] = idx;
savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
cmdline_idx = idx;
savedcmd->cmdline_idx = idx;
}
memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
return 1;
}
void trace_find_cmdline(int pid, char comm[])
static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
@@ -1521,13 +1550,19 @@ void trace_find_cmdline(int pid, char comm[])
return;
}
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
map = map_pid_to_cmdline[pid];
map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
strcpy(comm, saved_cmdlines[map]);
strcpy(comm, get_saved_cmdlines(map));
else
strcpy(comm, "<...>");
}
void trace_find_cmdline(int pid, char comm[])
{
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
__trace_find_cmdline(pid, comm);
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
@@ -1541,9 +1576,8 @@ void tracing_record_cmdline(struct task_struct *tsk)
if (!__this_cpu_read(trace_cmdline_save))
return;
__this_cpu_write(trace_cmdline_save, false);
trace_save_cmdline(tsk);
if (trace_save_cmdline(tsk))
__this_cpu_write(trace_cmdline_save, false);
}
void
@@ -1746,7 +1780,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
barrier();
if (use_stack == 1) {
trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
trace.entries = this_cpu_ptr(ftrace_stack.calls);
trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
if (regs)
@@ -1995,7 +2029,21 @@ void trace_printk_init_buffers(void)
if (alloc_percpu_trace_buffer())
return;
pr_info("ftrace: Allocated trace_printk buffers\n");
/* trace_printk() is for debug use only. Don't use it in production. */
pr_warning("\n**********************************************************\n");
pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warning("** **\n");
pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
pr_warning("** **\n");
pr_warning("** This means that this is a DEBUG kernel and it is **\n");
pr_warning("** unsafe for produciton use. **\n");
pr_warning("** **\n");
pr_warning("** If you see this message and you are not debugging **\n");
pr_warning("** the kernel, report this immediately to your vendor! **\n");
pr_warning("** **\n");
pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warning("**********************************************************\n");
/* Expand the buffers to set size */
tracing_update_buffers();
@@ -3333,7 +3381,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
mutex_lock(&tracing_cpumask_update_lock);
local_irq_disable();
arch_spin_lock(&ftrace_max_lock);
arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
/*
* Increase/decrease the disabled counter if we are
@@ -3350,7 +3398,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
}
}
arch_spin_unlock(&ftrace_max_lock);
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
@@ -3592,6 +3640,7 @@ static const char readme_msg[] =
" trace_options\t\t- Set format or modify how tracing happens\n"
"\t\t\t Disable an option by adding a suffix 'no' to the\n"
"\t\t\t option name\n"
" saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
#ifdef CONFIG_DYNAMIC_FTRACE
"\n available_filter_functions - list of functions that can be filtered on\n"
" set_ftrace_filter\t- echo function name in here to only trace these\n"
@@ -3705,55 +3754,153 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
static ssize_t
tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
size_t cnt, loff_t *ppos)
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
char *buf_comm;
char *file_buf;
char *buf;
int len = 0;
int pid;
int i;
unsigned int *ptr = v;
file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
if (!file_buf)
return -ENOMEM;
if (*pos || m->count)
ptr++;
buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
if (!buf_comm) {
kfree(file_buf);
return -ENOMEM;
}
(*pos)++;
buf = file_buf;
for (i = 0; i < SAVED_CMDLINES; i++) {
int r;
pid = map_cmdline_to_pid[i];
if (pid == -1 || pid == NO_CMDLINE_MAP)
for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
ptr++) {
if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
continue;
trace_find_cmdline(pid, buf_comm);
r = sprintf(buf, "%d %s\n", pid, buf_comm);
buf += r;
len += r;
return ptr;
}
len = simple_read_from_buffer(ubuf, cnt, ppos,
file_buf, len);
return NULL;
}
kfree(file_buf);
kfree(buf_comm);
static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
{
void *v;
loff_t l = 0;
return len;
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
v = &savedcmd->map_cmdline_to_pid[0];
while (l <= *pos) {
v = saved_cmdlines_next(m, v, &l);
if (!v)
return NULL;
}
return v;
}
/*
* seq_file ->stop callback: releases trace_cmdline_lock and re-enables
* preemption, undoing the preempt_disable()/arch_spin_lock() pair taken
* in saved_cmdlines_start(). Neither parameter is used.
*/
static void saved_cmdlines_stop(struct seq_file *m, void *v)
{
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
}
/*
 * seq_file ->show callback: @v points at one map_cmdline_to_pid entry.
 * Emits "<pid> <comm>\n" for it and always returns 0.
 */
static int saved_cmdlines_show(struct seq_file *m, void *v)
{
	unsigned int pid = *(unsigned int *)v;
	char comm[TASK_COMM_LEN];

	/* Unlocked lookup variant; this function takes no lock itself. */
	__trace_find_cmdline(pid, comm);
	seq_printf(m, "%d %s\n", pid, comm);

	return 0;
}
/*
* seq_file iterator for the saved cmdlines listing. ->start takes
* trace_cmdline_lock with preemption disabled and ->stop releases it.
*/
static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
.start = saved_cmdlines_start,
.next = saved_cmdlines_next,
.stop = saved_cmdlines_stop,
.show = saved_cmdlines_show,
};
/*
 * Open handler for the saved cmdlines listing: refuses with -ENODEV while
 * tracing_disabled is set, otherwise sets up the seq_file iterator.
 */
static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
{
	return tracing_disabled ? -ENODEV
				: seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}
static const struct file_operations tracing_saved_cmdlines_fops = {
.open = tracing_open_generic,
.read = tracing_saved_cmdlines_read,
.llseek = generic_file_llseek,
.open = tracing_saved_cmdlines_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
/*
 * Report the current number of saved cmdline slots (savedcmd->cmdline_num)
 * to user space as a decimal number followed by a newline.
 *
 * Returns whatever simple_read_from_buffer() returns for the formatted
 * text.
 */
static ssize_t
tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
				 size_t cnt, loff_t *ppos)
{
	char buf[64];
	int r;

	/*
	 * trace_cmdline_lock is a raw arch spinlock, which does not disable
	 * preemption on its own. Disable it explicitly, as the other
	 * blocking holders of this lock do, so that the holder cannot be
	 * scheduled out while other CPUs spin on the lock.
	 */
	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();

	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
/*
 * Release a saved_cmdlines_buffer: both per-slot arrays first, then the
 * container itself. @s must not be NULL.
 */
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
	kfree(s->map_cmdline_to_pid);
	kfree(s->saved_cmdlines);
	kfree(s);
}
/*
 * Replace the global savedcmd buffer with a freshly allocated one holding
 * @val slots. The previous contents are not carried over; the old buffer
 * is freed once the swap is done.
 *
 * Returns 0 on success or -ENOMEM if the new buffer cannot be allocated,
 * in which case the current buffer is left untouched.
 */
static int tracing_resize_saved_cmdlines(unsigned int val)
{
	struct saved_cmdlines_buffer *s, *savedcmd_temp;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	if (allocate_cmdlines_buffer(val, s) < 0) {
		kfree(s);
		return -ENOMEM;
	}

	/*
	 * trace_cmdline_lock is a raw arch spinlock, which does not disable
	 * preemption on its own. Disable it explicitly, as the other
	 * blocking holders of this lock do, so that the holder cannot be
	 * scheduled out while other CPUs spin on the lock.
	 */
	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	savedcmd_temp = savedcmd;
	savedcmd = s;
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();

	/* Free outside the lock; nothing can reach the old buffer now. */
	free_saved_cmdlines_buffer(savedcmd_temp);

	return 0;
}
/*
 * Parse a decimal slot count from user space and resize the saved
 * cmdlines buffer to it.
 *
 * Returns @cnt on success (and advances *@ppos by @cnt), the parser's
 * error if the input is not a number, -EINVAL if the value is outside
 * 1..PID_MAX_DEFAULT, or the error from the resize.
 */
static ssize_t
tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
{
	unsigned long entries;
	int err;

	err = kstrtoul_from_user(ubuf, cnt, 10, &entries);
	if (err)
		return err;

	/* need at least 1 entry and no more than PID_MAX_DEFAULT */
	if (entries == 0 || entries > PID_MAX_DEFAULT)
		return -EINVAL;

	err = tracing_resize_saved_cmdlines((unsigned int)entries);
	if (err < 0)
		return err;

	*ppos += cnt;

	return cnt;
}
/*
* File operations for the saved cmdlines size control: reading reports
* the current slot count, writing resizes the buffer.
*/
static const struct file_operations tracing_saved_cmdlines_size_fops = {
.open = tracing_open_generic,
.read = tracing_saved_cmdlines_size_read,
.write = tracing_saved_cmdlines_size_write,
};
static ssize_t
@@ -4225,29 +4372,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
return trace_poll(iter, filp, poll_table);
}
/*
* This is a makeshift waitqueue: instead of being woken up, the reader
* simply sleeps for a fixed interval and then looks again.
* A tracer might use this callback in some rare cases:
*
* 1) the current tracer might hold the runqueue lock when it wakes up
* a reader, hence a deadlock (sched, function, and function graph tracers)
* 2) the function tracers trace all functions; we don't want
* the overhead of calling wake_up and friends
* (and tracing them too)
*
* Anyway, this is a really very primitive wakeup. @iter is not used.
*/
void poll_wait_pipe(struct trace_iterator *iter)
{
set_current_state(TASK_INTERRUPTIBLE);
/* sleep for 100 msecs, and try again. */
schedule_timeout(HZ / 10);
}
/* Must be called with trace_types_lock mutex held. */
static int tracing_wait_pipe(struct file *filp)
{
struct trace_iterator *iter = filp->private_data;
int ret;
while (trace_empty(iter)) {
@@ -4255,15 +4384,6 @@ static int tracing_wait_pipe(struct file *filp)
return -EAGAIN;
}
mutex_unlock(&iter->mutex);
iter->trace->wait_pipe(iter);
mutex_lock(&iter->mutex);
if (signal_pending(current))
return -EINTR;
/*
* We block until we read something and tracing is disabled.
* We still block if tracing is disabled, but we have never
@@ -4275,6 +4395,18 @@ static int tracing_wait_pipe(struct file *filp)
*/
if (!tracing_is_on() && iter->pos)
break;
mutex_unlock(&iter->mutex);
ret = wait_on_pipe(iter);
mutex_lock(&iter->mutex);
if (ret)
return ret;
if (signal_pending(current))
return -EINTR;
}
return 1;
@@ -5197,8 +5329,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
goto out_unlock;
}
mutex_unlock(&trace_types_lock);
iter->trace->wait_pipe(iter);
ret = wait_on_pipe(iter);
mutex_lock(&trace_types_lock);
if (ret) {
size = ret;
goto out_unlock;
}
if (signal_pending(current)) {
size = -EINTR;
goto out_unlock;
@@ -5408,8 +5544,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
goto out;
}
mutex_unlock(&trace_types_lock);
iter->trace->wait_pipe(iter);
ret = wait_on_pipe(iter);
mutex_lock(&trace_types_lock);
if (ret)
goto out;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
@@ -6102,6 +6240,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return 0;
}
/*
 * Release the ring buffer and the per-cpu data of @buf and clear both
 * pointers. Nothing is touched when @buf has no ring buffer attached.
 */
static void free_trace_buffer(struct trace_buffer *buf)
{
	if (!buf->buffer)
		return;

	ring_buffer_free(buf->buffer);
	buf->buffer = NULL;

	free_percpu(buf->data);
	buf->data = NULL;
}
/*
 * Free the trace buffer of @tr and, when CONFIG_TRACER_MAX_TRACE is set,
 * its max buffer as well. A NULL @tr is accepted and ignored.
 */
static void free_trace_buffers(struct trace_array *tr)
{
	if (tr) {
		free_trace_buffer(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
		free_trace_buffer(&tr->max_buffer);
#endif
	}
}
static int new_instance_create(const char *name)
{
struct trace_array *tr;
@@ -6131,6 +6291,8 @@ static int new_instance_create(const char *name)
raw_spin_lock_init(&tr->start_lock);
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
tr->current_trace = &nop_trace;
INIT_LIST_HEAD(&tr->systems);
@@ -6158,8 +6320,7 @@ static int new_instance_create(const char *name)
return 0;
out_free_tr:
if (tr->trace_buffer.buffer)
ring_buffer_free(tr->trace_buffer.buffer);
free_trace_buffers(tr);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -6199,8 +6360,7 @@ static int instance_delete(const char *name)
event_trace_del_tracer(tr);
ftrace_destroy_function_files(tr);
debugfs_remove_recursive(tr->dir);
free_percpu(tr->trace_buffer.data);
ring_buffer_free(tr->trace_buffer.buffer);
free_trace_buffers(tr);
kfree(tr->name);
kfree(tr);
@@ -6328,6 +6488,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tr->max_latency, &tracing_max_lat_fops);
#endif
if (ftrace_create_function_files(tr, d_tracer))
WARN(1, "Could not allocate function filter files");
@@ -6353,11 +6518,6 @@ static __init int tracer_init_debugfs(void)
init_tracer_debugfs(&global_trace, d_tracer);
#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tracing_max_latency, &tracing_max_lat_fops);
#endif
trace_create_file("tracing_thresh", 0644, d_tracer,
&tracing_thresh, &tracing_max_lat_fops);
@@ -6367,6 +6527,9 @@ static __init int tracer_init_debugfs(void)
trace_create_file("saved_cmdlines", 0444, d_tracer,
NULL, &tracing_saved_cmdlines_fops);
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6603,18 +6766,19 @@ __init static int tracer_alloc_buffers(void)
if (!temp_buffer)
goto out_free_cpumask;
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
goto out_free_temp_buffer;
goto out_free_savedcmd;
}
if (global_trace.buffer_disabled)
tracing_off();
trace_init_cmdlines();
if (trace_boot_clock) {
ret = tracing_set_clock(&global_trace, trace_boot_clock);
if (ret < 0)
@@ -6629,6 +6793,10 @@ __init static int tracer_alloc_buffers(void)
*/
global_trace.current_trace = &nop_trace;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
ftrace_init_global_array_ops(&global_trace);
register_tracer(&nop_trace);
/* All seems OK, enable tracing */
@@ -6656,13 +6824,11 @@ __init static int tracer_alloc_buffers(void)
return 0;
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
ring_buffer_free(temp_buffer);
out_free_cpumask:
free_percpu(global_trace.trace_buffer.data);
#ifdef CONFIG_TRACER_MAX_TRACE
free_percpu(global_trace.max_buffer.data);
#endif
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);

View File

@@ -190,7 +190,22 @@ struct trace_array {
*/
struct trace_buffer max_buffer;
bool allocated_snapshot;
unsigned long max_latency;
#endif
/*
* max_lock is used to protect the swapping of buffers
* when taking a max snapshot. The buffers themselves are
* protected by per_cpu spinlocks. But the action of the swap
* needs its own lock.
*
* This is defined as an arch_spinlock_t in order to help
* with performance when lockdep debugging is enabled.
*
* It is also used in other places outside the update_max_tr
* so it needs to be defined outside of the
* CONFIG_TRACER_MAX_TRACE.
*/
arch_spinlock_t max_lock;
int buffer_disabled;
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
@@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)
{
struct trace_array *tr;
if (list_empty(&ftrace_trace_arrays))
return NULL;
tr = list_entry(ftrace_trace_arrays.prev,
typeof(*tr), list);
WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
@@ -323,7 +341,6 @@ struct tracer_flags {
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
* @wait_pipe: override how the user waits for traces on trace_pipe
* @close: called when the trace file is released
* @pipe_close: called when the trace_pipe file is released
* @read: override the default read callback on trace_pipe
@@ -342,7 +359,6 @@ struct tracer {
void (*stop)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
void (*pipe_close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
@@ -416,13 +432,7 @@ enum {
TRACE_FTRACE_IRQ_BIT,
TRACE_FTRACE_SIRQ_BIT,
/* GLOBAL_BITs must be greater than FTRACE_BITs */
TRACE_GLOBAL_BIT,
TRACE_GLOBAL_NMI_BIT,
TRACE_GLOBAL_IRQ_BIT,
TRACE_GLOBAL_SIRQ_BIT,
/* INTERNAL_BITs must be greater than GLOBAL_BITs */
/* INTERNAL_BITs must be greater than FTRACE_BITs */
TRACE_INTERNAL_BIT,
TRACE_INTERNAL_NMI_BIT,
TRACE_INTERNAL_IRQ_BIT,
@@ -449,9 +459,6 @@ enum {
#define TRACE_FTRACE_START TRACE_FTRACE_BIT
#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
#define TRACE_LIST_START TRACE_INTERNAL_BIT
#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
@@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
void poll_wait_pipe(struct trace_iterator *iter);
void tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
@@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
extern unsigned long tracing_max_latency;
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
@@ -724,6 +727,8 @@ extern unsigned long trace_flags;
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
#define TRACE_GRAPH_PRINT_TAIL 0x80
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -823,6 +828,10 @@ extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent);
void ftrace_destroy_function_files(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
int using_ftrace_ops_list_func(void);
#else
static inline int ftrace_trace_task(struct task_struct *task)
{
@@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr,
return 0;
}
/* Empty stand-ins for the !CONFIG_FUNCTION_TRACER build; each one does nothing. */
static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
/* ftrace_func_t type is not defined here, so use a macro instead of a static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)

View File

@@ -0,0 +1,198 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/trace_clock.h>
#define CREATE_TRACE_POINTS
#include "trace_benchmark.h"
/* Thread running benchmark_event_kthread(); set by trace_benchmark_reg(). */
static struct task_struct *bm_event_thread;
/* String handed to the next benchmark_event write; starts out as "START". */
static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
/* Running sum of the measured deltas. */
static u64 bm_total;
/* Running sum of the squared deltas. */
static u64 bm_totalsq;
/* Delta of the most recent measurement. */
static u64 bm_last;
/* Largest delta seen so far. */
static u64 bm_max;
/* Smallest delta seen so far (0 means "not set yet"). */
static u64 bm_min;
/* Delta of the very first measurement, kept apart from the others. */
static u64 bm_first;
/* Number of measurements taken. */
static u64 bm_cnt;
/* Last computed squared deviation, reported as "std^2". */
static u64 bm_stddev;
/* Last computed average and deviation, reused once the statistics freeze. */
static unsigned int bm_avg;
static unsigned int bm_std;
/*
* This gets called in a loop recording the time it took to write
* the tracepoint. What it writes is the time statistics of the last
* tracepoint write. As there is nothing to write the first time
* it simply writes "START". As the first write is cold cache and
* the rest is hot, we save off that time in bm_first and it is
* reported as "first", which is shown in the second write to the
* tracepoint. The "first" field is writen within the statics from
* then on but never changes.
*/
static void trace_do_benchmark(void)
{
u64 start;
u64 stop;
u64 delta;
u64 stddev;
u64 seed;
u64 last_seed;
unsigned int avg;
unsigned int std = 0;
/* Only run if the tracepoint is actually active */
if (!trace_benchmark_event_enabled())
return;
local_irq_disable();
start = trace_clock_local();
trace_benchmark_event(bm_str);
stop = trace_clock_local();
local_irq_enable();
bm_cnt++;
delta = stop - start;
/*
* The first read is cold cached, keep it separate from the
* other calculations.
*/
if (bm_cnt == 1) {
bm_first = delta;
scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
"first=%llu [COLD CACHED]", bm_first);
return;
}
bm_last = delta;
if (delta > bm_max)
bm_max = delta;
if (!bm_min || delta < bm_min)
bm_min = delta;
/*
* When bm_cnt is greater than UINT_MAX, it breaks the statistics
* accounting. Freeze the statistics when that happens.
* We should have enough data for the avg and stddev anyway.
*/
if (bm_cnt > UINT_MAX) {
scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
"last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
return;
}
bm_total += delta;
bm_totalsq += delta * delta;
if (bm_cnt > 1) {
/*
* Apply Welford's method to calculate standard deviation:
* s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
*/
stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
do_div(stddev, (u32)bm_cnt);
do_div(stddev, (u32)bm_cnt - 1);
} else
stddev = 0;
delta = bm_total;
do_div(delta, bm_cnt);
avg = delta;
if (stddev > 0) {
int i = 0;
/*
* stddev is the square of standard deviation but
* we want the actualy number. Use the average
* as our seed to find the std.
*
* The next try is:
* x = (x + N/x) / 2
*
* Where N is the squared number to find the square
* root of.
*/
seed = avg;
do {
last_seed = seed;
seed = stddev;
if (!last_seed)
break;
do_div(seed, last_seed);
seed += last_seed;
do_div(seed, 2);
} while (i++ < 10 && last_seed != seed);
std = seed;
}
scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
"last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
bm_std = std;
bm_avg = avg;
bm_stddev = stddev;
}
/*
 * Thread body: run trace_do_benchmark() back to back until the thread is
 * asked to stop. @arg is unused. Always returns 0.
 */
static int benchmark_event_kthread(void *arg)
{
	/* sleep a bit to make sure the tracepoint gets activated */
	msleep(100);

	for (;;) {
		if (kthread_should_stop())
			break;

		trace_do_benchmark();

		/*
		 * We don't go to sleep, but let others
		 * run as well.
		 */
		cond_resched();
	}

	return 0;
}
/*
 * When the benchmark tracepoint is enabled, it calls this
 * function and the thread that calls the tracepoint is created.
 *
 * If the thread cannot be created a warning is issued and
 * bm_event_thread is left NULL.
 */
void trace_benchmark_reg(void)
{
	bm_event_thread = kthread_run(benchmark_event_kthread,
				      NULL, "event_benchmark");
	/*
	 * kthread_run() reports failure with an ERR_PTR() value, never
	 * with NULL. Warn about it and store NULL instead, so that the
	 * NULL check in trace_benchmark_unreg() keeps kthread_stop() from
	 * being called on an error pointer.
	 */
	if (WARN_ON(IS_ERR(bm_event_thread)))
		bm_event_thread = NULL;
}
/*
* When the benchmark tracepoint is disabled, it calls this
* function and the thread that calls the tracepoint is deleted
* and all the numbers are reset.
*/
void trace_benchmark_unreg(void)
{
if (!bm_event_thread)
return;
kthread_stop(bm_event_thread);
strcpy(bm_str, "START");
bm_total = 0;
bm_totalsq = 0;
bm_last = 0;
bm_max = 0;
bm_min = 0;
bm_cnt = 0;
/* These don't need to be reset but reset them anyway */
bm_first = 0;
bm_std = 0;
bm_avg = 0;
bm_stddev = 0;
}

View File

@@ -0,0 +1,41 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM benchmark
#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BENCHMARK_H
#include <linux/tracepoint.h>
extern void trace_benchmark_reg(void);
extern void trace_benchmark_unreg(void);
#define BENCHMARK_EVENT_STRLEN 128
/*
 * benchmark_event - event carrying a fixed-size text field
 *
 * The event stores BENCHMARK_EVENT_STRLEN bytes copied from @str (the
 * copy is a plain memcpy of that many bytes, so the caller's buffer has to
 * be at least that large) and prints them as a string.
 *
 * trace_benchmark_reg() and trace_benchmark_unreg() are passed as the
 * trailing callback arguments of TRACE_EVENT_FN.
 */
TRACE_EVENT_FN(benchmark_event,
TP_PROTO(const char *str),
TP_ARGS(str),
TP_STRUCT__entry(
__array( char, str, BENCHMARK_EVENT_STRLEN )
),
TP_fast_assign(
memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
),
TP_printk("%s", __entry->str),
trace_benchmark_reg, trace_benchmark_unreg
);
#endif /* _TRACE_BENCHMARK_H */
#undef TRACE_INCLUDE_FILE
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace_benchmark
/* This part must be outside protection */
#include <trace/define_trace.h>

View File

@@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs *regs, int *rctxp)
void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs *regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
NOKPROBE_SYMBOL(perf_trace_buf_prepare);
#ifdef CONFIG_FUNCTION_TRACER
static void

View File

@@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)
{
struct trace_array *tr = top_trace_array();
if (!tr)
return -ENODEV;
return __ftrace_set_clr_event(tr, NULL, system, event, set);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash,
bool enable;
int ret;
if (!tr)
return -ENODEV;
/* hash funcs only work with set_ftrace_filter */
if (!enabled || !param)
return -EINVAL;
@@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void)
char *token;
int ret;
if (!tr)
return -ENODEV;
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
call = *iter;
@@ -2442,6 +2451,8 @@ static __init int event_trace_init(void)
int ret;
tr = top_trace_array();
if (!tr)
return -ENODEV;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void)
int ret;
tr = top_trace_array();
if (!tr)
return;
pr_info("Running tests on trace events:\n");

View File

@@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs);
static struct ftrace_ops trace_ops;
static struct ftrace_ops trace_stack_ops;
static struct tracer_flags func_flags;
/* Our option */
@@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr)
static int function_trace_init(struct trace_array *tr)
{
struct ftrace_ops *ops;
ftrace_func_t func;
if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
/* There's only one global tr */
if (!trace_ops.private) {
trace_ops.private = tr;
trace_stack_ops.private = tr;
}
if (func_flags.val & TRACE_FUNC_OPT_STACK)
ops = &trace_stack_ops;
else
ops = &trace_ops;
tr->ops = ops;
} else if (!tr->ops) {
/*
* Instance trace_arrays get their ops allocated
* at instance creation. Unless it failed
* the allocation.
*/
/*
* Instance trace_arrays get their ops allocated
* at instance creation. Unless it failed
* the allocation.
*/
if (!tr->ops)
return -ENOMEM;
}
/* Currently only the global instance can do stack tracing */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
func_flags.val & TRACE_FUNC_OPT_STACK)
func = function_stack_trace_call;
else
func = function_trace_call;
ftrace_init_array_ops(tr, func);
tr->trace_buffer.cpu = get_cpu();
put_cpu();
@@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr)
{
tracing_stop_function_trace(tr);
tracing_stop_cmdline_record();
ftrace_reset_array_ops(tr);
}
static void function_trace_start(struct trace_array *tr)
@@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
local_irq_restore(flags);
}
/* ftrace_ops whose callback is function_trace_call. */
static struct ftrace_ops trace_ops __read_mostly =
{
.func = function_trace_call,
.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
/* ftrace_ops whose callback is function_stack_trace_call; same flags as trace_ops. */
static struct ftrace_ops trace_stack_ops __read_mostly =
{
.func = function_stack_trace_call,
.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
unregister_ftrace_function(tr->ops);
if (set) {
tr->ops = &trace_stack_ops;
tr->ops->func = function_stack_trace_call;
register_ftrace_function(tr->ops);
} else {
tr->ops = &trace_ops;
tr->ops->func = function_trace_call;
register_ftrace_function(tr->ops);
}
@@ -269,7 +252,6 @@ static struct tracer function_trace __tracer_data =
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.wait_pipe = poll_wait_pipe,
.flags = &func_flags,
.set_flag = func_set_flag,
.allow_instances = true,

View File

@@ -38,15 +38,6 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
/* Flag options */
#define TRACE_GRAPH_PRINT_OVERRUN 0x1
#define TRACE_GRAPH_PRINT_CPU 0x2
#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
/* Display interrupts */
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
/* Display function name after trailing } */
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns and proc by default */
/* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
.opts = trace_opts
@@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* If the return function does not have a matching entry,
* then the entry was lost. Instead of just printing
* the '}' and letting the user guess what function this
* belongs to, write out the function name.
* belongs to, write out the function name. Always do
* that if the funcgraph-tail option is enabled.
*/
if (func_match) {
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
ret = trace_seq_puts(s, "}\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = {
.pipe_open = graph_trace_open,
.close = graph_trace_close,
.pipe_close = graph_trace_close,
.wait_pipe = poll_wait_pipe,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,

View File

@@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
atomic_dec(&data->disabled);
}
/* ftrace_ops whose callback is irqsoff_tracer_call. */
static struct ftrace_ops trace_ops __read_mostly =
{
.func = irqsoff_tracer_call,
.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
for_each_possible_cpu(cpu)
per_cpu(tracing_cpu, cpu) = 0;
tracing_max_latency = 0;
tr->max_latency = 0;
tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
return start_irqsoff_tracer(irqsoff_trace, set);
@@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
static int report_latency(cycle_t delta)
static int report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
return 0;
} else {
if (delta <= tracing_max_latency)
if (delta <= tr->max_latency)
return 0;
}
return 1;
@@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
pc = preempt_count();
if (!report_latency(delta))
if (!report_latency(tr, delta))
goto out;
raw_spin_lock_irqsave(&max_trace_lock, flags);
/* check if we are still the max latency */
if (!report_latency(delta))
if (!report_latency(tr, delta))
goto out_unlock;
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
data->critical_end = parent_ip;
if (likely(!is_tracing_stopped())) {
tracing_max_latency = delta;
tr->max_latency = delta;
update_max_tr_single(tr, current, cpu);
}
@@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
static int register_irqsoff_function(int graph, int set)
static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
@@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)
ret = register_ftrace_graph(&irqsoff_graph_return,
&irqsoff_graph_entry);
else
ret = register_ftrace_function(&trace_ops);
ret = register_ftrace_function(tr->ops);
if (!ret)
function_enabled = true;
@@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)
return ret;
}
static void unregister_irqsoff_function(int graph)
static void unregister_irqsoff_function(struct trace_array *tr, int graph)
{
if (!function_enabled)
return;
@@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph)
if (graph)
unregister_ftrace_graph();
else
unregister_ftrace_function(&trace_ops);
unregister_ftrace_function(tr->ops);
function_enabled = false;
}
static void irqsoff_function_set(int set)
static void irqsoff_function_set(struct trace_array *tr, int set)
{
if (set)
register_irqsoff_function(is_graph(), 1);
register_irqsoff_function(tr, is_graph(), 1);
else
unregister_irqsoff_function(is_graph());
unregister_irqsoff_function(tr, is_graph());
}
static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
struct tracer *tracer = tr->current_trace;
if (mask & TRACE_ITER_FUNCTION)
irqsoff_function_set(set);
irqsoff_function_set(tr, set);
return trace_keep_overwrite(tracer, mask, set);
}
@@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)
{
int ret;
ret = register_irqsoff_function(graph, 0);
ret = register_irqsoff_function(tr, graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
unregister_irqsoff_function(graph);
unregister_irqsoff_function(tr, graph);
}
static void __irqsoff_tracer_init(struct trace_array *tr)
static bool irqsoff_busy;
static int __irqsoff_tracer_init(struct trace_array *tr)
{
if (irqsoff_busy)
return -EBUSY;
save_flags = trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
tr->max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
tracing_reset_online_cpus(&tr->trace_buffer);
if (start_irqsoff_tracer(tr, is_graph()))
ftrace_init_array_ops(tr, irqsoff_tracer_call);
/* Only toplevel instance supports graph tracing */
if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
is_graph())))
printk(KERN_ERR "failed to start irqsoff tracer\n");
irqsoff_busy = true;
return 0;
}
static void irqsoff_tracer_reset(struct trace_array *tr)
@@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
ftrace_reset_array_ops(tr);
irqsoff_busy = false;
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF;
__irqsoff_tracer_init(tr);
return 0;
return __irqsoff_tracer_init(tr);
}
static struct tracer irqsoff_tracer __read_mostly =
{
@@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
.allow_instances = true,
.use_max_tr = true,
};
# define register_irqsoff(trace) register_tracer(&trace)
@@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_PREEMPT_OFF;
__irqsoff_tracer_init(tr);
return 0;
return __irqsoff_tracer_init(tr);
}
static struct tracer preemptoff_tracer __read_mostly =
@@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
.allow_instances = true,
.use_max_tr = true,
};
# define register_preemptoff(trace) register_tracer(&trace)
@@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
__irqsoff_tracer_init(tr);
return 0;
return __irqsoff_tracer_init(tr);
}
static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
.allow_instances = true,
.use_max_tr = true,
};

View File

@@ -40,27 +40,27 @@ struct trace_kprobe {
(sizeof(struct probe_arg) * (n)))
static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk)
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
}
static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk)
static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
{
return tk->symbol ? tk->symbol : "unknown";
}
static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
{
return tk->rp.kp.offset;
}
static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk)
static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
{
return !!(kprobe_gone(&tk->rp.kp));
}
static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
struct module *mod)
{
int len = strlen(mod->name);
@@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(mod->name, name, len) == 0 && name[len] == ':';
}
static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
{
return !!strchr(trace_kprobe_symbol(tk), ':');
}
@@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
* Kprobes-specific fetch functions
*/
#define DEFINE_FETCH_stack(type) \
static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
(unsigned int)((unsigned long)offset)); \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
DEFINE_BASIC_FETCH_FUNCS(stack)
/* No string on the stack entry */
#define fetch_stack_string NULL
#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
void *addr, void *dest) \
{ \
type retval; \
@@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
*(type *)dest = 0; \
else \
*(type *)dest = retval; \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
DEFINE_BASIC_FETCH_FUNCS(memory)
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
{
long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
@@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
get_rloc_offs(*(u32 *)dest));
}
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
/* Return the length of string -- including null terminal byte */
static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
void *addr, void *dest)
static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
void *addr, void *dest)
{
mm_segment_t old_fs;
int ret, len = 0;
@@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
else
*(u32 *)dest = len;
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
#define DEFINE_FETCH_symbol(type) \
__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \
void *data, void *dest) \
void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
{ \
struct symbol_cache *sc = data; \
if (sc->addr) \
fetch_memory_##type(regs, (void *)sc->addr, dest); \
else \
*(type *)dest = 0; \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
DEFINE_BASIC_FETCH_FUNCS(symbol)
DEFINE_FETCH_symbol(string)
DEFINE_FETCH_symbol(string_size)
@@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = {
};
/* Kprobe handler */
static __kprobes void
static nokprobe_inline void
__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
{
@@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
entry, irq_flags, pc, regs);
}
static __kprobes void
static void
kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct event_file_link *link;
@@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
list_for_each_entry_rcu(link, &tk->tp.files, list)
__kprobe_trace_func(tk, regs, link->file);
}
NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
static __kprobes void
static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
@@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry, irq_flags, pc, regs);
}
static __kprobes void
static void
kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
list_for_each_entry_rcu(link, &tk->tp.files, list)
__kretprobe_trace_func(tk, ri, regs, link->file);
}
NOKPROBE_SYMBOL(kretprobe_trace_func);
/* Event entry printers */
static enum print_line_t
@@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
static __kprobes void
static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
@@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
/* Kretprobe profile handler */
static __kprobes void
static void
kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
/*
@@ -1196,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
* kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
* lockless, but we can't race with this __init function.
*/
static __kprobes
int kprobe_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
static int kprobe_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
struct ftrace_event_file *file = data;
@@ -1224,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event,
return 0;
}
static __kprobes
int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
@@ -1239,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
NOKPROBE_SYMBOL(kprobe_dispatcher);
static __kprobes
int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
@@ -1255,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
NOKPROBE_SYMBOL(kretprobe_dispatcher);
static struct trace_event_functions kretprobe_funcs = {
.trace = print_kretprobe_event
@@ -1377,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void)
struct trace_kprobe *tk;
struct ftrace_event_file *file;
if (tracing_is_disabled())
return -ENODEV;
target = kprobe_trace_selftest_target;
pr_info("Testing kprobe tracing: ");

View File

@@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly =
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
.wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif

View File

@@ -125,6 +125,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
}
EXPORT_SYMBOL_GPL(trace_seq_printf);
/**
 * trace_seq_bitmask - put a list of longs as a bitmask print output
 * @s:		trace sequence descriptor
 * @maskp:	points to an array of unsigned longs that represent a bitmask
 * @nmaskbits:	The number of bits that are valid in @maskp
 *
 * Writes an ASCII representation of a bitmask string into @s, appending it
 * after whatever @s already holds.
 *
 * It returns 0 if @s is already marked full or has no free space left,
 * 1 otherwise.
 */
int
trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
		  int nmaskbits)
{
	/* Free space left in the buffer, computed against PAGE_SIZE - 1. */
	int len = (PAGE_SIZE - 1) - s->len;
	int ret;

	if (s->full || !len)
		return 0;

	/*
	 * Format at the current write position. Starting at s->buffer would
	 * overwrite everything already stored in the sequence while s->len
	 * still advanced as if the text had been appended.
	 */
	ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits);
	s->len += ret;

	return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_bitmask);
/**
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
@@ -398,6 +426,19 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
#endif
/*
 * Print the bitmask found at @bitmask_ptr into @p.
 *
 * @bitmask_size is multiplied by 8 to obtain the bit count handed to
 * trace_seq_bitmask(); afterwards a 0 character is pushed with
 * trace_seq_putc().
 *
 * Returns the address inside @p's buffer that corresponded to the write
 * position (p->buffer + p->len) on entry.
 */
const char *
ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
			 unsigned int bitmask_size)
{
	/* Capture the position before anything is added to the sequence. */
	const char *start = p->buffer + p->len;
	unsigned int nbits = bitmask_size * 8;

	trace_seq_bitmask(p, bitmask_ptr, nbits);
	trace_seq_putc(p, 0);

	return start;
}
EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{

View File

@@ -37,13 +37,13 @@ const char *reserved_field_names[] = {
/* Printing in basic type function template */
#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
const char *name, \
void *data, void *ent) \
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
} \
const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
/* Print type function for string type */
__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
const char *name,
void *data, void *ent)
int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
void *data, void *ent)
{
int len = *(u32 *)data >> 16;
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
return trace_seq_printf(s, " %s=\"%s\"", name,
(const char *)get_loc_data(data, ent));
}
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
/* Data fetch function templates */
#define DEFINE_FETCH_reg(type) \
__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
void *offset, void *dest) \
void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_register(regs, \
(unsigned int)((unsigned long)offset)); \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
DEFINE_BASIC_FETCH_FUNCS(reg)
/* No string on the register */
#define fetch_reg_string NULL
#define fetch_reg_string_size NULL
#define DEFINE_FETCH_retval(type) \
__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
void *dummy, void *dest) \
void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
void *dummy, void *dest) \
{ \
*(type *)dest = (type)regs_return_value(regs); \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
DEFINE_BASIC_FETCH_FUNCS(retval)
/* No string on the retval */
#define fetch_retval_string NULL
@@ -112,8 +113,8 @@ struct deref_fetch_param {
};
#define DEFINE_FETCH_deref(type) \
__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
void *data, void *dest) \
void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
void *data, void *dest) \
{ \
struct deref_fetch_param *dprm = data; \
unsigned long addr; \
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
dprm->fetch(regs, (void *)addr, dest); \
} else \
*(type *)dest = 0; \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
DEFINE_BASIC_FETCH_FUNCS(deref)
DEFINE_FETCH_deref(string)
__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
void *data, void *dest)
void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
void *data, void *dest)
{
struct deref_fetch_param *dprm = data;
unsigned long addr;
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
} else
*(string_size *)dest = 0;
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
static void update_deref_fetch_param(struct deref_fetch_param *data)
{
if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
update_deref_fetch_param(data->orig.data);
else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
update_symbol_cache(data->orig.data);
}
NOKPROBE_SYMBOL(update_deref_fetch_param);
static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
static void free_deref_fetch_param(struct deref_fetch_param *data)
{
if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
free_deref_fetch_param(data->orig.data);
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
free_symbol_cache(data->orig.data);
kfree(data);
}
NOKPROBE_SYMBOL(free_deref_fetch_param);
/* Bitfield fetch function */
struct bitfield_fetch_param {
@@ -166,8 +171,8 @@ struct bitfield_fetch_param {
};
#define DEFINE_FETCH_bitfield(type) \
__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
void *data, void *dest) \
void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
void *data, void *dest) \
{ \
struct bitfield_fetch_param *bprm = data; \
type buf = 0; \
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
buf >>= bprm->low_shift; \
} \
*(type *)dest = buf; \
}
} \
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
DEFINE_BASIC_FETCH_FUNCS(bitfield)
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
static __kprobes void
static void
update_bitfield_fetch_param(struct bitfield_fetch_param *data)
{
/*
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data)
update_symbol_cache(data->orig.data);
}
static __kprobes void
static void
free_bitfield_fetch_param(struct bitfield_fetch_param *data)
{
/*
@@ -255,17 +260,17 @@ fail:
}
/* Special function : only accept unsigned long */
static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs,
void *dummy, void *dest)
static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
{
*(unsigned long *)dest = kernel_stack_pointer(regs);
}
NOKPROBE_SYMBOL(fetch_kernel_stack_address);
static __kprobes void fetch_user_stack_address(struct pt_regs *regs,
void *dummy, void *dest)
static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
{
*(unsigned long *)dest = user_stack_pointer(regs);
}
NOKPROBE_SYMBOL(fetch_user_stack_address);
static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
fetch_func_t orig_fn,

View File

@@ -81,13 +81,13 @@
*/
#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
static inline void *get_rloc_data(u32 *dl)
static nokprobe_inline void *get_rloc_data(u32 *dl)
{
return (u8 *)dl + get_rloc_offs(*dl);
}
/* For data_loc conversion */
static inline void *get_loc_data(u32 *dl, void *ent)
static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
{
return (u8 *)ent + get_rloc_offs(*dl);
}
@@ -136,9 +136,8 @@ typedef u32 string_size;
/* Printing in basic type function template */
#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
const char *name, \
void *data, void *ent); \
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
void *data, void *ent); \
extern const char PRINT_TYPE_FMT_NAME(type)[]
DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
return !!(tp->flags & TP_FLAG_REGISTERED);
}
static inline __kprobes void call_fetch(struct fetch_param *fprm,
static nokprobe_inline void call_fetch(struct fetch_param *fprm,
struct pt_regs *regs, void *dest)
{
return fprm->fn(regs, fprm->data, dest);
@@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file,
extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
/* Sum up total data length for dynamic arrays (strings) */
static inline __kprobes int
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
{
int i, ret = 0;
@@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
}
/* Store the value of each argument */
static inline __kprobes void
static nokprobe_inline void
store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
u8 *data, int maxlen)
{

View File

@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
atomic_dec(&data->disabled);
preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __read_mostly =
{
.func = wakeup_tracer_call,
.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
#endif /* CONFIG_FUNCTION_TRACER */
static int register_wakeup_function(int graph, int set)
static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)
ret = register_ftrace_graph(&wakeup_graph_return,
&wakeup_graph_entry);
else
ret = register_ftrace_function(&trace_ops);
ret = register_ftrace_function(tr->ops);
if (!ret)
function_enabled = true;
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)
return ret;
}
static void unregister_wakeup_function(int graph)
static void unregister_wakeup_function(struct trace_array *tr, int graph)
{
if (!function_enabled)
return;
@@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph)
if (graph)
unregister_ftrace_graph();
else
unregister_ftrace_function(&trace_ops);
unregister_ftrace_function(tr->ops);
function_enabled = false;
}
static void wakeup_function_set(int set)
static void wakeup_function_set(struct trace_array *tr, int set)
{
if (set)
register_wakeup_function(is_graph(), 1);
register_wakeup_function(tr, is_graph(), 1);
else
unregister_wakeup_function(is_graph());
unregister_wakeup_function(tr, is_graph());
}
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
struct tracer *tracer = tr->current_trace;
if (mask & TRACE_ITER_FUNCTION)
wakeup_function_set(set);
wakeup_function_set(tr, set);
return trace_keep_overwrite(tracer, mask, set);
}
static int start_func_tracer(int graph)
static int start_func_tracer(struct trace_array *tr, int graph)
{
int ret;
ret = register_wakeup_function(graph, 0);
ret = register_wakeup_function(tr, graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -203,11 +197,11 @@ static int start_func_tracer(int graph)
return ret;
}
static void stop_func_tracer(int graph)
static void stop_func_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
unregister_wakeup_function(graph);
unregister_wakeup_function(tr, graph);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (!(is_graph() ^ set))
return 0;
stop_func_tracer(!set);
stop_func_tracer(tr, !set);
wakeup_reset(wakeup_trace);
tracing_max_latency = 0;
tr->max_latency = 0;
return start_func_tracer(set);
return start_func_tracer(tr, set);
}
static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
static int report_latency(cycle_t delta)
static int report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
return 0;
} else {
if (delta <= tracing_max_latency)
if (delta <= tr->max_latency)
return 0;
}
return 1;
@@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
T1 = ftrace_now(cpu);
delta = T1-T0;
if (!report_latency(delta))
if (!report_latency(wakeup_trace, delta))
goto out_unlock;
if (likely(!is_tracing_stopped())) {
tracing_max_latency = delta;
wakeup_trace->max_latency = delta;
update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
}
@@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
if (start_func_tracer(is_graph()))
if (start_func_tracer(tr, is_graph()))
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
@@ -600,13 +594,15 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
stop_func_tracer(is_graph());
stop_func_tracer(tr, is_graph());
unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
unregister_trace_sched_wakeup(probe_wakeup, NULL);
unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
static bool wakeup_busy;
static int __wakeup_tracer_init(struct trace_array *tr)
{
save_flags = trace_flags;
@@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
tr->max_latency = 0;
wakeup_trace = tr;
ftrace_init_array_ops(tr, wakeup_tracer_call);
start_wakeup_tracer(tr);
wakeup_busy = true;
return 0;
}
static int wakeup_tracer_init(struct trace_array *tr)
{
if (wakeup_busy)
return -EBUSY;
wakeup_dl = 0;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
@@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)
static int wakeup_rt_tracer_init(struct trace_array *tr)
{
if (wakeup_busy)
return -EBUSY;
wakeup_dl = 0;
wakeup_rt = 1;
return __wakeup_tracer_init(tr);
@@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
static int wakeup_dl_tracer_init(struct trace_array *tr)
{
if (wakeup_busy)
return -EBUSY;
wakeup_dl = 1;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
@@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
ftrace_reset_array_ops(tr);
wakeup_busy = false;
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
.allow_instances = true,
.use_max_tr = true,
};
@@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
@@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
.allow_instances = true,
.use_max_tr = true,
};
@@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,

View File

@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
/* Don't allow flipping of max traces now */
local_irq_save(flags);
arch_spin_lock(&ftrace_max_lock);
arch_spin_lock(&buf->tr->max_lock);
cnt = ring_buffer_entries(buf->buffer);
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
break;
}
tracing_on();
arch_spin_unlock(&ftrace_max_lock);
arch_spin_unlock(&buf->tr->max_lock);
local_irq_restore(flags);
if (count)
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_global = {
.func = trace_selftest_test_global_func,
.flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
static void print_counts(void)
{
printk("(%d %d %d %d %d) ",
@@ -185,7 +180,7 @@ static void reset_counts(void)
trace_selftest_test_dyn_cnt = 0;
}
static int trace_selftest_ops(int cnt)
static int trace_selftest_ops(struct trace_array *tr, int cnt)
{
int save_ftrace_enabled = ftrace_enabled;
struct ftrace_ops *dyn_ops;
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)
register_ftrace_function(&test_probe1);
register_ftrace_function(&test_probe2);
register_ftrace_function(&test_probe3);
register_ftrace_function(&test_global);
/* First time we are running with main function */
if (cnt > 1) {
ftrace_init_array_ops(tr, trace_selftest_test_global_func);
register_ftrace_function(tr->ops);
}
DYN_FTRACE_TEST_NAME();
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)
goto out;
if (trace_selftest_test_probe3_cnt != 1)
goto out;
if (trace_selftest_test_global_cnt == 0)
goto out;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
goto out;
}
DYN_FTRACE_TEST_NAME2();
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)
goto out_free;
if (trace_selftest_test_probe3_cnt != 3)
goto out_free;
if (trace_selftest_test_global_cnt == 0)
goto out;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
goto out;
}
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)
unregister_ftrace_function(&test_probe1);
unregister_ftrace_function(&test_probe2);
unregister_ftrace_function(&test_probe3);
unregister_ftrace_function(&test_global);
if (cnt > 1)
unregister_ftrace_function(tr->ops);
ftrace_reset_array_ops(tr);
/* Make sure everything is off */
reset_counts();
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)
}
/* Test dynamic code modification and ftrace filters */
int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
struct trace_array *tr,
int (*func)(void))
static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
struct trace_array *tr,
int (*func)(void))
{
int save_ftrace_enabled = ftrace_enabled;
unsigned long count;
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
}
/* Test the ops with global tracing running */
ret = trace_selftest_ops(1);
ret = trace_selftest_ops(tr, 1);
trace->reset(tr);
out:
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* Test the ops with global tracing off */
if (!ret)
ret = trace_selftest_ops(2);
ret = trace_selftest_ops(tr, 2);
return ret;
}
@@ -802,7 +807,7 @@ out:
int
trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tracing_max_latency;
unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
tracing_max_latency = 0;
tr->max_latency = 0;
/* disable interrupts for a bit */
local_irq_disable();
udelay(100);
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
ret = -1;
}
tracing_max_latency = save_max;
tr->max_latency = save_max;
return ret;
}
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
int
trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tracing_max_latency;
unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
tracing_max_latency = 0;
tr->max_latency = 0;
/* disable preemption for a bit */
preempt_disable();
udelay(100);
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
ret = -1;
}
tracing_max_latency = save_max;
tr->max_latency = save_max;
return ret;
}
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
int
trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tracing_max_latency;
unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* reset the max latency */
tracing_max_latency = 0;
tr->max_latency = 0;
/* disable preemption and interrupts for a bit */
preempt_disable();
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* do the test by disabling interrupts first this time */
tracing_max_latency = 0;
tr->max_latency = 0;
tracing_start();
trace->start(tr);
@@ -1004,7 +1009,7 @@ out:
tracing_start();
out_no_start:
trace->reset(tr);
tracing_max_latency = save_max;
tr->max_latency = save_max;
return ret;
}
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)
int
trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tracing_max_latency;
unsigned long save_max = tr->max_latency;
struct task_struct *p;
struct completion is_ready;
unsigned long count;
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
tracing_max_latency = 0;
tr->max_latency = 0;
while (p->on_rq) {
/*
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
trace->reset(tr);
tracing_start();
tracing_max_latency = save_max;
tr->max_latency = save_max;
/* kill the thread */
kthread_stop(p);

View File

@@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
/*
 * Emit the recorded maximum-stack trace through pr_emerg(): a two-line
 * header, then one line per entry giving its index, the value stored in
 * stack_dump_index[], a per-entry size and the symbol for the address in
 * stack_dump_trace[].
 *
 * The walk stops at max_stack_trace.nr_entries or at the first
 * ULONG_MAX entry in stack_dump_trace[], whichever comes first.
 */
static inline void print_max_stack(void)
{
	long idx;
	int size;

	pr_emerg(" Depth Size Location (%d entries)\n"
		 " ----- ---- --------\n",
		 max_stack_trace.nr_entries - 1);

	for (idx = 0; idx < max_stack_trace.nr_entries; idx++) {
		if (stack_dump_trace[idx] == ULONG_MAX)
			break;

		/*
		 * An entry followed by another valid entry is sized as the
		 * difference between the two index values; the final valid
		 * entry is sized as its own index value.
		 */
		if (idx + 1 != max_stack_trace.nr_entries &&
		    stack_dump_trace[idx + 1] != ULONG_MAX)
			size = stack_dump_index[idx] - stack_dump_index[idx + 1];
		else
			size = stack_dump_index[idx];

		pr_emerg("%3ld) %8d %5d %pS\n", idx, stack_dump_index[idx],
			 size, (void *)stack_dump_trace[idx]);
	}
}
static inline void
check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags;
unsigned long *p, *top, *start;
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
int frame_size = ACCESS_ONCE(tracer_frame);
int i;
@@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)
max_stack_size = this_size;
max_stack_trace.nr_entries = 0;
max_stack_trace.skip = 3;
max_stack_trace.nr_entries = 0;
if (using_ftrace_ops_list_func())
max_stack_trace.skip = 4;
else
max_stack_trace.skip = 3;
save_stack_trace(&max_stack_trace);
@@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
BUG_ON(current != &init_task &&
*(end_of_stack(current)) != STACK_END_MAGIC);
if ((current != &init_task &&
*(end_of_stack(current)) != STACK_END_MAGIC)) {
print_max_stack();
BUG();
}
out:
arch_spin_unlock(&max_stack_lock);
local_irq_restore(flags);

View File

@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
* Uprobes-specific fetch functions
*/
#define DEFINE_FETCH_stack(type) \
static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
void *offset, void *dest) \
static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
void *offset, void *dest) \
{ \
*(type *)dest = (type)get_user_stack_nth(regs, \
((unsigned long)offset)); \
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack)
#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
void *addr, void *dest) \
static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
void *addr, void *dest) \
{ \
type retval; \
void __user *vaddr = (void __force __user *) addr; \
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
{
long ret;
u32 rloc = *(u32 *)dest;
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
}
}
static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
void *addr, void *dest)
static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
void *addr, void *dest)
{
int len;
void __user *vaddr = (void __force __user *) addr;
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset)
}
#define DEFINE_FETCH_file_offset(type) \
static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\
void *offset, void *dest) \
static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
void *offset, void *dest)\
{ \
void *vaddr = (void *)translate_user_vaddr(offset); \
\
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
int ret;
if (file) {
if (tu->tp.flags & TP_FLAG_PROFILE)
return -EINTR;
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link)
return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
list_add_tail_rcu(&link->list, &tu->tp.files);
tu->tp.flags |= TP_FLAG_TRACE;
} else
tu->tp.flags |= TP_FLAG_PROFILE;
} else {
if (tu->tp.flags & TP_FLAG_TRACE)
return -EINTR;
ret = uprobe_buffer_enable();
if (ret < 0)
return ret;
tu->tp.flags |= TP_FLAG_PROFILE;
}
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
if (enabled)
return 0;
ret = uprobe_buffer_enable();
if (ret)
goto err_flags;
tu->consumer.filter = filter;
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret) {
if (file) {
list_del(&link->list);
kfree(link);
tu->tp.flags &= ~TP_FLAG_TRACE;
} else
tu->tp.flags &= ~TP_FLAG_PROFILE;
}
if (ret)
goto err_buffer;
return 0;
err_buffer:
uprobe_buffer_disable();
err_flags:
if (file) {
list_del(&link->list);
kfree(link);
tu->tp.flags &= ~TP_FLAG_TRACE;
} else {
tu->tp.flags &= ~TP_FLAG_PROFILE;
}
return ret;
}
@@ -1009,9 +1023,32 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
}
/*
 * Undo the filter bookkeeping for @event on @tu.
 *
 * Under tu->filter.rwlock (write side): an event with a tp_target is
 * unlinked from its tp_list, an event without one decrements
 * filter.nr_systemwide. If nothing indicates the current state should be
 * kept, uprobe_apply() is called with 'false' and its result is returned;
 * otherwise 0 is returned.
 */
static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
{
	bool keep;	/* true: skip the uprobe_apply() call below */

	write_lock(&tu->filter.rwlock);
	if (!event->hw.tp_target) {
		/* No target task; presumably a system-wide event. */
		tu->filter.nr_systemwide--;
		keep = tu->filter.nr_systemwide;
	} else {
		list_del(&event->hw.tp_list);
		keep = tu->filter.nr_systemwide ||
		       (event->hw.tp_target->flags & PF_EXITING) ||
		       uprobe_filter_event(tu, event);
	}
	write_unlock(&tu->filter.rwlock);

	if (keep)
		return 0;

	return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
}
static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
int err;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
@@ -1033,32 +1070,13 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
}
write_unlock(&tu->filter.rwlock);
if (!done)
uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
return 0;
}
static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
list_del(&event->hw.tp_list);
done = tu->filter.nr_systemwide ||
(event->hw.tp_target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
} else {
tu->filter.nr_systemwide--;
done = tu->filter.nr_systemwide;
err = 0;
if (!done) {
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
if (err)
uprobe_perf_close(tu, event);
}
write_unlock(&tu->filter.rwlock);
if (!done)
uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
return 0;
return err;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
@@ -1197,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
current->utask->vaddr = (unsigned long) &udd;
#ifdef CONFIG_PERF_EVENTS
if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
#endif
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;

View File

@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount;
void syscall_regfunc(void)
{
unsigned long flags;
struct task_struct *g, *t;
struct task_struct *p, *t;
if (!sys_tracepoint_refcount) {
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, t) {
/* Skip kernel threads. */
if (t->mm)
set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
read_unlock_irqrestore(&tasklist_lock, flags);
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
}
read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
}
void syscall_unregfunc(void)
{
unsigned long flags;
struct task_struct *g, *t;
struct task_struct *p, *t;
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, t) {
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
} while_each_thread(g, t);
read_unlock_irqrestore(&tasklist_lock, flags);
}
read_unlock(&tasklist_lock);
}
}
#endif

View File

@@ -31,6 +31,12 @@
int watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#endif
static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn;
/* boot commands */
/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
}
__setup("nosoftlockup", nosoftlockup_setup);
/* */
#ifdef CONFIG_SMP
/*
 * Handler for the "softlockup_all_cpu_backtrace=" boot parameter: the
 * argument is parsed as an integer (base auto-detected) and stored as 0/1.
 */
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
	long parsed = simple_strtol(str, NULL, 0);

	sysctl_softlockup_all_cpu_backtrace = parsed ? 1 : 0;
	return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
#endif
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
* engaged in dumping cpu back traces
*/
if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
/* Someone else will report us. Let's give up */
__this_cpu_write(soft_watchdog_warn, true);
return HRTIMER_RESTART;
}
}
printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
if (softlockup_all_cpu_backtrace) {
/* Avoid generating two back traces for current
* given that one is already made above
*/
trigger_allbutself_cpu_backtrace();
clear_bit(0, &soft_lockup_nmi_warn);
/* Barrier to sync with other cpus */
smp_mb__after_atomic();
}
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
int cpu;
get_online_cpus();
preempt_disable();
for_each_online_cpu(cpu)
update_timers(cpu);
preempt_enable();
put_online_cpus();
}

View File

@@ -65,15 +65,12 @@ enum {
* be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED should be flipped only while holding
* manager_mutex to avoid changing binding state while
* create_worker() is in progress.
* attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
POOL_FREEZING = 1 << 3, /* freeze in progress */
/* worker flags */
WORKER_STARTED = 1 << 0, /* started */
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
WORKER_PREP = 1 << 3, /* preparing to run works */
@@ -124,8 +121,7 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
* MG: pool->manager_mutex and pool->lock protected. Writes require both
* locks. Reads can happen under either lock.
* A: pool->attach_mutex protected.
*
* PL: wq_pool_mutex protected.
*
@@ -163,8 +159,11 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
struct mutex manager_mutex; /* manager exclusion */
struct idr worker_idr; /* MG: worker IDs and iteration */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
@@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
lockdep_is_held(&wq->mutex), \
"sched RCU or wq->mutex should be held")
#ifdef CONFIG_LOCKDEP
#define assert_manager_or_pool_lock(pool) \
WARN_ONCE(debug_locks && \
!lockdep_is_held(&(pool)->manager_mutex) && \
!lockdep_is_held(&(pool)->lock), \
"pool->manager_mutex or ->lock should be held")
#else
#define assert_manager_or_pool_lock(pool) do { } while (0)
#endif
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
/**
* for_each_pool_worker - iterate through all workers of a worker_pool
* @worker: iteration cursor
* @wi: integer used for iteration
* @pool: worker_pool to iterate workers of
*
* This must be called with either @pool->manager_mutex or ->lock held.
* This must be called with @pool->attach_mutex.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
#define for_each_pool_worker(worker, wi, pool) \
idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
#define for_each_pool_worker(worker, pool) \
list_for_each_entry((worker), &(pool)->workers, node) \
if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
else
/**
@@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
return need_more_worker(pool) && !may_start_working(pool);
}
/* Do I need to be the manager? */
static bool need_to_manage_workers(struct worker_pool *pool)
{
return need_to_create_worker(pool) ||
(pool->flags & POOL_MANAGE_WORKERS);
}
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
@@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)
* Wake up functions.
*/
/* Return the first worker. Safe with preemption disabled */
static struct worker *first_worker(struct worker_pool *pool)
/* Return the first idle worker. Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
if (unlikely(list_empty(&pool->idle_list)))
return NULL;
@@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)
*/
static void wake_up_worker(struct worker_pool *pool)
{
struct worker *worker = first_worker(pool);
struct worker *worker = first_idle_worker(pool);
if (likely(worker))
wake_up_process(worker->task);
@@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
*/
if (atomic_dec_and_test(&pool->nr_running) &&
!list_empty(&pool->worklist))
to_wakeup = first_worker(pool);
to_wakeup = first_idle_worker(pool);
return to_wakeup ? to_wakeup->task : NULL;
}
@@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
}
/**
* worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
* @pool: target worker_pool
*
* Bind %current to the cpu of @pool if it is associated and lock @pool.
*
* Works which are scheduled while the cpu is online must at least be
* scheduled to a worker which is bound to the cpu so that if they are
* flushed from cpu callbacks while cpu is going down, they are
* guaranteed to execute on the cpu.
*
* This function is to be used by unbound workers and rescuers to bind
* themselves to the target cpu and may race with cpu going down or
* coming online. kthread_bind() can't be used because it may put the
* worker on an already dead cpu and set_cpus_allowed_ptr() can't be used
* verbatim as it's best effort and blocking and pool may be
* [dis]associated in the meantime.
*
* This function tries set_cpus_allowed_ptr(), locks the pool and verifies
* the binding against %POOL_DISASSOCIATED which is set during
* %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
* enters idle state or fetches works without dropping lock, it can
* guarantee the scheduling requirement described in the first paragraph.
*
* CONTEXT:
* Might sleep. Called without any lock but returns with pool->lock
* held, on both the %true and the %false return path.
*
* Return:
* %true if the associated pool is online (%current is successfully
* bound), %false if offline.
*/
static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
__acquires(&pool->lock)
{
while (true) {
/*
* The following call may fail, succeed or succeed
* without actually migrating the task to the cpu if
* it races with cpu hotunplug operation. Verify
* against POOL_DISASSOCIATED.
*/
if (!(pool->flags & POOL_DISASSOCIATED))
set_cpus_allowed_ptr(current, pool->attrs->cpumask);
spin_lock_irq(&pool->lock);
/* re-check under pool->lock; the lock stays held on return */
if (pool->flags & POOL_DISASSOCIATED)
return false;
/* bound only if we both run on pool->cpu and carry the pool's cpumask */
if (task_cpu(current) == pool->cpu &&
cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
return true;
/* not bound yet: drop the lock before retrying the migration */
spin_unlock_irq(&pool->lock);
/*
* We've raced with CPU hot[un]plug. Give it a breather
* and retry migration. cond_resched() is required here;
* otherwise, we might deadlock against cpu_stop trying to
* bring down the CPU on non-preemptive kernel.
*/
cpu_relax();
cond_resched();
}
}
static struct worker *alloc_worker(void)
{
struct worker *worker;
@@ -1693,19 +1610,76 @@ static struct worker *alloc_worker(void)
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
INIT_LIST_HEAD(&worker->node);
/* on creation a worker is in !idle && prep state */
worker->flags = WORKER_PREP;
}
return worker;
}
/**
* worker_attach_to_pool() - attach a worker to a pool
* @worker: worker to be attached
* @pool: the target pool
*
* Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
* cpu-binding of @worker are kept coordinated with the pool across
* cpu-[un]hotplugs.
*
* The whole operation runs under @pool->attach_mutex: the pool's cpumask
* is applied to @worker->task, %WORKER_UNBOUND is set if the pool is
* currently %POOL_DISASSOCIATED, and @worker is appended to
* @pool->workers.
*
* CONTEXT:
* Takes and releases @pool->attach_mutex.
*/
static void worker_attach_to_pool(struct worker *worker,
struct worker_pool *pool)
{
mutex_lock(&pool->attach_mutex);
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
* online CPUs. It'll be re-applied when any of the CPUs come up.
* The return value is therefore not checked here.
*/
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
/*
* The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
* stable across this function. See the comments above the
* flag definition for details.
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
/* link @worker into the list anchored at pool->workers */
list_add_tail(&worker->node, &pool->workers);
mutex_unlock(&pool->attach_mutex);
}
/**
 * worker_detach_from_pool() - detach a worker from its pool
 * @worker: worker which is attached to its pool
 * @pool: the pool @worker is attached to
 *
 * Reverse of worker_attach_to_pool(): unlink @worker from @pool->workers
 * while holding @pool->attach_mutex.  If that leaves the list empty and
 * @pool->detach_completion is set, the completion is signalled after the
 * mutex has been dropped.  After detaching, the calling worker must not
 * access @pool unless it holds some other reference to it.
 */
static void worker_detach_from_pool(struct worker *worker,
				    struct worker_pool *pool)
{
	struct completion *to_signal;

	mutex_lock(&pool->attach_mutex);
	list_del(&worker->node);
	/* sample the completion pointer while attach_mutex is still held */
	to_signal = list_empty(&pool->workers) ? pool->detach_completion : NULL;
	mutex_unlock(&pool->attach_mutex);

	if (!to_signal)
		return;

	complete(to_signal);
}
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
* Create a new worker which is bound to @pool. The returned worker
* can be started by calling start_worker() or destroyed using
* destroy_worker().
* Create a new worker which is attached to @pool. The new worker must be
* started by start_worker().
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
@@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)
int id = -1;
char id_buf[16];
lockdep_assert_held(&pool->manager_mutex);
/*
* ID is needed to determine kthread name. Allocate ID first
* without installing the pointer.
*/
idr_preload(GFP_KERNEL);
spin_lock_irq(&pool->lock);
id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
spin_unlock_irq(&pool->lock);
idr_preload_end();
/* ID is needed to determine kthread name */
id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
if (id < 0)
goto fail;
@@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool)
/* prevent userland from meddling with cpumask of workqueue workers */
worker->task->flags |= PF_NO_SETAFFINITY;
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
* online CPUs. It'll be re-applied when any of the CPUs come up.
*/
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
/*
* The caller is responsible for ensuring %POOL_DISASSOCIATED
* remains stable across this function. See the comments above the
* flag definition for details.
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
/* successful, commit the pointer to idr */
spin_lock_irq(&pool->lock);
idr_replace(&pool->worker_idr, worker, worker->id);
spin_unlock_irq(&pool->lock);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
return worker;
fail:
if (id >= 0) {
spin_lock_irq(&pool->lock);
idr_remove(&pool->worker_idr, id);
spin_unlock_irq(&pool->lock);
}
if (id >= 0)
ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
@@ -1800,7 +1744,6 @@ fail:
*/
static void start_worker(struct worker *worker)
{
worker->flags |= WORKER_STARTED;
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
@@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)
{
struct worker *worker;
mutex_lock(&pool->manager_mutex);
worker = create_worker(pool);
if (worker) {
spin_lock_irq(&pool->lock);
@@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)
spin_unlock_irq(&pool->lock);
}
mutex_unlock(&pool->manager_mutex);
return worker ? 0 : -ENOMEM;
}
@@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
* Destroy @worker and adjust @pool stats accordingly.
* Destroy @worker and adjust @pool stats accordingly. The worker should
* be idle.
*
* CONTEXT:
* spin_lock_irq(pool->lock) which is released and regrabbed.
* spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
lockdep_assert_held(&pool->manager_mutex);
lockdep_assert_held(&pool->lock);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
WARN_ON(!list_empty(&worker->scheduled)))
WARN_ON(!list_empty(&worker->scheduled)) ||
WARN_ON(!(worker->flags & WORKER_IDLE)))
return;
if (worker->flags & WORKER_STARTED)
pool->nr_workers--;
if (worker->flags & WORKER_IDLE)
pool->nr_idle--;
/*
* Once WORKER_DIE is set, the kworker may destroy itself at any
* point. Pin to ensure the task stays until we're done with it.
*/
get_task_struct(worker->task);
pool->nr_workers--;
pool->nr_idle--;
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
idr_remove(&pool->worker_idr, worker->id);
spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
put_task_struct(worker->task);
kfree(worker);
spin_lock_irq(&pool->lock);
wake_up_process(worker->task);
}
static void idle_worker_timeout(unsigned long __pool)
@@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)
spin_lock_irq(&pool->lock);
if (too_many_workers(pool)) {
while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
@@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires))
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
else {
/* it's been idle for too long, wake up manager */
pool->flags |= POOL_MANAGE_WORKERS;
wake_up_worker(pool);
break;
}
destroy_worker(worker);
}
spin_unlock_irq(&pool->lock);
@@ -2016,44 +1938,6 @@ restart:
return true;
}
/**
 * maybe_destroy_workers - destroy workers which have been idle for a while
 * @pool: pool to destroy workers for
 *
 * While too_many_workers() holds for @pool, look at the worker at the tail
 * of @pool->idle_list.  If its last_active + IDLE_WORKER_TIMEOUT is still in
 * the future, re-arm @pool->idle_timer for that time and stop; otherwise
 * destroy that worker and look again.
 *
 * LOCKING:
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Called only from manager.
 *
 * Return:
 * %false if no action was taken and pool->lock stayed locked, %true
 * otherwise.
 */
static bool maybe_destroy_workers(struct worker_pool *pool)
{
	bool destroyed_any = false;

	for (;;) {
		struct worker *candidate;
		unsigned long deadline;

		if (!too_many_workers(pool))
			break;

		candidate = list_entry(pool->idle_list.prev, struct worker, entry);
		deadline = candidate->last_active + IDLE_WORKER_TIMEOUT;

		if (time_before(jiffies, deadline)) {
			/* not expired yet: check again once the deadline passes */
			mod_timer(&pool->idle_timer, deadline);
			break;
		}

		destroy_worker(candidate);
		destroyed_any = true;
	}

	return destroyed_any;
}
/**
* manage_workers - manage worker pool
* @worker: self
@@ -2083,8 +1967,6 @@ static bool manage_workers(struct worker *worker)
bool ret = false;
/*
* Managership is governed by two mutexes - manager_arb and
* manager_mutex. manager_arb handles arbitration of manager role.
* Anyone who successfully grabs manager_arb wins the arbitration
* and becomes the manager. mutex_trylock() on pool->manager_arb
* failure while holding pool->lock reliably indicates that someone
@@ -2093,40 +1975,12 @@ static bool manage_workers(struct worker *worker)
* grabbing manager_arb is responsible for actually performing
* manager duties. If manager_arb is grabbed and released without
* actual management, the pool may stall indefinitely.
*
* manager_mutex is used for exclusion of actual management
* operations. The holder of manager_mutex can be sure that none
* of management operations, including creation and destruction of
* workers, won't take place until the mutex is released. Because
* manager_mutex doesn't interfere with manager role arbitration,
* it is guaranteed that the pool's management, while may be
* delayed, won't be disturbed by someone else grabbing
* manager_mutex.
*/
if (!mutex_trylock(&pool->manager_arb))
return ret;
/*
* With manager arbitration won, manager_mutex would be free in
* most cases. trylock first without dropping @pool->lock.
*/
if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
spin_unlock_irq(&pool->lock);
mutex_lock(&pool->manager_mutex);
spin_lock_irq(&pool->lock);
ret = true;
}
pool->flags &= ~POOL_MANAGE_WORKERS;
/*
* Destroy and then create so that may_start_working() is true
* on return.
*/
ret |= maybe_destroy_workers(pool);
ret |= maybe_create_worker(pool);
mutex_unlock(&pool->manager_mutex);
mutex_unlock(&pool->manager_arb);
return ret;
}
@@ -2314,6 +2168,11 @@ woke_up:
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker, pool);
kfree(worker);
return 0;
}
@@ -2361,9 +2220,6 @@ recheck:
worker_set_flags(worker, WORKER_PREP, false);
sleep:
if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
goto recheck;
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
@@ -2440,8 +2296,9 @@ repeat:
spin_unlock_irq(&wq_mayday_lock);
/* migrate to the target cpu if possible */
worker_maybe_bind_and_lock(pool);
worker_attach_to_pool(rescuer, pool);
spin_lock_irq(&pool->lock);
rescuer->pool = pool;
/*
@@ -2454,6 +2311,11 @@ repeat:
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
spin_unlock_irq(&pool->lock);
worker_detach_from_pool(rescuer, pool);
spin_lock_irq(&pool->lock);
/*
* Put the reference grabbed by send_mayday(). @pool won't
@@ -3422,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
}
}
dev_set_uevent_suppress(&wq_dev->dev, false);
kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
return 0;
}
@@ -3550,9 +3413,10 @@ static int init_worker_pool(struct worker_pool *pool)
(unsigned long)pool);
mutex_init(&pool->manager_arb);
mutex_init(&pool->manager_mutex);
idr_init(&pool->worker_idr);
mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
@@ -3567,7 +3431,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
idr_destroy(&pool->worker_idr);
ida_destroy(&pool->worker_ida);
free_workqueue_attrs(pool->attrs);
kfree(pool);
}
@@ -3585,6 +3449,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
*/
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
lockdep_assert_held(&wq_pool_mutex);
@@ -3605,18 +3470,24 @@ static void put_unbound_pool(struct worker_pool *pool)
/*
* Become the manager and destroy all workers. Grabbing
* manager_arb prevents @pool's workers from blocking on
* manager_mutex.
* attach_mutex.
*/
mutex_lock(&pool->manager_arb);
mutex_lock(&pool->manager_mutex);
spin_lock_irq(&pool->lock);
while ((worker = first_worker(pool)))
spin_lock_irq(&pool->lock);
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
spin_unlock_irq(&pool->lock);
mutex_unlock(&pool->manager_mutex);
mutex_lock(&pool->attach_mutex);
if (!list_empty(&pool->workers))
pool->detach_completion = &detach_completion;
mutex_unlock(&pool->attach_mutex);
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
mutex_unlock(&pool->manager_arb);
/* shut down the timers */
@@ -3662,9 +3533,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
if (!pool || init_worker_pool(pool) < 0)
goto fail;
if (workqueue_freezing)
pool->flags |= POOL_FREEZING;
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
@@ -3771,7 +3639,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
spin_lock_irq(&pwq->pool->lock);
if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
/*
* During [un]freezing, the caller is responsible for ensuring that
* this function is called at least once after @workqueue_freezing
* is updated and visible.
*/
if (!freezable || !workqueue_freezing) {
pwq->max_active = wq->saved_max_active;
while (!list_empty(&pwq->delayed_works) &&
@@ -4103,17 +3976,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
* Let's determine what needs to be done. If the target cpumask is
* different from wq's, we need to compare it to @pwq's and create
* a new one if they don't match. If the target cpumask equals
* wq's, the default pwq should be used. If @pwq is already the
* default one, nothing to do; otherwise, install the default one.
* wq's, the default pwq should be used.
*/
if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
goto out_unlock;
} else {
if (pwq == wq->dfl_pwq)
goto out_unlock;
else
goto use_dfl_pwq;
goto use_dfl_pwq;
}
mutex_unlock(&wq->mutex);
@@ -4121,8 +3990,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
mutex_lock(&wq->mutex);
goto use_dfl_pwq;
}
@@ -4599,28 +4468,27 @@ static void wq_unbind_fn(struct work_struct *work)
int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
int wi;
for_each_cpu_worker_pool(pool, cpu) {
WARN_ON_ONCE(cpu != smp_processor_id());
mutex_lock(&pool->manager_mutex);
mutex_lock(&pool->attach_mutex);
spin_lock_irq(&pool->lock);
/*
* We've blocked all manager operations. Make all workers
* We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
* except for the ones which are still executing works from
* before the last CPU down must be on the cpu. After
* this, they may become diasporas.
*/
for_each_pool_worker(worker, wi, pool)
for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
spin_unlock_irq(&pool->lock);
mutex_unlock(&pool->manager_mutex);
mutex_unlock(&pool->attach_mutex);
/*
* Call schedule() so that we cross rq->lock and thus can
@@ -4660,9 +4528,8 @@ static void wq_unbind_fn(struct work_struct *work)
static void rebind_workers(struct worker_pool *pool)
{
struct worker *worker;
int wi;
lockdep_assert_held(&pool->manager_mutex);
lockdep_assert_held(&pool->attach_mutex);
/*
* Restore CPU affinity of all workers. As all idle workers should
@@ -4671,13 +4538,13 @@ static void rebind_workers(struct worker_pool *pool)
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
for_each_pool_worker(worker, wi, pool)
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
for_each_pool_worker(worker, wi, pool) {
for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags;
/*
@@ -4729,9 +4596,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
static cpumask_t cpumask;
struct worker *worker;
int wi;
lockdep_assert_held(&pool->manager_mutex);
lockdep_assert_held(&pool->attach_mutex);
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4743,7 +4609,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
return;
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, wi, pool)
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
}
@@ -4776,7 +4642,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
mutex_lock(&wq_pool_mutex);
for_each_pool(pool, pi) {
mutex_lock(&pool->manager_mutex);
mutex_lock(&pool->attach_mutex);
if (pool->cpu == cpu) {
spin_lock_irq(&pool->lock);
@@ -4788,7 +4654,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
restore_unbound_workers_cpumask(pool, cpu);
}
mutex_unlock(&pool->manager_mutex);
mutex_unlock(&pool->attach_mutex);
}
/* update NUMA affinity of unbound workqueues */
@@ -4887,24 +4753,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
*/
void freeze_workqueues_begin(void)
{
struct worker_pool *pool;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
int pi;
mutex_lock(&wq_pool_mutex);
WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
/* set FREEZING */
for_each_pool(pool, pi) {
spin_lock_irq(&pool->lock);
WARN_ON_ONCE(pool->flags & POOL_FREEZING);
pool->flags |= POOL_FREEZING;
spin_unlock_irq(&pool->lock);
}
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq)
@@ -4974,21 +4830,13 @@ void thaw_workqueues(void)
{
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
struct worker_pool *pool;
int pi;
mutex_lock(&wq_pool_mutex);
if (!workqueue_freezing)
goto out_unlock;
/* clear FREEZING */
for_each_pool(pool, pi) {
spin_lock_irq(&pool->lock);
WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
pool->flags &= ~POOL_FREEZING;
spin_unlock_irq(&pool->lock);
}
workqueue_freezing = false;
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
@@ -4998,7 +4846,6 @@ void thaw_workqueues(void)
mutex_unlock(&wq->mutex);
}
workqueue_freezing = false;
out_unlock:
mutex_unlock(&wq_pool_mutex);
}
@@ -5033,7 +4880,7 @@ static void __init wq_numa_init(void)
BUG_ON(!tbl);
for_each_node(node)
BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE));
for_each_possible_cpu(cpu) {

View File

@@ -37,6 +37,8 @@ struct worker {
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */