Merge branch 'tracing/urgent' into tracing/core
Merge reason: Pick up latest fixes and update to latest upstream. Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
|
||||
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
|
||||
obj-$(CONFIG_COMPAT) += compat.o
|
||||
obj-$(CONFIG_CGROUPS) += cgroup.o
|
||||
obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
|
||||
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
|
||||
obj-$(CONFIG_CPUSETS) += cpuset.o
|
||||
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
|
||||
@@ -87,7 +86,6 @@ obj-$(CONFIG_RELAY) += relay.o
|
||||
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
|
||||
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
|
||||
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
|
||||
obj-$(CONFIG_MARKERS) += marker.o
|
||||
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
|
||||
obj-$(CONFIG_LATENCYTOP) += latencytop.o
|
||||
obj-$(CONFIG_FUNCTION_TRACER) += trace/
|
||||
@@ -96,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
|
||||
obj-$(CONFIG_RING_BUFFER) += trace/
|
||||
obj-$(CONFIG_SMP) += sched_cpupri.o
|
||||
obj-$(CONFIG_SLOW_WORK) += slow-work.o
|
||||
obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
|
||||
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
|
||||
|
||||
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
|
||||
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
|
||||
|
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
|
||||
break;
|
||||
}
|
||||
case AUDIT_SIGNAL_INFO:
|
||||
err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
|
||||
if (err)
|
||||
return err;
|
||||
len = 0;
|
||||
if (audit_sig_sid) {
|
||||
err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
|
||||
if (!sig_data) {
|
||||
security_release_secctx(ctx, len);
|
||||
if (audit_sig_sid)
|
||||
security_release_secctx(ctx, len);
|
||||
return -ENOMEM;
|
||||
}
|
||||
sig_data->uid = audit_sig_uid;
|
||||
sig_data->pid = audit_sig_pid;
|
||||
memcpy(sig_data->ctx, ctx, len);
|
||||
security_release_secctx(ctx, len);
|
||||
if (audit_sig_sid) {
|
||||
memcpy(sig_data->ctx, ctx, len);
|
||||
security_release_secctx(ctx, len);
|
||||
}
|
||||
audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
|
||||
0, 0, sig_data, sizeof(*sig_data) + len);
|
||||
kfree(sig_data);
|
||||
|
@@ -45,8 +45,8 @@
|
||||
|
||||
struct audit_watch {
|
||||
atomic_t count; /* reference count */
|
||||
char *path; /* insertion path */
|
||||
dev_t dev; /* associated superblock device */
|
||||
char *path; /* insertion path */
|
||||
unsigned long ino; /* associated inode number */
|
||||
struct audit_parent *parent; /* associated parent */
|
||||
struct list_head wlist; /* entry in parent->watches list */
|
||||
|
@@ -168,12 +168,12 @@ struct audit_context {
|
||||
int in_syscall; /* 1 if task is in a syscall */
|
||||
enum audit_state state, current_state;
|
||||
unsigned int serial; /* serial number for record */
|
||||
struct timespec ctime; /* time of syscall entry */
|
||||
int major; /* syscall number */
|
||||
struct timespec ctime; /* time of syscall entry */
|
||||
unsigned long argv[4]; /* syscall arguments */
|
||||
int return_valid; /* return code is valid */
|
||||
long return_code;/* syscall return code */
|
||||
u64 prio;
|
||||
int return_valid; /* return code is valid */
|
||||
int name_count;
|
||||
struct audit_names names[AUDIT_NAMES];
|
||||
char * filterkey; /* key for rule that triggered record */
|
||||
@@ -198,8 +198,8 @@ struct audit_context {
|
||||
char target_comm[TASK_COMM_LEN];
|
||||
|
||||
struct audit_tree_refs *trees, *first_trees;
|
||||
int tree_count;
|
||||
struct list_head killed_trees;
|
||||
int tree_count;
|
||||
|
||||
int type;
|
||||
union {
|
||||
|
1115
kernel/cgroup.c
1115
kernel/cgroup.c
File diff suppressed because it is too large
Load Diff
@@ -1,105 +0,0 @@
|
||||
/*
|
||||
* kernel/cgroup_debug.c - Example cgroup subsystem that
|
||||
* exposes debug info
|
||||
*
|
||||
* Copyright (C) Google Inc, 2007
|
||||
*
|
||||
* Developed by Paul Menage (menage@google.com)
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/rcupdate.h>
|
||||
|
||||
#include <asm/atomic.h>
|
||||
|
||||
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
|
||||
struct cgroup *cont)
|
||||
{
|
||||
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
|
||||
|
||||
if (!css)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
return css;
|
||||
}
|
||||
|
||||
static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||
{
|
||||
kfree(cont->subsys[debug_subsys_id]);
|
||||
}
|
||||
|
||||
static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
|
||||
{
|
||||
return atomic_read(&cont->count);
|
||||
}
|
||||
|
||||
static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
|
||||
{
|
||||
u64 count;
|
||||
|
||||
count = cgroup_task_count(cont);
|
||||
return count;
|
||||
}
|
||||
|
||||
static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
|
||||
{
|
||||
return (u64)(long)current->cgroups;
|
||||
}
|
||||
|
||||
static u64 current_css_set_refcount_read(struct cgroup *cont,
|
||||
struct cftype *cft)
|
||||
{
|
||||
u64 count;
|
||||
|
||||
rcu_read_lock();
|
||||
count = atomic_read(¤t->cgroups->refcount);
|
||||
rcu_read_unlock();
|
||||
return count;
|
||||
}
|
||||
|
||||
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
|
||||
{
|
||||
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
|
||||
}
|
||||
|
||||
static struct cftype files[] = {
|
||||
{
|
||||
.name = "cgroup_refcount",
|
||||
.read_u64 = cgroup_refcount_read,
|
||||
},
|
||||
{
|
||||
.name = "taskcount",
|
||||
.read_u64 = taskcount_read,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "current_css_set",
|
||||
.read_u64 = current_css_set_read,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "current_css_set_refcount",
|
||||
.read_u64 = current_css_set_refcount_read,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "releasable",
|
||||
.read_u64 = releasable_read,
|
||||
},
|
||||
};
|
||||
|
||||
static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||
{
|
||||
return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
|
||||
}
|
||||
|
||||
struct cgroup_subsys debug_subsys = {
|
||||
.name = "debug",
|
||||
.create = debug_create,
|
||||
.destroy = debug_destroy,
|
||||
.populate = debug_populate,
|
||||
.subsys_id = debug_subsys_id,
|
||||
};
|
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
|
||||
*/
|
||||
static int freezer_can_attach(struct cgroup_subsys *ss,
|
||||
struct cgroup *new_cgroup,
|
||||
struct task_struct *task)
|
||||
struct task_struct *task, bool threadgroup)
|
||||
{
|
||||
struct freezer *freezer;
|
||||
|
||||
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
|
||||
if (freezer->state == CGROUP_FROZEN)
|
||||
return -EBUSY;
|
||||
|
||||
if (threadgroup) {
|
||||
struct task_struct *c;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
|
||||
if (is_task_frozen_enough(c)) {
|
||||
rcu_read_unlock();
|
||||
return -EBUSY;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
|
||||
static cpumask_var_t cpus_attach;
|
||||
|
||||
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
|
||||
static int cpuset_can_attach(struct cgroup_subsys *ss,
|
||||
struct cgroup *cont, struct task_struct *tsk)
|
||||
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
|
||||
struct task_struct *tsk, bool threadgroup)
|
||||
{
|
||||
int ret;
|
||||
struct cpuset *cs = cgroup_cs(cont);
|
||||
|
||||
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
|
||||
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
|
||||
if (tsk->flags & PF_THREAD_BOUND)
|
||||
return -EINVAL;
|
||||
|
||||
return security_task_setscheduler(tsk, 0, NULL);
|
||||
ret = security_task_setscheduler(tsk, 0, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (threadgroup) {
|
||||
struct task_struct *c;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
|
||||
ret = security_task_setscheduler(c, 0, NULL);
|
||||
if (ret) {
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cpuset_attach(struct cgroup_subsys *ss,
|
||||
struct cgroup *cont, struct cgroup *oldcont,
|
||||
struct task_struct *tsk)
|
||||
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
|
||||
struct cpuset *cs)
|
||||
{
|
||||
int err;
|
||||
/*
|
||||
* can_attach beforehand should guarantee that this doesn't fail.
|
||||
* TODO: have a better way to handle failure here
|
||||
*/
|
||||
err = set_cpus_allowed_ptr(tsk, cpus_attach);
|
||||
WARN_ON_ONCE(err);
|
||||
|
||||
task_lock(tsk);
|
||||
cpuset_change_task_nodemask(tsk, to);
|
||||
task_unlock(tsk);
|
||||
cpuset_update_task_spread_flag(cs, tsk);
|
||||
|
||||
}
|
||||
|
||||
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
|
||||
struct cgroup *oldcont, struct task_struct *tsk,
|
||||
bool threadgroup)
|
||||
{
|
||||
nodemask_t from, to;
|
||||
struct mm_struct *mm;
|
||||
struct cpuset *cs = cgroup_cs(cont);
|
||||
struct cpuset *oldcs = cgroup_cs(oldcont);
|
||||
int err;
|
||||
|
||||
if (cs == &top_cpuset) {
|
||||
cpumask_copy(cpus_attach, cpu_possible_mask);
|
||||
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
|
||||
guarantee_online_cpus(cs, cpus_attach);
|
||||
guarantee_online_mems(cs, &to);
|
||||
}
|
||||
err = set_cpus_allowed_ptr(tsk, cpus_attach);
|
||||
if (err)
|
||||
return;
|
||||
|
||||
task_lock(tsk);
|
||||
cpuset_change_task_nodemask(tsk, &to);
|
||||
task_unlock(tsk);
|
||||
cpuset_update_task_spread_flag(cs, tsk);
|
||||
/* do per-task migration stuff possibly for each in the threadgroup */
|
||||
cpuset_attach_task(tsk, &to, cs);
|
||||
if (threadgroup) {
|
||||
struct task_struct *c;
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
|
||||
cpuset_attach_task(c, &to, cs);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* change mm; only needs to be done once even if threadgroup */
|
||||
from = oldcs->mems_allowed;
|
||||
to = cs->mems_allowed;
|
||||
mm = get_task_mm(tsk);
|
||||
|
@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
|
||||
|
||||
#ifdef CONFIG_DEBUG_CREDENTIALS
|
||||
|
||||
bool creds_are_invalid(const struct cred *cred)
|
||||
{
|
||||
if (cred->magic != CRED_MAGIC)
|
||||
return true;
|
||||
if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
|
||||
return true;
|
||||
#ifdef CONFIG_SECURITY_SELINUX
|
||||
if (selinux_is_enabled()) {
|
||||
if ((unsigned long) cred->security < PAGE_SIZE)
|
||||
return true;
|
||||
if ((*(u32 *)cred->security & 0xffffff00) ==
|
||||
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL(creds_are_invalid);
|
||||
|
||||
/*
|
||||
* dump invalid credentials
|
||||
*/
|
||||
|
@@ -15,6 +15,7 @@
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/taskstats.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/delayacct.h>
|
||||
|
164
kernel/exit.c
164
kernel/exit.c
@@ -47,7 +47,7 @@
|
||||
#include <linux/tracehook.h>
|
||||
#include <linux/fs_struct.h>
|
||||
#include <linux/init_task.h>
|
||||
#include <linux/perf_counter.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <trace/events/sched.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
|
||||
{
|
||||
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
|
||||
|
||||
#ifdef CONFIG_PERF_COUNTERS
|
||||
WARN_ON_ONCE(tsk->perf_counter_ctxp);
|
||||
#ifdef CONFIG_PERF_EVENTS
|
||||
WARN_ON_ONCE(tsk->perf_event_ctxp);
|
||||
#endif
|
||||
trace_sched_process_free(tsk);
|
||||
put_task_struct(tsk);
|
||||
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
|
||||
{
|
||||
struct task_struct *curr = current->group_leader;
|
||||
|
||||
if (task_session(curr) != pid)
|
||||
if (task_session(curr) != pid) {
|
||||
change_pid(curr, PIDTYPE_SID, pid);
|
||||
proc_sid_connector(curr);
|
||||
}
|
||||
|
||||
if (task_pgrp(curr) != pid)
|
||||
change_pid(curr, PIDTYPE_PGID, pid);
|
||||
@@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code)
|
||||
if (group_dead) {
|
||||
hrtimer_cancel(&tsk->signal->real_timer);
|
||||
exit_itimers(tsk->signal);
|
||||
if (tsk->mm)
|
||||
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
|
||||
}
|
||||
acct_collect(code, group_dead);
|
||||
if (group_dead)
|
||||
@@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code)
|
||||
disassociate_ctty(1);
|
||||
|
||||
module_put(task_thread_info(tsk)->exec_domain->module);
|
||||
if (tsk->binfmt)
|
||||
module_put(tsk->binfmt->module);
|
||||
|
||||
proc_exit_connector(tsk);
|
||||
|
||||
@@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code)
|
||||
* Flush inherited counters to the parent - before the parent
|
||||
* gets woken up by child-exit notifications.
|
||||
*/
|
||||
perf_counter_exit_task(tsk);
|
||||
perf_event_exit_task(tsk);
|
||||
|
||||
exit_notify(tsk, group_dead);
|
||||
#ifdef CONFIG_NUMA
|
||||
@@ -1093,28 +1095,28 @@ struct wait_opts {
|
||||
int __user *wo_stat;
|
||||
struct rusage __user *wo_rusage;
|
||||
|
||||
wait_queue_t child_wait;
|
||||
int notask_error;
|
||||
};
|
||||
|
||||
static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
|
||||
static inline
|
||||
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
|
||||
{
|
||||
struct pid *pid = NULL;
|
||||
if (type == PIDTYPE_PID)
|
||||
pid = task->pids[type].pid;
|
||||
else if (type < PIDTYPE_MAX)
|
||||
pid = task->group_leader->pids[type].pid;
|
||||
return pid;
|
||||
if (type != PIDTYPE_PID)
|
||||
task = task->group_leader;
|
||||
return task->pids[type].pid;
|
||||
}
|
||||
|
||||
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
|
||||
{
|
||||
return wo->wo_type == PIDTYPE_MAX ||
|
||||
task_pid_type(p, wo->wo_type) == wo->wo_pid;
|
||||
}
|
||||
|
||||
static int eligible_child(struct wait_opts *wo, struct task_struct *p)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (wo->wo_type < PIDTYPE_MAX) {
|
||||
if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!eligible_pid(wo, p))
|
||||
return 0;
|
||||
/* Wait for all children (clone and not) if __WALL is set;
|
||||
* otherwise, wait for clone children *only* if __WCLONE is
|
||||
* set; otherwise, wait for non-clone children *only*. (Note:
|
||||
@@ -1124,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
|
||||
&& !(wo->wo_flags & __WALL))
|
||||
return 0;
|
||||
|
||||
err = security_task_wait(p);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1140,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
|
||||
|
||||
put_task_struct(p);
|
||||
infop = wo->wo_info;
|
||||
if (!retval)
|
||||
retval = put_user(SIGCHLD, &infop->si_signo);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_errno);
|
||||
if (!retval)
|
||||
retval = put_user((short)why, &infop->si_code);
|
||||
if (!retval)
|
||||
retval = put_user(pid, &infop->si_pid);
|
||||
if (!retval)
|
||||
retval = put_user(uid, &infop->si_uid);
|
||||
if (!retval)
|
||||
retval = put_user(status, &infop->si_status);
|
||||
if (infop) {
|
||||
if (!retval)
|
||||
retval = put_user(SIGCHLD, &infop->si_signo);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_errno);
|
||||
if (!retval)
|
||||
retval = put_user((short)why, &infop->si_code);
|
||||
if (!retval)
|
||||
retval = put_user(pid, &infop->si_pid);
|
||||
if (!retval)
|
||||
retval = put_user(uid, &infop->si_uid);
|
||||
if (!retval)
|
||||
retval = put_user(status, &infop->si_status);
|
||||
}
|
||||
if (!retval)
|
||||
retval = pid;
|
||||
return retval;
|
||||
@@ -1208,6 +1208,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
|
||||
if (likely(!traced) && likely(!task_detached(p))) {
|
||||
struct signal_struct *psig;
|
||||
struct signal_struct *sig;
|
||||
unsigned long maxrss;
|
||||
|
||||
/*
|
||||
* The resource counters for the group leader are in its
|
||||
@@ -1256,6 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
|
||||
psig->coublock +=
|
||||
task_io_get_oublock(p) +
|
||||
sig->oublock + sig->coublock;
|
||||
maxrss = max(sig->maxrss, sig->cmaxrss);
|
||||
if (psig->cmaxrss < maxrss)
|
||||
psig->cmaxrss = maxrss;
|
||||
task_io_accounting_add(&psig->ioac, &p->ioac);
|
||||
task_io_accounting_add(&psig->ioac, &sig->ioac);
|
||||
spin_unlock_irq(&p->real_parent->sighand->siglock);
|
||||
@@ -1477,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
|
||||
* then ->notask_error is 0 if @p is an eligible child,
|
||||
* or another error from security_task_wait(), or still -ECHILD.
|
||||
*/
|
||||
static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
|
||||
int ptrace, struct task_struct *p)
|
||||
static int wait_consider_task(struct wait_opts *wo, int ptrace,
|
||||
struct task_struct *p)
|
||||
{
|
||||
int ret = eligible_child(wo, p);
|
||||
if (!ret)
|
||||
return ret;
|
||||
|
||||
ret = security_task_wait(p);
|
||||
if (unlikely(ret < 0)) {
|
||||
/*
|
||||
* If we have not yet seen any eligible child,
|
||||
@@ -1545,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
|
||||
* Do not consider detached threads.
|
||||
*/
|
||||
if (!task_detached(p)) {
|
||||
int ret = wait_consider_task(wo, tsk, 0, p);
|
||||
int ret = wait_consider_task(wo, 0, p);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@@ -1559,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
|
||||
struct task_struct *p;
|
||||
|
||||
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
|
||||
int ret = wait_consider_task(wo, tsk, 1, p);
|
||||
int ret = wait_consider_task(wo, 1, p);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@@ -1567,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int child_wait_callback(wait_queue_t *wait, unsigned mode,
|
||||
int sync, void *key)
|
||||
{
|
||||
struct wait_opts *wo = container_of(wait, struct wait_opts,
|
||||
child_wait);
|
||||
struct task_struct *p = key;
|
||||
|
||||
if (!eligible_pid(wo, p))
|
||||
return 0;
|
||||
|
||||
if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
|
||||
return 0;
|
||||
|
||||
return default_wake_function(wait, mode, sync, key);
|
||||
}
|
||||
|
||||
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
|
||||
{
|
||||
__wake_up_sync_key(&parent->signal->wait_chldexit,
|
||||
TASK_INTERRUPTIBLE, 1, p);
|
||||
}
|
||||
|
||||
static long do_wait(struct wait_opts *wo)
|
||||
{
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
struct task_struct *tsk;
|
||||
int retval;
|
||||
|
||||
trace_sched_process_wait(wo->wo_pid);
|
||||
|
||||
add_wait_queue(¤t->signal->wait_chldexit,&wait);
|
||||
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
|
||||
wo->child_wait.private = current;
|
||||
add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
|
||||
repeat:
|
||||
/*
|
||||
* If there is nothing that can match our critiera just get out.
|
||||
@@ -1616,32 +1644,7 @@ notask:
|
||||
}
|
||||
end:
|
||||
__set_current_state(TASK_RUNNING);
|
||||
remove_wait_queue(¤t->signal->wait_chldexit,&wait);
|
||||
if (wo->wo_info) {
|
||||
struct siginfo __user *infop = wo->wo_info;
|
||||
|
||||
if (retval > 0)
|
||||
retval = 0;
|
||||
else {
|
||||
/*
|
||||
* For a WNOHANG return, clear out all the fields
|
||||
* we would set so the user can easily tell the
|
||||
* difference.
|
||||
*/
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_signo);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_errno);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_code);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_pid);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_uid);
|
||||
if (!retval)
|
||||
retval = put_user(0, &infop->si_status);
|
||||
}
|
||||
}
|
||||
remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
|
||||
return retval;
|
||||
}
|
||||
|
||||
@@ -1686,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
|
||||
wo.wo_stat = NULL;
|
||||
wo.wo_rusage = ru;
|
||||
ret = do_wait(&wo);
|
||||
|
||||
if (ret > 0) {
|
||||
ret = 0;
|
||||
} else if (infop) {
|
||||
/*
|
||||
* For a WNOHANG return, clear out all the fields
|
||||
* we would set so the user can easily tell the
|
||||
* difference.
|
||||
*/
|
||||
if (!ret)
|
||||
ret = put_user(0, &infop->si_signo);
|
||||
if (!ret)
|
||||
ret = put_user(0, &infop->si_errno);
|
||||
if (!ret)
|
||||
ret = put_user(0, &infop->si_code);
|
||||
if (!ret)
|
||||
ret = put_user(0, &infop->si_pid);
|
||||
if (!ret)
|
||||
ret = put_user(0, &infop->si_uid);
|
||||
if (!ret)
|
||||
ret = put_user(0, &infop->si_status);
|
||||
}
|
||||
|
||||
put_pid(pid);
|
||||
|
||||
/* avoid REGPARM breakage on x86: */
|
||||
|
@@ -49,6 +49,7 @@
|
||||
#include <linux/ftrace.h>
|
||||
#include <linux/profile.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/ksm.h>
|
||||
#include <linux/acct.h>
|
||||
#include <linux/tsacct_kern.h>
|
||||
#include <linux/cn_proc.h>
|
||||
@@ -61,7 +62,8 @@
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/fs_struct.h>
|
||||
#include <linux/magic.h>
|
||||
#include <linux/perf_counter.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <linux/posix-timers.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
|
||||
/* SLAB cache for mm_struct structures (tsk->mm) */
|
||||
static struct kmem_cache *mm_cachep;
|
||||
|
||||
static void account_kernel_stack(struct thread_info *ti, int account)
|
||||
{
|
||||
struct zone *zone = page_zone(virt_to_page(ti));
|
||||
|
||||
mod_zone_page_state(zone, NR_KERNEL_STACK, account);
|
||||
}
|
||||
|
||||
void free_task(struct task_struct *tsk)
|
||||
{
|
||||
prop_local_destroy_single(&tsk->dirties);
|
||||
account_kernel_stack(tsk->stack, -1);
|
||||
free_thread_info(tsk->stack);
|
||||
rt_mutex_debug_task_free(tsk);
|
||||
ftrace_graph_exit_task(tsk);
|
||||
@@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
|
||||
tsk->btrace_seq = 0;
|
||||
#endif
|
||||
tsk->splice_pipe = NULL;
|
||||
|
||||
account_kernel_stack(ti, 1);
|
||||
|
||||
return tsk;
|
||||
|
||||
out:
|
||||
@@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
rb_link = &mm->mm_rb.rb_node;
|
||||
rb_parent = NULL;
|
||||
pprev = &mm->mmap;
|
||||
retval = ksm_fork(mm, oldmm);
|
||||
if (retval)
|
||||
goto out;
|
||||
|
||||
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
|
||||
struct file *file;
|
||||
@@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
|
||||
|
||||
#include <linux/init_task.h>
|
||||
|
||||
static void mm_init_aio(struct mm_struct *mm)
|
||||
{
|
||||
#ifdef CONFIG_AIO
|
||||
spin_lock_init(&mm->ioctx_lock);
|
||||
INIT_HLIST_HEAD(&mm->ioctx_list);
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
|
||||
{
|
||||
atomic_set(&mm->mm_users, 1);
|
||||
atomic_set(&mm->mm_count, 1);
|
||||
init_rwsem(&mm->mmap_sem);
|
||||
INIT_LIST_HEAD(&mm->mmlist);
|
||||
mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
|
||||
mm->flags = (current->mm) ?
|
||||
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
|
||||
mm->core_state = NULL;
|
||||
mm->nr_ptes = 0;
|
||||
set_mm_counter(mm, file_rss, 0);
|
||||
set_mm_counter(mm, anon_rss, 0);
|
||||
spin_lock_init(&mm->page_table_lock);
|
||||
spin_lock_init(&mm->ioctx_lock);
|
||||
INIT_HLIST_HEAD(&mm->ioctx_list);
|
||||
mm->free_area_cache = TASK_UNMAPPED_BASE;
|
||||
mm->cached_hole_size = ~0UL;
|
||||
mm_init_aio(mm);
|
||||
mm_init_owner(mm, p);
|
||||
|
||||
if (likely(!mm_alloc_pgd(mm))) {
|
||||
@@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm)
|
||||
|
||||
if (atomic_dec_and_test(&mm->mm_users)) {
|
||||
exit_aio(mm);
|
||||
ksm_exit(mm);
|
||||
exit_mmap(mm);
|
||||
set_mm_exe_file(mm, NULL);
|
||||
if (!list_empty(&mm->mmlist)) {
|
||||
@@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm)
|
||||
spin_unlock(&mmlist_lock);
|
||||
}
|
||||
put_swap_token(mm);
|
||||
if (mm->binfmt)
|
||||
module_put(mm->binfmt->module);
|
||||
mmdrop(mm);
|
||||
}
|
||||
}
|
||||
@@ -618,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
|
||||
mm->hiwater_rss = get_mm_rss(mm);
|
||||
mm->hiwater_vm = mm->total_vm;
|
||||
|
||||
if (mm->binfmt && !try_module_get(mm->binfmt->module))
|
||||
goto free_pt;
|
||||
|
||||
return mm;
|
||||
|
||||
free_pt:
|
||||
/* don't put binfmt in mmput, we haven't got module yet */
|
||||
mm->binfmt = NULL;
|
||||
mmput(mm);
|
||||
|
||||
fail_nomem:
|
||||
@@ -788,10 +820,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
|
||||
thread_group_cputime_init(sig);
|
||||
|
||||
/* Expiration times and increments. */
|
||||
sig->it_virt_expires = cputime_zero;
|
||||
sig->it_virt_incr = cputime_zero;
|
||||
sig->it_prof_expires = cputime_zero;
|
||||
sig->it_prof_incr = cputime_zero;
|
||||
sig->it[CPUCLOCK_PROF].expires = cputime_zero;
|
||||
sig->it[CPUCLOCK_PROF].incr = cputime_zero;
|
||||
sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
|
||||
sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
|
||||
|
||||
/* Cached expiration times. */
|
||||
sig->cputime_expires.prof_exp = cputime_zero;
|
||||
@@ -849,6 +881,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
|
||||
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
|
||||
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
|
||||
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
|
||||
sig->maxrss = sig->cmaxrss = 0;
|
||||
task_io_accounting_init(&sig->ioac);
|
||||
sig->sum_sched_runtime = 0;
|
||||
taskstats_tgid_init(sig);
|
||||
@@ -863,6 +896,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
|
||||
|
||||
tty_audit_fork(sig);
|
||||
|
||||
sig->oom_adj = current->signal->oom_adj;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -958,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
/*
|
||||
* Siblings of global init remain as zombies on exit since they are
|
||||
* not reaped by their parent (swapper). To solve this and to avoid
|
||||
* multi-rooted process trees, prevent global and container-inits
|
||||
* from creating siblings.
|
||||
*/
|
||||
if ((clone_flags & CLONE_PARENT) &&
|
||||
current->signal->flags & SIGNAL_UNKILLABLE)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
retval = security_task_create(clone_flags);
|
||||
if (retval)
|
||||
goto fork_out;
|
||||
@@ -999,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
if (!try_module_get(task_thread_info(p)->exec_domain->module))
|
||||
goto bad_fork_cleanup_count;
|
||||
|
||||
if (p->binfmt && !try_module_get(p->binfmt->module))
|
||||
goto bad_fork_cleanup_put_domain;
|
||||
|
||||
p->did_exec = 0;
|
||||
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
|
||||
copy_flags(clone_flags, p);
|
||||
@@ -1075,10 +1117,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
|
||||
p->bts = NULL;
|
||||
|
||||
p->stack_start = stack_start;
|
||||
|
||||
/* Perform scheduler related setup. Assign this task to a CPU. */
|
||||
sched_fork(p, clone_flags);
|
||||
|
||||
retval = perf_counter_init_task(p);
|
||||
retval = perf_event_init_task(p);
|
||||
if (retval)
|
||||
goto bad_fork_cleanup_policy;
|
||||
|
||||
@@ -1253,7 +1297,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
proc_fork_connector(p);
|
||||
cgroup_post_fork(p);
|
||||
perf_counter_fork(p);
|
||||
perf_event_fork(p);
|
||||
return p;
|
||||
|
||||
bad_fork_free_pid:
|
||||
@@ -1280,16 +1324,13 @@ bad_fork_cleanup_semundo:
|
||||
bad_fork_cleanup_audit:
|
||||
audit_free(p);
|
||||
bad_fork_cleanup_policy:
|
||||
perf_counter_free_task(p);
|
||||
perf_event_free_task(p);
|
||||
#ifdef CONFIG_NUMA
|
||||
mpol_put(p->mempolicy);
|
||||
bad_fork_cleanup_cgroup:
|
||||
#endif
|
||||
cgroup_exit(p, cgroup_callbacks_done);
|
||||
delayacct_tsk_free(p);
|
||||
if (p->binfmt)
|
||||
module_put(p->binfmt->module);
|
||||
bad_fork_cleanup_put_domain:
|
||||
module_put(task_thread_info(p)->exec_domain->module);
|
||||
bad_fork_cleanup_count:
|
||||
atomic_dec(&p->cred->user->processes);
|
||||
|
137
kernel/futex.c
137
kernel/futex.c
@@ -89,36 +89,36 @@ struct futex_pi_state {
|
||||
union futex_key key;
|
||||
};
|
||||
|
||||
/*
|
||||
* We use this hashed waitqueue instead of a normal wait_queue_t, so
|
||||
/**
|
||||
* struct futex_q - The hashed futex queue entry, one per waiting task
|
||||
* @task: the task waiting on the futex
|
||||
* @lock_ptr: the hash bucket lock
|
||||
* @key: the key the futex is hashed on
|
||||
* @pi_state: optional priority inheritance state
|
||||
* @rt_waiter: rt_waiter storage for use with requeue_pi
|
||||
* @requeue_pi_key: the requeue_pi target futex key
|
||||
* @bitset: bitset for the optional bitmasked wakeup
|
||||
*
|
||||
* We use this hashed waitqueue, instead of a normal wait_queue_t, so
|
||||
* we can wake only the relevant ones (hashed queues may be shared).
|
||||
*
|
||||
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
|
||||
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
|
||||
* The order of wakup is always to make the first condition true, then
|
||||
* wake up q->waiter, then make the second condition true.
|
||||
* the second.
|
||||
*
|
||||
* PI futexes are typically woken before they are removed from the hash list via
|
||||
* the rt_mutex code. See unqueue_me_pi().
|
||||
*/
|
||||
struct futex_q {
|
||||
struct plist_node list;
|
||||
/* Waiter reference */
|
||||
|
||||
struct task_struct *task;
|
||||
|
||||
/* Which hash list lock to use: */
|
||||
spinlock_t *lock_ptr;
|
||||
|
||||
/* Key which the futex is hashed on: */
|
||||
union futex_key key;
|
||||
|
||||
/* Optional priority inheritance state: */
|
||||
struct futex_pi_state *pi_state;
|
||||
|
||||
/* rt_waiter storage for requeue_pi: */
|
||||
struct rt_mutex_waiter *rt_waiter;
|
||||
|
||||
/* The expected requeue pi target futex key: */
|
||||
union futex_key *requeue_pi_key;
|
||||
|
||||
/* Bitset for the optional bitmasked wakeup */
|
||||
u32 bitset;
|
||||
};
|
||||
|
||||
@@ -198,11 +198,12 @@ static void drop_futex_key_refs(union futex_key *key)
|
||||
}
|
||||
|
||||
/**
|
||||
* get_futex_key - Get parameters which are the keys for a futex.
|
||||
* @uaddr: virtual address of the futex
|
||||
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
|
||||
* @key: address where result is stored.
|
||||
* @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
|
||||
* get_futex_key() - Get parameters which are the keys for a futex
|
||||
* @uaddr: virtual address of the futex
|
||||
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
|
||||
* @key: address where result is stored.
|
||||
* @rw: mapping needs to be read/write (values: VERIFY_READ,
|
||||
* VERIFY_WRITE)
|
||||
*
|
||||
* Returns a negative error code or 0
|
||||
* The key words are stored in *key on success.
|
||||
@@ -288,8 +289,8 @@ void put_futex_key(int fshared, union futex_key *key)
|
||||
drop_futex_key_refs(key);
|
||||
}
|
||||
|
||||
/*
|
||||
* fault_in_user_writeable - fault in user address and verify RW access
|
||||
/**
|
||||
* fault_in_user_writeable() - Fault in user address and verify RW access
|
||||
* @uaddr: pointer to faulting user space address
|
||||
*
|
||||
* Slow path to fixup the fault we just took in the atomic write
|
||||
@@ -309,8 +310,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
|
||||
|
||||
/**
|
||||
* futex_top_waiter() - Return the highest priority waiter on a futex
|
||||
* @hb: the hash bucket the futex_q's reside in
|
||||
* @key: the futex key (to distinguish it from other futex futex_q's)
|
||||
* @hb: the hash bucket the futex_q's reside in
|
||||
* @key: the futex key (to distinguish it from other futex futex_q's)
|
||||
*
|
||||
* Must be called with the hb lock held.
|
||||
*/
|
||||
@@ -588,7 +589,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
|
||||
}
|
||||
|
||||
/**
|
||||
* futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
|
||||
* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
|
||||
* @uaddr: the pi futex user address
|
||||
* @hb: the pi futex hash bucket
|
||||
* @key: the futex key associated with uaddr and hb
|
||||
@@ -1011,9 +1012,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
|
||||
|
||||
/**
|
||||
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
|
||||
* q: the futex_q
|
||||
* key: the key of the requeue target futex
|
||||
* hb: the hash_bucket of the requeue target futex
|
||||
* @q: the futex_q
|
||||
* @key: the key of the requeue target futex
|
||||
* @hb: the hash_bucket of the requeue target futex
|
||||
*
|
||||
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
|
||||
* target futex if it is uncontended or via a lock steal. Set the futex_q key
|
||||
@@ -1350,6 +1351,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
|
||||
return hb;
|
||||
}
|
||||
|
||||
static inline void
|
||||
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
|
||||
{
|
||||
spin_unlock(&hb->lock);
|
||||
drop_futex_key_refs(&q->key);
|
||||
}
|
||||
|
||||
/**
|
||||
* queue_me() - Enqueue the futex_q on the futex_hash_bucket
|
||||
* @q: The futex_q to enqueue
|
||||
* @hb: The destination hash bucket
|
||||
*
|
||||
* The hb->lock must be held by the caller, and is released here. A call to
|
||||
* queue_me() is typically paired with exactly one call to unqueue_me(). The
|
||||
* exceptions involve the PI related operations, which may use unqueue_me_pi()
|
||||
* or nothing if the unqueue is done as part of the wake process and the unqueue
|
||||
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
|
||||
* an example).
|
||||
*/
|
||||
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
|
||||
{
|
||||
int prio;
|
||||
@@ -1373,19 +1393,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
|
||||
spin_unlock(&hb->lock);
|
||||
}
|
||||
|
||||
static inline void
|
||||
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
|
||||
{
|
||||
spin_unlock(&hb->lock);
|
||||
drop_futex_key_refs(&q->key);
|
||||
}
|
||||
|
||||
/*
|
||||
* queue_me and unqueue_me must be called as a pair, each
|
||||
* exactly once. They are called with the hashed spinlock held.
|
||||
/**
|
||||
* unqueue_me() - Remove the futex_q from its futex_hash_bucket
|
||||
* @q: The futex_q to unqueue
|
||||
*
|
||||
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
|
||||
* be paired with exactly one earlier call to queue_me().
|
||||
*
|
||||
* Returns:
|
||||
* 1 - if the futex_q was still queued (and we removed unqueued it)
|
||||
* 0 - if the futex_q was already removed by the waking thread
|
||||
*/
|
||||
|
||||
/* Return 1 if we were still queued (ie. 0 means we were woken) */
|
||||
static int unqueue_me(struct futex_q *q)
|
||||
{
|
||||
spinlock_t *lock_ptr;
|
||||
@@ -1638,17 +1656,14 @@ out:
|
||||
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
|
||||
struct hrtimer_sleeper *timeout)
|
||||
{
|
||||
queue_me(q, hb);
|
||||
|
||||
/*
|
||||
* There might have been scheduling since the queue_me(), as we
|
||||
* cannot hold a spinlock across the get_user() in case it
|
||||
* faults, and we cannot just set TASK_INTERRUPTIBLE state when
|
||||
* queueing ourselves into the futex hash. This code thus has to
|
||||
* rely on the futex_wake() code removing us from hash when it
|
||||
* wakes us up.
|
||||
* The task state is guaranteed to be set before another task can
|
||||
* wake it. set_current_state() is implemented using set_mb() and
|
||||
* queue_me() calls spin_unlock() upon completion, both serializing
|
||||
* access to the hash list and forcing another memory barrier.
|
||||
*/
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
queue_me(q, hb);
|
||||
|
||||
/* Arm the timer */
|
||||
if (timeout) {
|
||||
@@ -1658,8 +1673,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
|
||||
}
|
||||
|
||||
/*
|
||||
* !plist_node_empty() is safe here without any lock.
|
||||
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
|
||||
* If we have been removed from the hash list, then another task
|
||||
* has tried to wake us, and we can skip the call to schedule().
|
||||
*/
|
||||
if (likely(!plist_node_empty(&q->list))) {
|
||||
/*
|
||||
@@ -2114,12 +2129,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
|
||||
|
||||
/**
|
||||
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
|
||||
* @uaddr: the futex we initialyl wait on (non-pi)
|
||||
* @uaddr: the futex we initially wait on (non-pi)
|
||||
* @fshared: whether the futexes are shared (1) or not (0). They must be
|
||||
* the same type, no requeueing from private to shared, etc.
|
||||
* @val: the expected value of uaddr
|
||||
* @abs_time: absolute timeout
|
||||
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
|
||||
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
|
||||
* @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
|
||||
* @uaddr2: the pi futex we will take prior to returning to user-space
|
||||
*
|
||||
@@ -2246,7 +2261,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
|
||||
res = fixup_owner(uaddr2, fshared, &q, !ret);
|
||||
/*
|
||||
* If fixup_owner() returned an error, proprogate that. If it
|
||||
* acquired the lock, clear our -ETIMEDOUT or -EINTR.
|
||||
* acquired the lock, clear -ETIMEDOUT or -EINTR.
|
||||
*/
|
||||
if (res)
|
||||
ret = (res < 0) ? res : 0;
|
||||
@@ -2302,9 +2317,9 @@ out:
|
||||
*/
|
||||
|
||||
/**
|
||||
* sys_set_robust_list - set the robust-futex list head of a task
|
||||
* @head: pointer to the list-head
|
||||
* @len: length of the list-head, as userspace expects
|
||||
* sys_set_robust_list() - Set the robust-futex list head of a task
|
||||
* @head: pointer to the list-head
|
||||
* @len: length of the list-head, as userspace expects
|
||||
*/
|
||||
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
|
||||
size_t, len)
|
||||
@@ -2323,10 +2338,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
|
||||
}
|
||||
|
||||
/**
|
||||
* sys_get_robust_list - get the robust-futex list head of a task
|
||||
* @pid: pid of the process [zero for current task]
|
||||
* @head_ptr: pointer to a list-head pointer, the kernel fills it in
|
||||
* @len_ptr: pointer to a length field, the kernel fills in the header size
|
||||
* sys_get_robust_list() - Get the robust-futex list head of a task
|
||||
* @pid: pid of the process [zero for current task]
|
||||
* @head_ptr: pointer to a list-head pointer, the kernel fills it in
|
||||
* @len_ptr: pointer to a length field, the kernel fills in the header size
|
||||
*/
|
||||
SYSCALL_DEFINE3(get_robust_list, int, pid,
|
||||
struct robust_list_head __user * __user *, head_ptr,
|
||||
|
@@ -34,7 +34,7 @@ config GCOV_KERNEL
|
||||
config GCOV_PROFILE_ALL
|
||||
bool "Profile entire Kernel"
|
||||
depends on GCOV_KERNEL
|
||||
depends on S390 || X86 || (PPC && EXPERIMENTAL)
|
||||
depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
|
||||
default n
|
||||
---help---
|
||||
This options activates profiling for the entire kernel.
|
||||
|
148
kernel/hrtimer.c
148
kernel/hrtimer.c
@@ -48,36 +48,7 @@
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/**
|
||||
* ktime_get - get the monotonic time in ktime_t format
|
||||
*
|
||||
* returns the time in ktime_t format
|
||||
*/
|
||||
ktime_t ktime_get(void)
|
||||
{
|
||||
struct timespec now;
|
||||
|
||||
ktime_get_ts(&now);
|
||||
|
||||
return timespec_to_ktime(now);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get);
|
||||
|
||||
/**
|
||||
* ktime_get_real - get the real (wall-) time in ktime_t format
|
||||
*
|
||||
* returns the time in ktime_t format
|
||||
*/
|
||||
ktime_t ktime_get_real(void)
|
||||
{
|
||||
struct timespec now;
|
||||
|
||||
getnstimeofday(&now);
|
||||
|
||||
return timespec_to_ktime(now);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(ktime_get_real);
|
||||
#include <trace/events/timer.h>
|
||||
|
||||
/*
|
||||
* The timer bases:
|
||||
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* ktime_get_ts - get the monotonic clock in timespec format
|
||||
* @ts: pointer to timespec variable
|
||||
*
|
||||
* The function calculates the monotonic clock from the realtime
|
||||
* clock and the wall_to_monotonic offset and stores the result
|
||||
* in normalized timespec format in the variable pointed to by @ts.
|
||||
*/
|
||||
void ktime_get_ts(struct timespec *ts)
|
||||
{
|
||||
struct timespec tomono;
|
||||
unsigned long seq;
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
getnstimeofday(ts);
|
||||
tomono = wall_to_monotonic;
|
||||
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
|
||||
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
|
||||
ts->tv_nsec + tomono.tv_nsec);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get_ts);
|
||||
|
||||
/*
|
||||
* Get the coarse grained time at the softirq based on xtime and
|
||||
* wall_to_monotonic.
|
||||
@@ -498,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
|
||||
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
|
||||
#endif
|
||||
|
||||
static inline void
|
||||
debug_init(struct hrtimer *timer, clockid_t clockid,
|
||||
enum hrtimer_mode mode)
|
||||
{
|
||||
debug_hrtimer_init(timer);
|
||||
trace_hrtimer_init(timer, clockid, mode);
|
||||
}
|
||||
|
||||
static inline void debug_activate(struct hrtimer *timer)
|
||||
{
|
||||
debug_hrtimer_activate(timer);
|
||||
trace_hrtimer_start(timer);
|
||||
}
|
||||
|
||||
static inline void debug_deactivate(struct hrtimer *timer)
|
||||
{
|
||||
debug_hrtimer_deactivate(timer);
|
||||
trace_hrtimer_cancel(timer);
|
||||
}
|
||||
|
||||
/* High resolution timer related functions */
|
||||
#ifdef CONFIG_HIGH_RES_TIMERS
|
||||
|
||||
@@ -543,13 +509,14 @@ static inline int hrtimer_hres_active(void)
|
||||
* next event
|
||||
* Called with interrupts disabled and base->lock held
|
||||
*/
|
||||
static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
|
||||
static void
|
||||
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
|
||||
{
|
||||
int i;
|
||||
struct hrtimer_clock_base *base = cpu_base->clock_base;
|
||||
ktime_t expires;
|
||||
ktime_t expires, expires_next;
|
||||
|
||||
cpu_base->expires_next.tv64 = KTIME_MAX;
|
||||
expires_next.tv64 = KTIME_MAX;
|
||||
|
||||
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
|
||||
struct hrtimer *timer;
|
||||
@@ -565,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
|
||||
*/
|
||||
if (expires.tv64 < 0)
|
||||
expires.tv64 = 0;
|
||||
if (expires.tv64 < cpu_base->expires_next.tv64)
|
||||
cpu_base->expires_next = expires;
|
||||
if (expires.tv64 < expires_next.tv64)
|
||||
expires_next = expires;
|
||||
}
|
||||
|
||||
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
|
||||
return;
|
||||
|
||||
cpu_base->expires_next.tv64 = expires_next.tv64;
|
||||
|
||||
if (cpu_base->expires_next.tv64 != KTIME_MAX)
|
||||
tick_program_event(cpu_base->expires_next, 1);
|
||||
}
|
||||
@@ -651,7 +623,7 @@ static void retrigger_next_event(void *arg)
|
||||
base->clock_base[CLOCK_REALTIME].offset =
|
||||
timespec_to_ktime(realtime_offset);
|
||||
|
||||
hrtimer_force_reprogram(base);
|
||||
hrtimer_force_reprogram(base, 0);
|
||||
spin_unlock(&base->lock);
|
||||
}
|
||||
|
||||
@@ -764,7 +736,8 @@ static int hrtimer_switch_to_hres(void)
|
||||
static inline int hrtimer_hres_active(void) { return 0; }
|
||||
static inline int hrtimer_is_hres_enabled(void) { return 0; }
|
||||
static inline int hrtimer_switch_to_hres(void) { return 0; }
|
||||
static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
|
||||
static inline void
|
||||
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
|
||||
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
|
||||
struct hrtimer_clock_base *base,
|
||||
int wakeup)
|
||||
@@ -854,7 +827,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
|
||||
struct hrtimer *entry;
|
||||
int leftmost = 1;
|
||||
|
||||
debug_hrtimer_activate(timer);
|
||||
debug_activate(timer);
|
||||
|
||||
/*
|
||||
* Find the right place in the rbtree:
|
||||
@@ -907,19 +880,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
|
||||
struct hrtimer_clock_base *base,
|
||||
unsigned long newstate, int reprogram)
|
||||
{
|
||||
if (timer->state & HRTIMER_STATE_ENQUEUED) {
|
||||
/*
|
||||
* Remove the timer from the rbtree and replace the
|
||||
* first entry pointer if necessary.
|
||||
*/
|
||||
if (base->first == &timer->node) {
|
||||
base->first = rb_next(&timer->node);
|
||||
/* Reprogram the clock event device. if enabled */
|
||||
if (reprogram && hrtimer_hres_active())
|
||||
hrtimer_force_reprogram(base->cpu_base);
|
||||
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Remove the timer from the rbtree and replace the first
|
||||
* entry pointer if necessary.
|
||||
*/
|
||||
if (base->first == &timer->node) {
|
||||
base->first = rb_next(&timer->node);
|
||||
#ifdef CONFIG_HIGH_RES_TIMERS
|
||||
/* Reprogram the clock event device. if enabled */
|
||||
if (reprogram && hrtimer_hres_active()) {
|
||||
ktime_t expires;
|
||||
|
||||
expires = ktime_sub(hrtimer_get_expires(timer),
|
||||
base->offset);
|
||||
if (base->cpu_base->expires_next.tv64 == expires.tv64)
|
||||
hrtimer_force_reprogram(base->cpu_base, 1);
|
||||
}
|
||||
rb_erase(&timer->node, &base->active);
|
||||
#endif
|
||||
}
|
||||
rb_erase(&timer->node, &base->active);
|
||||
out:
|
||||
timer->state = newstate;
|
||||
}
|
||||
|
||||
@@ -940,7 +923,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
|
||||
* reprogramming happens in the interrupt handler. This is a
|
||||
* rare case and less expensive than a smp call.
|
||||
*/
|
||||
debug_hrtimer_deactivate(timer);
|
||||
debug_deactivate(timer);
|
||||
timer_stats_hrtimer_clear_start_info(timer);
|
||||
reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
|
||||
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
|
||||
@@ -1155,7 +1138,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
|
||||
clock_id = CLOCK_MONOTONIC;
|
||||
|
||||
timer->base = &cpu_base->clock_base[clock_id];
|
||||
INIT_LIST_HEAD(&timer->cb_entry);
|
||||
hrtimer_init_timer_hres(timer);
|
||||
|
||||
#ifdef CONFIG_TIMER_STATS
|
||||
@@ -1174,7 +1156,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
|
||||
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
|
||||
enum hrtimer_mode mode)
|
||||
{
|
||||
debug_hrtimer_init(timer);
|
||||
debug_init(timer, clock_id, mode);
|
||||
__hrtimer_init(timer, clock_id, mode);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hrtimer_init);
|
||||
@@ -1198,7 +1180,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hrtimer_get_res);
|
||||
|
||||
static void __run_hrtimer(struct hrtimer *timer)
|
||||
static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
|
||||
{
|
||||
struct hrtimer_clock_base *base = timer->base;
|
||||
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
|
||||
@@ -1207,7 +1189,7 @@ static void __run_hrtimer(struct hrtimer *timer)
|
||||
|
||||
WARN_ON(!irqs_disabled());
|
||||
|
||||
debug_hrtimer_deactivate(timer);
|
||||
debug_deactivate(timer);
|
||||
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
|
||||
timer_stats_account_hrtimer(timer);
|
||||
fn = timer->function;
|
||||
@@ -1218,7 +1200,9 @@ static void __run_hrtimer(struct hrtimer *timer)
|
||||
* the timer base.
|
||||
*/
|
||||
spin_unlock(&cpu_base->lock);
|
||||
trace_hrtimer_expire_entry(timer, now);
|
||||
restart = fn(timer);
|
||||
trace_hrtimer_expire_exit(timer);
|
||||
spin_lock(&cpu_base->lock);
|
||||
|
||||
/*
|
||||
@@ -1329,7 +1313,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
|
||||
break;
|
||||
}
|
||||
|
||||
__run_hrtimer(timer);
|
||||
__run_hrtimer(timer, &basenow);
|
||||
}
|
||||
base++;
|
||||
}
|
||||
@@ -1451,7 +1435,7 @@ void hrtimer_run_queues(void)
|
||||
hrtimer_get_expires_tv64(timer))
|
||||
break;
|
||||
|
||||
__run_hrtimer(timer);
|
||||
__run_hrtimer(timer, &base->softirq_time);
|
||||
}
|
||||
spin_unlock(&cpu_base->lock);
|
||||
}
|
||||
@@ -1628,7 +1612,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
|
||||
while ((node = rb_first(&old_base->active))) {
|
||||
timer = rb_entry(node, struct hrtimer, node);
|
||||
BUG_ON(hrtimer_callback_running(timer));
|
||||
debug_hrtimer_deactivate(timer);
|
||||
debug_deactivate(timer);
|
||||
|
||||
/*
|
||||
* Mark it as STATE_MIGRATE not INACTIVE otherwise the
|
||||
|
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
|
||||
* Process updating of timeout sysctl
|
||||
*/
|
||||
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
|
||||
struct file *filp, void __user *buffer,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
|
||||
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
||||
|
||||
if (ret || !write)
|
||||
goto out;
|
||||
|
169
kernel/itimer.c
169
kernel/itimer.c
@@ -12,6 +12,7 @@
|
||||
#include <linux/time.h>
|
||||
#include <linux/posix-timers.h>
|
||||
#include <linux/hrtimer.h>
|
||||
#include <trace/events/timer.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
|
||||
return ktime_to_timeval(rem);
|
||||
}
|
||||
|
||||
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
|
||||
struct itimerval *const value)
|
||||
{
|
||||
cputime_t cval, cinterval;
|
||||
struct cpu_itimer *it = &tsk->signal->it[clock_id];
|
||||
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
|
||||
cval = it->expires;
|
||||
cinterval = it->incr;
|
||||
if (!cputime_eq(cval, cputime_zero)) {
|
||||
struct task_cputime cputime;
|
||||
cputime_t t;
|
||||
|
||||
thread_group_cputimer(tsk, &cputime);
|
||||
if (clock_id == CPUCLOCK_PROF)
|
||||
t = cputime_add(cputime.utime, cputime.stime);
|
||||
else
|
||||
/* CPUCLOCK_VIRT */
|
||||
t = cputime.utime;
|
||||
|
||||
if (cputime_le(cval, t))
|
||||
/* about to fire */
|
||||
cval = cputime_one_jiffy;
|
||||
else
|
||||
cval = cputime_sub(cval, t);
|
||||
}
|
||||
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
|
||||
cputime_to_timeval(cval, &value->it_value);
|
||||
cputime_to_timeval(cinterval, &value->it_interval);
|
||||
}
|
||||
|
||||
int do_getitimer(int which, struct itimerval *value)
|
||||
{
|
||||
struct task_struct *tsk = current;
|
||||
cputime_t cinterval, cval;
|
||||
|
||||
switch (which) {
|
||||
case ITIMER_REAL:
|
||||
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
break;
|
||||
case ITIMER_VIRTUAL:
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_virt_expires;
|
||||
cinterval = tsk->signal->it_virt_incr;
|
||||
if (!cputime_eq(cval, cputime_zero)) {
|
||||
struct task_cputime cputime;
|
||||
cputime_t utime;
|
||||
|
||||
thread_group_cputimer(tsk, &cputime);
|
||||
utime = cputime.utime;
|
||||
if (cputime_le(cval, utime)) { /* about to fire */
|
||||
cval = jiffies_to_cputime(1);
|
||||
} else {
|
||||
cval = cputime_sub(cval, utime);
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
cputime_to_timeval(cval, &value->it_value);
|
||||
cputime_to_timeval(cinterval, &value->it_interval);
|
||||
get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
|
||||
break;
|
||||
case ITIMER_PROF:
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_prof_expires;
|
||||
cinterval = tsk->signal->it_prof_incr;
|
||||
if (!cputime_eq(cval, cputime_zero)) {
|
||||
struct task_cputime times;
|
||||
cputime_t ptime;
|
||||
|
||||
thread_group_cputimer(tsk, ×);
|
||||
ptime = cputime_add(times.utime, times.stime);
|
||||
if (cputime_le(cval, ptime)) { /* about to fire */
|
||||
cval = jiffies_to_cputime(1);
|
||||
} else {
|
||||
cval = cputime_sub(cval, ptime);
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
cputime_to_timeval(cval, &value->it_value);
|
||||
cputime_to_timeval(cinterval, &value->it_interval);
|
||||
get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
|
||||
break;
|
||||
default:
|
||||
return(-EINVAL);
|
||||
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
|
||||
struct signal_struct *sig =
|
||||
container_of(timer, struct signal_struct, real_timer);
|
||||
|
||||
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
|
||||
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
|
||||
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
|
||||
{
|
||||
struct timespec ts;
|
||||
s64 cpu_ns;
|
||||
|
||||
cputime_to_timespec(ct, &ts);
|
||||
cpu_ns = timespec_to_ns(&ts);
|
||||
|
||||
return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
|
||||
}
|
||||
|
||||
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
|
||||
const struct itimerval *const value,
|
||||
struct itimerval *const ovalue)
|
||||
{
|
||||
cputime_t cval, nval, cinterval, ninterval;
|
||||
s64 ns_ninterval, ns_nval;
|
||||
struct cpu_itimer *it = &tsk->signal->it[clock_id];
|
||||
|
||||
nval = timeval_to_cputime(&value->it_value);
|
||||
ns_nval = timeval_to_ns(&value->it_value);
|
||||
ninterval = timeval_to_cputime(&value->it_interval);
|
||||
ns_ninterval = timeval_to_ns(&value->it_interval);
|
||||
|
||||
it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
|
||||
it->error = cputime_sub_ns(nval, ns_nval);
|
||||
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
|
||||
cval = it->expires;
|
||||
cinterval = it->incr;
|
||||
if (!cputime_eq(cval, cputime_zero) ||
|
||||
!cputime_eq(nval, cputime_zero)) {
|
||||
if (cputime_gt(nval, cputime_zero))
|
||||
nval = cputime_add(nval, cputime_one_jiffy);
|
||||
set_process_cpu_timer(tsk, clock_id, &nval, &cval);
|
||||
}
|
||||
it->expires = nval;
|
||||
it->incr = ninterval;
|
||||
trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
|
||||
ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
|
||||
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
|
||||
if (ovalue) {
|
||||
cputime_to_timeval(cval, &ovalue->it_value);
|
||||
cputime_to_timeval(cinterval, &ovalue->it_interval);
|
||||
}
|
||||
}
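For concreteness, a worked example of what the new error and incr_error fields buy; the HZ and interval values below are picked purely for illustration.

/*
 * With HZ=100 one tick is 10 ms.  A requested it_interval of 25 ms is
 * rounded up to 3 ticks (30 ms) by the cputime conversion, so
 * cputime_sub_ns() above records incr_error = 30 ms - 25 ms = 5,000,000 ns
 * per period (had the conversion not rounded up, cputime_sub_ns() would
 * simply return 0 and no correction would accumulate).  check_cpu_itimer()
 * in kernel/posix-cpu-timers.c, further down in this diff, adds incr_error
 * to it->error on every reload; once the accumulated error reaches
 * onecputick (10,000,000 ns here) it shortens that period by one jiffy and
 * knocks onecputick off the error.  The timer therefore fires after 30, 20,
 * 30, 20, ... ms, averaging the requested 25 ms instead of running a
 * steady 20% slow.
 */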
|
||||
|
||||
/*
|
||||
* Returns true if the timeval is in canonical form
|
||||
*/
|
||||
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
|
||||
struct task_struct *tsk = current;
|
||||
struct hrtimer *timer;
|
||||
ktime_t expires;
|
||||
cputime_t cval, cinterval, nval, ninterval;
|
||||
|
||||
/*
|
||||
* Validate the timevals in value.
|
||||
@@ -171,51 +221,14 @@ again:
|
||||
} else
|
||||
tsk->signal->it_real_incr.tv64 = 0;
|
||||
|
||||
trace_itimer_state(ITIMER_REAL, value, 0);
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
break;
|
||||
case ITIMER_VIRTUAL:
|
||||
nval = timeval_to_cputime(&value->it_value);
|
||||
ninterval = timeval_to_cputime(&value->it_interval);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_virt_expires;
|
||||
cinterval = tsk->signal->it_virt_incr;
|
||||
if (!cputime_eq(cval, cputime_zero) ||
|
||||
!cputime_eq(nval, cputime_zero)) {
|
||||
if (cputime_gt(nval, cputime_zero))
|
||||
nval = cputime_add(nval,
|
||||
jiffies_to_cputime(1));
|
||||
set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
|
||||
&nval, &cval);
|
||||
}
|
||||
tsk->signal->it_virt_expires = nval;
|
||||
tsk->signal->it_virt_incr = ninterval;
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
if (ovalue) {
|
||||
cputime_to_timeval(cval, &ovalue->it_value);
|
||||
cputime_to_timeval(cinterval, &ovalue->it_interval);
|
||||
}
|
||||
set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
|
||||
break;
|
||||
case ITIMER_PROF:
|
||||
nval = timeval_to_cputime(&value->it_value);
|
||||
ninterval = timeval_to_cputime(&value->it_interval);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
cval = tsk->signal->it_prof_expires;
|
||||
cinterval = tsk->signal->it_prof_incr;
|
||||
if (!cputime_eq(cval, cputime_zero) ||
|
||||
!cputime_eq(nval, cputime_zero)) {
|
||||
if (cputime_gt(nval, cputime_zero))
|
||||
nval = cputime_add(nval,
|
||||
jiffies_to_cputime(1));
|
||||
set_process_cpu_timer(tsk, CPUCLOCK_PROF,
|
||||
&nval, &cval);
|
||||
}
|
||||
tsk->signal->it_prof_expires = nval;
|
||||
tsk->signal->it_prof_incr = ninterval;
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
if (ovalue) {
|
||||
cputime_to_timeval(cval, &ovalue->it_value);
|
||||
cputime_to_timeval(cinterval, &ovalue->it_interval);
|
||||
}
|
||||
set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
|
@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
|
||||
|
||||
static inline int is_kernel_text(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
|
||||
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
|
||||
arch_is_kernel_text(addr))
|
||||
return 1;
|
||||
return in_gate_area_no_task(addr);
|
||||
}
|
||||
|
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
|
||||
* writer, you don't need extra locking to use these functions.
|
||||
*/
|
||||
unsigned int __kfifo_put(struct kfifo *fifo,
|
||||
unsigned char *buffer, unsigned int len)
|
||||
const unsigned char *buffer, unsigned int len)
|
||||
{
|
||||
unsigned int l;
|
||||
|
||||
|
@@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct seq_operations kprobes_seq_ops = {
|
||||
static const struct seq_operations kprobes_seq_ops = {
|
||||
.start = kprobe_seq_start,
|
||||
.next = kprobe_seq_next,
|
||||
.stop = kprobe_seq_stop,
|
||||
|
@@ -578,6 +578,9 @@ static int static_obj(void *obj)
|
||||
if ((addr >= start) && (addr < end))
|
||||
return 1;
|
||||
|
||||
if (arch_is_kernel_data(addr))
|
||||
return 1;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* percpu var?
|
||||
|
@@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct seq_operations lockstat_ops = {
|
||||
static const struct seq_operations lockstat_ops = {
|
||||
.start = ls_start,
|
||||
.next = ls_next,
|
||||
.stop = ls_stop,
|
||||
|
kernel/marker.c (930 lines, file deleted)
@@ -1,930 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2007 Mathieu Desnoyers
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/marker.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
extern struct marker __start___markers[];
|
||||
extern struct marker __stop___markers[];
|
||||
|
||||
/* Set to 1 to enable marker debug output */
|
||||
static const int marker_debug;
|
||||
|
||||
/*
|
||||
* markers_mutex nests inside module_mutex. Markers mutex protects the builtin
|
||||
* and module markers and the hash table.
|
||||
*/
|
||||
static DEFINE_MUTEX(markers_mutex);
|
||||
|
||||
/*
|
||||
* Marker hash table, containing the active markers.
|
||||
* Protected by module_mutex.
|
||||
*/
|
||||
#define MARKER_HASH_BITS 6
|
||||
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
|
||||
static struct hlist_head marker_table[MARKER_TABLE_SIZE];
|
||||
|
||||
/*
|
||||
* Note about RCU :
|
||||
* It is used to make sure every handler has finished using its private data
|
||||
* between two consecutive operation (add or remove) on a given marker. It is
|
||||
* also used to delay the free of multiple probes array until a quiescent state
|
||||
* is reached.
|
||||
* marker entries modifications are protected by the markers_mutex.
|
||||
*/
|
||||
struct marker_entry {
|
||||
struct hlist_node hlist;
|
||||
char *format;
|
||||
/* Probe wrapper */
|
||||
void (*call)(const struct marker *mdata, void *call_private, ...);
|
||||
struct marker_probe_closure single;
|
||||
struct marker_probe_closure *multi;
|
||||
int refcount; /* Number of times armed. 0 if disarmed. */
|
||||
struct rcu_head rcu;
|
||||
void *oldptr;
|
||||
int rcu_pending;
|
||||
unsigned char ptype:1;
|
||||
unsigned char format_allocated:1;
|
||||
char name[0]; /* Contains name'\0'format'\0' */
|
||||
};
|
||||
|
||||
/**
|
||||
* __mark_empty_function - Empty probe callback
|
||||
* @probe_private: probe private data
|
||||
* @call_private: call site private data
|
||||
* @fmt: format string
|
||||
* @...: variable argument list
|
||||
*
|
||||
* Empty callback provided as a probe to the markers. By providing this to a
|
||||
* disabled marker, we make sure the execution flow is always valid even
|
||||
* though the function pointer change and the marker enabling are two distinct
|
||||
* operations that modifies the execution flow of preemptible code.
|
||||
*/
|
||||
notrace void __mark_empty_function(void *probe_private, void *call_private,
|
||||
const char *fmt, va_list *args)
|
||||
{
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__mark_empty_function);
|
||||
|
||||
/*
|
||||
* marker_probe_cb Callback that prepares the variable argument list for probes.
|
||||
* @mdata: pointer of type struct marker
|
||||
* @call_private: caller site private data
|
||||
* @...: Variable argument list.
|
||||
*
|
||||
* Since we do not use "typical" pointer based RCU in the 1 argument case, we
|
||||
* need to put a full smp_rmb() in this branch. This is why we do not use
|
||||
* rcu_dereference() for the pointer read.
|
||||
*/
|
||||
notrace void marker_probe_cb(const struct marker *mdata,
|
||||
void *call_private, ...)
|
||||
{
|
||||
va_list args;
|
||||
char ptype;
|
||||
|
||||
/*
|
||||
* rcu_read_lock_sched does two things : disabling preemption to make
|
||||
* sure the teardown of the callbacks can be done correctly when they
|
||||
* are in modules and they insure RCU read coherency.
|
||||
*/
|
||||
rcu_read_lock_sched_notrace();
|
||||
ptype = mdata->ptype;
|
||||
if (likely(!ptype)) {
|
||||
marker_probe_func *func;
|
||||
/* Must read the ptype before ptr. They are not data dependant,
|
||||
* so we put an explicit smp_rmb() here. */
|
||||
smp_rmb();
|
||||
func = mdata->single.func;
|
||||
/* Must read the ptr before private data. They are not data
|
||||
* dependant, so we put an explicit smp_rmb() here. */
|
||||
smp_rmb();
|
||||
va_start(args, call_private);
|
||||
func(mdata->single.probe_private, call_private, mdata->format,
|
||||
&args);
|
||||
va_end(args);
|
||||
} else {
|
||||
struct marker_probe_closure *multi;
|
||||
int i;
|
||||
/*
|
||||
* Read mdata->ptype before mdata->multi.
|
||||
*/
|
||||
smp_rmb();
|
||||
multi = mdata->multi;
|
||||
/*
|
||||
* multi points to an array, therefore accessing the array
|
||||
* depends on reading multi. However, even in this case,
|
||||
* we must insure that the pointer is read _before_ the array
|
||||
* data. Same as rcu_dereference, but we need a full smp_rmb()
|
||||
* in the fast path, so put the explicit barrier here.
|
||||
*/
|
||||
smp_read_barrier_depends();
|
||||
for (i = 0; multi[i].func; i++) {
|
||||
va_start(args, call_private);
|
||||
multi[i].func(multi[i].probe_private, call_private,
|
||||
mdata->format, &args);
|
||||
va_end(args);
|
||||
}
|
||||
}
|
||||
rcu_read_unlock_sched_notrace();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(marker_probe_cb);
|
||||
|
||||
/*
|
||||
* marker_probe_cb_noarg Callback that does not prepare the variable argument list.
|
||||
* @mdata: pointer of type struct marker
|
||||
* @call_private: caller site private data
|
||||
* @...: Variable argument list.
|
||||
*
|
||||
* Should be connected to markers "MARK_NOARGS".
|
||||
*/
|
||||
static notrace void marker_probe_cb_noarg(const struct marker *mdata,
|
||||
void *call_private, ...)
|
||||
{
|
||||
va_list args; /* not initialized */
|
||||
char ptype;
|
||||
|
||||
rcu_read_lock_sched_notrace();
|
||||
ptype = mdata->ptype;
|
||||
if (likely(!ptype)) {
|
||||
marker_probe_func *func;
|
||||
/* Must read the ptype before ptr. They are not data dependant,
|
||||
* so we put an explicit smp_rmb() here. */
|
||||
smp_rmb();
|
||||
func = mdata->single.func;
|
||||
/* Must read the ptr before private data. They are not data
|
||||
* dependant, so we put an explicit smp_rmb() here. */
|
||||
smp_rmb();
|
||||
func(mdata->single.probe_private, call_private, mdata->format,
|
||||
&args);
|
||||
} else {
|
||||
struct marker_probe_closure *multi;
|
||||
int i;
|
||||
/*
|
||||
* Read mdata->ptype before mdata->multi.
|
||||
*/
|
||||
smp_rmb();
|
||||
multi = mdata->multi;
|
||||
/*
|
||||
* multi points to an array, therefore accessing the array
|
||||
* depends on reading multi. However, even in this case,
|
||||
* we must insure that the pointer is read _before_ the array
|
||||
* data. Same as rcu_dereference, but we need a full smp_rmb()
|
||||
* in the fast path, so put the explicit barrier here.
|
||||
*/
|
||||
smp_read_barrier_depends();
|
||||
for (i = 0; multi[i].func; i++)
|
||||
multi[i].func(multi[i].probe_private, call_private,
|
||||
mdata->format, &args);
|
||||
}
|
||||
rcu_read_unlock_sched_notrace();
|
||||
}
|
||||
|
||||
static void free_old_closure(struct rcu_head *head)
|
||||
{
|
||||
struct marker_entry *entry = container_of(head,
|
||||
struct marker_entry, rcu);
|
||||
kfree(entry->oldptr);
|
||||
/* Make sure we free the data before setting the pending flag to 0 */
|
||||
smp_wmb();
|
||||
entry->rcu_pending = 0;
|
||||
}
|
||||
|
||||
static void debug_print_probes(struct marker_entry *entry)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!marker_debug)
|
||||
return;
|
||||
|
||||
if (!entry->ptype) {
|
||||
printk(KERN_DEBUG "Single probe : %p %p\n",
|
||||
entry->single.func,
|
||||
entry->single.probe_private);
|
||||
} else {
|
||||
for (i = 0; entry->multi[i].func; i++)
|
||||
printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
|
||||
entry->multi[i].func,
|
||||
entry->multi[i].probe_private);
|
||||
}
|
||||
}
|
||||
|
||||
static struct marker_probe_closure *
|
||||
marker_entry_add_probe(struct marker_entry *entry,
|
||||
marker_probe_func *probe, void *probe_private)
|
||||
{
|
||||
int nr_probes = 0;
|
||||
struct marker_probe_closure *old, *new;
|
||||
|
||||
WARN_ON(!probe);
|
||||
|
||||
debug_print_probes(entry);
|
||||
old = entry->multi;
|
||||
if (!entry->ptype) {
|
||||
if (entry->single.func == probe &&
|
||||
entry->single.probe_private == probe_private)
|
||||
return ERR_PTR(-EBUSY);
|
||||
if (entry->single.func == __mark_empty_function) {
|
||||
/* 0 -> 1 probes */
|
||||
entry->single.func = probe;
|
||||
entry->single.probe_private = probe_private;
|
||||
entry->refcount = 1;
|
||||
entry->ptype = 0;
|
||||
debug_print_probes(entry);
|
||||
return NULL;
|
||||
} else {
|
||||
/* 1 -> 2 probes */
|
||||
nr_probes = 1;
|
||||
old = NULL;
|
||||
}
|
||||
} else {
|
||||
/* (N -> N+1), (N != 0, 1) probes */
|
||||
for (nr_probes = 0; old[nr_probes].func; nr_probes++)
|
||||
if (old[nr_probes].func == probe
|
||||
&& old[nr_probes].probe_private
|
||||
== probe_private)
|
||||
return ERR_PTR(-EBUSY);
|
||||
}
|
||||
/* + 2 : one for new probe, one for NULL func */
|
||||
new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
|
||||
GFP_KERNEL);
|
||||
if (new == NULL)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
if (!old)
|
||||
new[0] = entry->single;
|
||||
else
|
||||
memcpy(new, old,
|
||||
nr_probes * sizeof(struct marker_probe_closure));
|
||||
new[nr_probes].func = probe;
|
||||
new[nr_probes].probe_private = probe_private;
|
||||
entry->refcount = nr_probes + 1;
|
||||
entry->multi = new;
|
||||
entry->ptype = 1;
|
||||
debug_print_probes(entry);
|
||||
return old;
|
||||
}
|
||||
|
||||
static struct marker_probe_closure *
|
||||
marker_entry_remove_probe(struct marker_entry *entry,
|
||||
marker_probe_func *probe, void *probe_private)
|
||||
{
|
||||
int nr_probes = 0, nr_del = 0, i;
|
||||
struct marker_probe_closure *old, *new;
|
||||
|
||||
old = entry->multi;
|
||||
|
||||
debug_print_probes(entry);
|
||||
if (!entry->ptype) {
|
||||
/* 0 -> N is an error */
|
||||
WARN_ON(entry->single.func == __mark_empty_function);
|
||||
/* 1 -> 0 probes */
|
||||
WARN_ON(probe && entry->single.func != probe);
|
||||
WARN_ON(entry->single.probe_private != probe_private);
|
||||
entry->single.func = __mark_empty_function;
|
||||
entry->refcount = 0;
|
||||
entry->ptype = 0;
|
||||
debug_print_probes(entry);
|
||||
return NULL;
|
||||
} else {
|
||||
/* (N -> M), (N > 1, M >= 0) probes */
|
||||
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
|
||||
if ((!probe || old[nr_probes].func == probe)
|
||||
&& old[nr_probes].probe_private
|
||||
== probe_private)
|
||||
nr_del++;
|
||||
}
|
||||
}
|
||||
|
||||
if (nr_probes - nr_del == 0) {
|
||||
/* N -> 0, (N > 1) */
|
||||
entry->single.func = __mark_empty_function;
|
||||
entry->refcount = 0;
|
||||
entry->ptype = 0;
|
||||
} else if (nr_probes - nr_del == 1) {
|
||||
/* N -> 1, (N > 1) */
|
||||
for (i = 0; old[i].func; i++)
|
||||
if ((probe && old[i].func != probe) ||
|
||||
old[i].probe_private != probe_private)
|
||||
entry->single = old[i];
|
||||
entry->refcount = 1;
|
||||
entry->ptype = 0;
|
||||
} else {
|
||||
int j = 0;
|
||||
/* N -> M, (N > 1, M > 1) */
|
||||
/* + 1 for NULL */
|
||||
new = kzalloc((nr_probes - nr_del + 1)
|
||||
* sizeof(struct marker_probe_closure), GFP_KERNEL);
|
||||
if (new == NULL)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
for (i = 0; old[i].func; i++)
|
||||
if ((probe && old[i].func != probe) ||
|
||||
old[i].probe_private != probe_private)
|
||||
new[j++] = old[i];
|
||||
entry->refcount = nr_probes - nr_del;
|
||||
entry->ptype = 1;
|
||||
entry->multi = new;
|
||||
}
|
||||
debug_print_probes(entry);
|
||||
return old;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get marker if the marker is present in the marker hash table.
|
||||
* Must be called with markers_mutex held.
|
||||
* Returns NULL if not present.
|
||||
*/
|
||||
static struct marker_entry *get_marker(const char *name)
|
||||
{
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
struct marker_entry *e;
|
||||
u32 hash = jhash(name, strlen(name), 0);
|
||||
|
||||
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
|
||||
hlist_for_each_entry(e, node, head, hlist) {
|
||||
if (!strcmp(name, e->name))
|
||||
return e;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the marker to the marker hash table. Must be called with markers_mutex
|
||||
* held.
|
||||
*/
|
||||
static struct marker_entry *add_marker(const char *name, const char *format)
|
||||
{
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
struct marker_entry *e;
|
||||
size_t name_len = strlen(name) + 1;
|
||||
size_t format_len = 0;
|
||||
u32 hash = jhash(name, name_len-1, 0);
|
||||
|
||||
if (format)
|
||||
format_len = strlen(format) + 1;
|
||||
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
|
||||
hlist_for_each_entry(e, node, head, hlist) {
|
||||
if (!strcmp(name, e->name)) {
|
||||
printk(KERN_NOTICE
|
||||
"Marker %s busy\n", name);
|
||||
return ERR_PTR(-EBUSY); /* Already there */
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Using kmalloc here to allocate a variable length element. Could
|
||||
* cause some memory fragmentation if overused.
|
||||
*/
|
||||
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
|
||||
GFP_KERNEL);
|
||||
if (!e)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
memcpy(&e->name[0], name, name_len);
|
||||
if (format) {
|
||||
e->format = &e->name[name_len];
|
||||
memcpy(e->format, format, format_len);
|
||||
if (strcmp(e->format, MARK_NOARGS) == 0)
|
||||
e->call = marker_probe_cb_noarg;
|
||||
else
|
||||
e->call = marker_probe_cb;
|
||||
trace_mark(core_marker_format, "name %s format %s",
|
||||
e->name, e->format);
|
||||
} else {
|
||||
e->format = NULL;
|
||||
e->call = marker_probe_cb;
|
||||
}
|
||||
e->single.func = __mark_empty_function;
|
||||
e->single.probe_private = NULL;
|
||||
e->multi = NULL;
|
||||
e->ptype = 0;
|
||||
e->format_allocated = 0;
|
||||
e->refcount = 0;
|
||||
e->rcu_pending = 0;
|
||||
hlist_add_head(&e->hlist, head);
|
||||
return e;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the marker from the marker hash table. Must be called with mutex_lock
|
||||
* held.
|
||||
*/
|
||||
static int remove_marker(const char *name)
|
||||
{
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
struct marker_entry *e;
|
||||
int found = 0;
|
||||
size_t len = strlen(name) + 1;
|
||||
u32 hash = jhash(name, len-1, 0);
|
||||
|
||||
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
|
||||
hlist_for_each_entry(e, node, head, hlist) {
|
||||
if (!strcmp(name, e->name)) {
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
return -ENOENT;
|
||||
if (e->single.func != __mark_empty_function)
|
||||
return -EBUSY;
|
||||
hlist_del(&e->hlist);
|
||||
if (e->format_allocated)
|
||||
kfree(e->format);
|
||||
/* Make sure the call_rcu has been executed */
|
||||
if (e->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
kfree(e);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the mark_entry format to the format found in the element.
|
||||
*/
|
||||
static int marker_set_format(struct marker_entry *entry, const char *format)
|
||||
{
|
||||
entry->format = kstrdup(format, GFP_KERNEL);
|
||||
if (!entry->format)
|
||||
return -ENOMEM;
|
||||
entry->format_allocated = 1;
|
||||
|
||||
trace_mark(core_marker_format, "name %s format %s",
|
||||
entry->name, entry->format);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sets the probe callback corresponding to one marker.
|
||||
*/
|
||||
static int set_marker(struct marker_entry *entry, struct marker *elem,
|
||||
int active)
|
||||
{
|
||||
int ret = 0;
|
||||
WARN_ON(strcmp(entry->name, elem->name) != 0);
|
||||
|
||||
if (entry->format) {
|
||||
if (strcmp(entry->format, elem->format) != 0) {
|
||||
printk(KERN_NOTICE
|
||||
"Format mismatch for probe %s "
|
||||
"(%s), marker (%s)\n",
|
||||
entry->name,
|
||||
entry->format,
|
||||
elem->format);
|
||||
return -EPERM;
|
||||
}
|
||||
} else {
|
||||
ret = marker_set_format(entry, elem->format);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* probe_cb setup (statically known) is done here. It is
|
||||
* asynchronous with the rest of execution, therefore we only
|
||||
* pass from a "safe" callback (with argument) to an "unsafe"
|
||||
* callback (does not set arguments).
|
||||
*/
|
||||
elem->call = entry->call;
|
||||
/*
|
||||
* Sanity check :
|
||||
* We only update the single probe private data when the ptr is
|
||||
* set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
|
||||
*/
|
||||
WARN_ON(elem->single.func != __mark_empty_function
|
||||
&& elem->single.probe_private != entry->single.probe_private
|
||||
&& !elem->ptype);
|
||||
elem->single.probe_private = entry->single.probe_private;
|
||||
/*
|
||||
* Make sure the private data is valid when we update the
|
||||
* single probe ptr.
|
||||
*/
|
||||
smp_wmb();
|
||||
elem->single.func = entry->single.func;
|
||||
/*
|
||||
* We also make sure that the new probe callbacks array is consistent
|
||||
* before setting a pointer to it.
|
||||
*/
|
||||
rcu_assign_pointer(elem->multi, entry->multi);
|
||||
/*
|
||||
* Update the function or multi probe array pointer before setting the
|
||||
* ptype.
|
||||
*/
|
||||
smp_wmb();
|
||||
elem->ptype = entry->ptype;
|
||||
|
||||
if (elem->tp_name && (active ^ elem->state)) {
|
||||
WARN_ON(!elem->tp_cb);
|
||||
/*
|
||||
* It is ok to directly call the probe registration because type
|
||||
* checking has been done in the __trace_mark_tp() macro.
|
||||
*/
|
||||
|
||||
if (active) {
|
||||
/*
|
||||
* try_module_get should always succeed because we hold
|
||||
* lock_module() to get the tp_cb address.
|
||||
*/
|
||||
ret = try_module_get(__module_text_address(
|
||||
(unsigned long)elem->tp_cb));
|
||||
BUG_ON(!ret);
|
||||
ret = tracepoint_probe_register_noupdate(
|
||||
elem->tp_name,
|
||||
elem->tp_cb);
|
||||
} else {
|
||||
ret = tracepoint_probe_unregister_noupdate(
|
||||
elem->tp_name,
|
||||
elem->tp_cb);
|
||||
/*
|
||||
* tracepoint_probe_update_all() must be called
|
||||
* before the module containing tp_cb is unloaded.
|
||||
*/
|
||||
module_put(__module_text_address(
|
||||
(unsigned long)elem->tp_cb));
|
||||
}
|
||||
}
|
||||
elem->state = active;
|
||||
|
||||
return ret;
|
||||
}
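To summarize the ordering that the comments above spell out step by step, the store order in this function mirrors, in reverse, the load order in marker_probe_cb() earlier in this file:

/*
 *   set_marker() publishes              marker_probe_cb() consumes
 *   ----------------------              --------------------------
 *   1. single.probe_private             1. ptype
 *   2. smp_wmb()                        2. smp_rmb()
 *   3. single.func                      3. single.func (or multi)
 *   4. rcu_assign_pointer(multi)        4. smp_rmb() / read_barrier_depends()
 *   5. smp_wmb()                        5. single.probe_private / multi[i]
 *   6. ptype
 *
 * Because ptype is written last and read first, a reader that observes the
 * new ptype is guaranteed by the paired barriers to also observe the
 * matching callback and private data.
 */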
|
||||
|
||||
/*
|
||||
* Disable a marker and its probe callback.
|
||||
* Note: only waiting an RCU period after setting elem->call to the empty
|
||||
* function ensures that the original callback is not used anymore. This is ensured
|
||||
* by rcu_read_lock_sched around the call site.
|
||||
*/
|
||||
static void disable_marker(struct marker *elem)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* leave "call" as is. It is known statically. */
|
||||
if (elem->tp_name && elem->state) {
|
||||
WARN_ON(!elem->tp_cb);
|
||||
/*
|
||||
* It is ok to directly call the probe registration because type
|
||||
* checking has been done in the __trace_mark_tp() macro.
|
||||
*/
|
||||
ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
|
||||
elem->tp_cb);
|
||||
WARN_ON(ret);
|
||||
/*
|
||||
* tracepoint_probe_update_all() must be called
|
||||
* before the module containing tp_cb is unloaded.
|
||||
*/
|
||||
module_put(__module_text_address((unsigned long)elem->tp_cb));
|
||||
}
|
||||
elem->state = 0;
|
||||
elem->single.func = __mark_empty_function;
|
||||
/* Update the function before setting the ptype */
|
||||
smp_wmb();
|
||||
elem->ptype = 0; /* single probe */
|
||||
/*
|
||||
* Leave the private data and id there, because removal is racy and
|
||||
* should be done only after an RCU period. These are never used until
|
||||
* the next initialization anyway.
|
||||
*/
|
||||
}
|
||||
|
||||
/**
|
||||
* marker_update_probe_range - Update a probe range
|
||||
* @begin: beginning of the range
|
||||
* @end: end of the range
|
||||
*
|
||||
* Updates the probe callback corresponding to a range of markers.
|
||||
*/
|
||||
void marker_update_probe_range(struct marker *begin,
|
||||
struct marker *end)
|
||||
{
|
||||
struct marker *iter;
|
||||
struct marker_entry *mark_entry;
|
||||
|
||||
mutex_lock(&markers_mutex);
|
||||
for (iter = begin; iter < end; iter++) {
|
||||
mark_entry = get_marker(iter->name);
|
||||
if (mark_entry) {
|
||||
set_marker(mark_entry, iter, !!mark_entry->refcount);
|
||||
/*
|
||||
* ignore error, continue
|
||||
*/
|
||||
} else {
|
||||
disable_marker(iter);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&markers_mutex);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update probes, removing the faulty probes.
|
||||
*
|
||||
* Internal callback only changed before the first probe is connected to it.
|
||||
* Single probe private data can only be changed on 0 -> 1 and 2 -> 1
|
||||
* transitions. All other transitions will leave the old private data valid.
|
||||
* This makes the non-atomicity of the callback/private data updates valid.
|
||||
*
|
||||
* "special case" updates :
|
||||
* 0 -> 1 callback
|
||||
* 1 -> 0 callback
|
||||
* 1 -> 2 callbacks
|
||||
* 2 -> 1 callbacks
|
||||
* Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
|
||||
* Side effect: marker_set_format may delete the marker entry (creating a
|
||||
* replacement).
|
||||
*/
|
||||
static void marker_update_probes(void)
|
||||
{
|
||||
/* Core kernel markers */
|
||||
marker_update_probe_range(__start___markers, __stop___markers);
|
||||
/* Markers in modules. */
|
||||
module_update_markers();
|
||||
tracepoint_probe_update_all();
|
||||
}
|
||||
|
||||
/**
|
||||
* marker_probe_register - Connect a probe to a marker
|
||||
* @name: marker name
|
||||
* @format: format string
|
||||
* @probe: probe handler
|
||||
* @probe_private: probe private data
|
||||
*
|
||||
* private data must be a valid allocated memory address, or NULL.
|
||||
* Returns 0 if ok, error value on error.
|
||||
* The probe address must at least be aligned on the architecture pointer size.
|
||||
*/
|
||||
int marker_probe_register(const char *name, const char *format,
|
||||
marker_probe_func *probe, void *probe_private)
|
||||
{
|
||||
struct marker_entry *entry;
|
||||
int ret = 0;
|
||||
struct marker_probe_closure *old;
|
||||
|
||||
mutex_lock(&markers_mutex);
|
||||
entry = get_marker(name);
|
||||
if (!entry) {
|
||||
entry = add_marker(name, format);
|
||||
if (IS_ERR(entry))
|
||||
ret = PTR_ERR(entry);
|
||||
} else if (format) {
|
||||
if (!entry->format)
|
||||
ret = marker_set_format(entry, format);
|
||||
else if (strcmp(entry->format, format))
|
||||
ret = -EPERM;
|
||||
}
|
||||
if (ret)
|
||||
goto end;
|
||||
|
||||
/*
|
||||
* If we detect that a call_rcu is pending for this marker,
|
||||
* make sure it's executed now.
|
||||
*/
|
||||
if (entry->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
old = marker_entry_add_probe(entry, probe, probe_private);
|
||||
if (IS_ERR(old)) {
|
||||
ret = PTR_ERR(old);
|
||||
goto end;
|
||||
}
|
||||
mutex_unlock(&markers_mutex);
|
||||
marker_update_probes();
|
||||
mutex_lock(&markers_mutex);
|
||||
entry = get_marker(name);
|
||||
if (!entry)
|
||||
goto end;
|
||||
if (entry->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
entry->oldptr = old;
|
||||
entry->rcu_pending = 1;
|
||||
/* write rcu_pending before calling the RCU callback */
|
||||
smp_wmb();
|
||||
call_rcu_sched(&entry->rcu, free_old_closure);
|
||||
end:
|
||||
mutex_unlock(&markers_mutex);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(marker_probe_register);
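For context, this is roughly how the API being removed here was used by its callers. The subsystem and probe names below are invented; the probe signature matches __mark_empty_function() above, the registration prototype is the one in this file, and trace_mark() takes the marker name as a bare identifier, as the core_marker_format call in add_marker() shows.

/* illustrative sketch, not part of this file */
#include <linux/kernel.h>
#include <linux/marker.h>

static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	int value = va_arg(*args, int);	/* matches the "value %d" format */

	printk(KERN_DEBUG "subsys_event hit, value=%d\n", value);
}

static void instrumented_code(void)
{
	/* marker site; a cheap test-and-branch until a probe is armed */
	trace_mark(subsys_event, "value %d", 42);
}

static int tracer_attach(void)
{
	return marker_probe_register("subsys_event", "value %d",
				     my_probe, NULL);
}

static void tracer_detach(void)
{
	marker_probe_unregister("subsys_event", my_probe, NULL);
}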
|
||||
|
||||
/**
|
||||
* marker_probe_unregister - Disconnect a probe from a marker
|
||||
* @name: marker name
|
||||
* @probe: probe function pointer
|
||||
* @probe_private: probe private data
|
||||
*
|
||||
* Returns the private data given to marker_probe_register, or an ERR_PTR().
|
||||
* We do not need to call a synchronize_sched to make sure the probes have
|
||||
* finished running before doing a module unload, because the module unload
|
||||
* itself uses stop_machine(), which insures that every preempt disabled section
|
||||
* has finished.
|
||||
*/
|
||||
int marker_probe_unregister(const char *name,
|
||||
marker_probe_func *probe, void *probe_private)
|
||||
{
|
||||
struct marker_entry *entry;
|
||||
struct marker_probe_closure *old;
|
||||
int ret = -ENOENT;
|
||||
|
||||
mutex_lock(&markers_mutex);
|
||||
entry = get_marker(name);
|
||||
if (!entry)
|
||||
goto end;
|
||||
if (entry->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
old = marker_entry_remove_probe(entry, probe, probe_private);
|
||||
mutex_unlock(&markers_mutex);
|
||||
marker_update_probes();
|
||||
mutex_lock(&markers_mutex);
|
||||
entry = get_marker(name);
|
||||
if (!entry)
|
||||
goto end;
|
||||
if (entry->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
entry->oldptr = old;
|
||||
entry->rcu_pending = 1;
|
||||
/* write rcu_pending before calling the RCU callback */
|
||||
smp_wmb();
|
||||
call_rcu_sched(&entry->rcu, free_old_closure);
|
||||
remove_marker(name); /* Ignore busy error message */
|
||||
ret = 0;
|
||||
end:
|
||||
mutex_unlock(&markers_mutex);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(marker_probe_unregister);
|
||||
|
||||
static struct marker_entry *
|
||||
get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
|
||||
{
|
||||
struct marker_entry *entry;
|
||||
unsigned int i;
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
|
||||
for (i = 0; i < MARKER_TABLE_SIZE; i++) {
|
||||
head = &marker_table[i];
|
||||
hlist_for_each_entry(entry, node, head, hlist) {
|
||||
if (!entry->ptype) {
|
||||
if (entry->single.func == probe
|
||||
&& entry->single.probe_private
|
||||
== probe_private)
|
||||
return entry;
|
||||
} else {
|
||||
struct marker_probe_closure *closure;
|
||||
closure = entry->multi;
|
||||
for (i = 0; closure[i].func; i++) {
|
||||
if (closure[i].func == probe &&
|
||||
closure[i].probe_private
|
||||
== probe_private)
|
||||
return entry;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* marker_probe_unregister_private_data - Disconnect a probe from a marker
|
||||
* @probe: probe function
|
||||
* @probe_private: probe private data
|
||||
*
|
||||
* Unregister a probe by providing the registered private data.
|
||||
* Only removes the first marker found in hash table.
|
||||
* Return 0 on success or error value.
|
||||
* We do not need to call a synchronize_sched to make sure the probes have
|
||||
* finished running before doing a module unload, because the module unload
|
||||
* itself uses stop_machine(), which insures that every preempt disabled section
|
||||
* has finished.
|
||||
*/
|
||||
int marker_probe_unregister_private_data(marker_probe_func *probe,
|
||||
void *probe_private)
|
||||
{
|
||||
struct marker_entry *entry;
|
||||
int ret = 0;
|
||||
struct marker_probe_closure *old;
|
||||
|
||||
mutex_lock(&markers_mutex);
|
||||
entry = get_marker_from_private_data(probe, probe_private);
|
||||
if (!entry) {
|
||||
ret = -ENOENT;
|
||||
goto end;
|
||||
}
|
||||
if (entry->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
old = marker_entry_remove_probe(entry, NULL, probe_private);
|
||||
mutex_unlock(&markers_mutex);
|
||||
marker_update_probes();
|
||||
mutex_lock(&markers_mutex);
|
||||
entry = get_marker_from_private_data(probe, probe_private);
|
||||
if (!entry)
|
||||
goto end;
|
||||
if (entry->rcu_pending)
|
||||
rcu_barrier_sched();
|
||||
entry->oldptr = old;
|
||||
entry->rcu_pending = 1;
|
||||
/* write rcu_pending before calling the RCU callback */
|
||||
smp_wmb();
|
||||
call_rcu_sched(&entry->rcu, free_old_closure);
|
||||
remove_marker(entry->name); /* Ignore busy error message */
|
||||
end:
|
||||
mutex_unlock(&markers_mutex);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
|
||||
|
||||
/**
|
||||
* marker_get_private_data - Get a marker's probe private data
|
||||
* @name: marker name
|
||||
* @probe: probe to match
|
||||
* @num: get the nth matching probe's private data
|
||||
*
|
||||
* Returns the nth private data pointer (starting from 0) matching, or an
|
||||
* ERR_PTR.
|
||||
* Returns the private data pointer, or an ERR_PTR.
|
||||
* The private data pointer should _only_ be dereferenced if the caller is the
|
||||
* owner of the data, or its content could vanish. This is mostly used to
|
||||
* confirm that a caller is the owner of a registered probe.
|
||||
*/
|
||||
void *marker_get_private_data(const char *name, marker_probe_func *probe,
|
||||
int num)
|
||||
{
|
||||
struct hlist_head *head;
|
||||
struct hlist_node *node;
|
||||
struct marker_entry *e;
|
||||
size_t name_len = strlen(name) + 1;
|
||||
u32 hash = jhash(name, name_len-1, 0);
|
||||
int i;
|
||||
|
||||
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
|
||||
hlist_for_each_entry(e, node, head, hlist) {
|
||||
if (!strcmp(name, e->name)) {
|
||||
if (!e->ptype) {
|
||||
if (num == 0 && e->single.func == probe)
|
||||
return e->single.probe_private;
|
||||
} else {
|
||||
struct marker_probe_closure *closure;
|
||||
int match = 0;
|
||||
closure = e->multi;
|
||||
for (i = 0; closure[i].func; i++) {
|
||||
if (closure[i].func != probe)
|
||||
continue;
|
||||
if (match++ == num)
|
||||
return closure[i].probe_private;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return ERR_PTR(-ENOENT);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(marker_get_private_data);
|
||||
|
||||
#ifdef CONFIG_MODULES
|
||||
|
||||
int marker_module_notify(struct notifier_block *self,
|
||||
unsigned long val, void *data)
|
||||
{
|
||||
struct module *mod = data;
|
||||
|
||||
switch (val) {
|
||||
case MODULE_STATE_COMING:
|
||||
marker_update_probe_range(mod->markers,
|
||||
mod->markers + mod->num_markers);
|
||||
break;
|
||||
case MODULE_STATE_GOING:
|
||||
marker_update_probe_range(mod->markers,
|
||||
mod->markers + mod->num_markers);
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct notifier_block marker_module_nb = {
|
||||
.notifier_call = marker_module_notify,
|
||||
.priority = 0,
|
||||
};
|
||||
|
||||
static int init_markers(void)
|
||||
{
|
||||
return register_module_notifier(&marker_module_nb);
|
||||
}
|
||||
__initcall(init_markers);
|
||||
|
||||
#endif /* CONFIG_MODULES */
|
kernel/module.c (183 lines changed)
@@ -47,6 +47,7 @@
|
||||
#include <linux/rculist.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <linux/license.h>
|
||||
#include <asm/sections.h>
|
||||
#include <linux/tracepoint.h>
|
||||
@@ -1535,6 +1536,10 @@ static void free_module(struct module *mod)
|
||||
|
||||
/* Finally, free the core (containing the module structure) */
|
||||
module_free(mod, mod->module_core);
|
||||
|
||||
#ifdef CONFIG_MPU
|
||||
update_protections(current->mm);
|
||||
#endif
|
||||
}
|
||||
|
||||
void *__symbol_get(const char *symbol)
|
||||
@@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
|
||||
}
|
||||
}
|
||||
|
||||
static void free_modinfo(struct module *mod)
|
||||
{
|
||||
struct module_attribute *attr;
|
||||
int i;
|
||||
|
||||
for (i = 0; (attr = modinfo_attrs[i]); i++) {
|
||||
if (attr->free)
|
||||
attr->free(mod);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KALLSYMS
|
||||
|
||||
/* lookup symbol in given range of kernel_symbols */
|
||||
@@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym,
|
||||
return '?';
|
||||
}
|
||||
|
||||
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
|
||||
unsigned int shnum)
|
||||
{
|
||||
const Elf_Shdr *sec;
|
||||
|
||||
if (src->st_shndx == SHN_UNDEF
|
||||
|| src->st_shndx >= shnum
|
||||
|| !src->st_name)
|
||||
return false;
|
||||
|
||||
sec = sechdrs + src->st_shndx;
|
||||
if (!(sec->sh_flags & SHF_ALLOC)
|
||||
#ifndef CONFIG_KALLSYMS_ALL
|
||||
|| !(sec->sh_flags & SHF_EXECINSTR)
|
||||
#endif
|
||||
|| (sec->sh_entsize & INIT_OFFSET_MASK))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static unsigned long layout_symtab(struct module *mod,
|
||||
Elf_Shdr *sechdrs,
|
||||
unsigned int symindex,
|
||||
unsigned int strindex,
|
||||
const Elf_Ehdr *hdr,
|
||||
const char *secstrings,
|
||||
unsigned long *pstroffs,
|
||||
unsigned long *strmap)
|
||||
{
|
||||
unsigned long symoffs;
|
||||
Elf_Shdr *symsect = sechdrs + symindex;
|
||||
Elf_Shdr *strsect = sechdrs + strindex;
|
||||
const Elf_Sym *src;
|
||||
const char *strtab;
|
||||
unsigned int i, nsrc, ndst;
|
||||
|
||||
/* Put symbol section at end of init part of module. */
|
||||
symsect->sh_flags |= SHF_ALLOC;
|
||||
symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
|
||||
symindex) | INIT_OFFSET_MASK;
|
||||
DEBUGP("\t%s\n", secstrings + symsect->sh_name);
|
||||
|
||||
src = (void *)hdr + symsect->sh_offset;
|
||||
nsrc = symsect->sh_size / sizeof(*src);
|
||||
strtab = (void *)hdr + strsect->sh_offset;
|
||||
for (ndst = i = 1; i < nsrc; ++i, ++src)
|
||||
if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
|
||||
unsigned int j = src->st_name;
|
||||
|
||||
while(!__test_and_set_bit(j, strmap) && strtab[j])
|
||||
++j;
|
||||
++ndst;
|
||||
}
|
||||
|
||||
/* Append room for core symbols at end of core part. */
|
||||
symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
|
||||
mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
|
||||
|
||||
/* Put string table section at end of init part of module. */
|
||||
strsect->sh_flags |= SHF_ALLOC;
|
||||
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
|
||||
strindex) | INIT_OFFSET_MASK;
|
||||
DEBUGP("\t%s\n", secstrings + strsect->sh_name);
|
||||
|
||||
/* Append room for core symbols' strings at end of core part. */
|
||||
*pstroffs = mod->core_size;
|
||||
__set_bit(0, strmap);
|
||||
mod->core_size += bitmap_weight(strmap, strsect->sh_size);
|
||||
|
||||
return symoffs;
|
||||
}
|
||||
|
||||
static void add_kallsyms(struct module *mod,
|
||||
Elf_Shdr *sechdrs,
|
||||
unsigned int shnum,
|
||||
unsigned int symindex,
|
||||
unsigned int strindex,
|
||||
const char *secstrings)
|
||||
unsigned long symoffs,
|
||||
unsigned long stroffs,
|
||||
const char *secstrings,
|
||||
unsigned long *strmap)
|
||||
{
|
||||
unsigned int i;
|
||||
unsigned int i, ndst;
|
||||
const Elf_Sym *src;
|
||||
Elf_Sym *dst;
|
||||
char *s;
|
||||
|
||||
mod->symtab = (void *)sechdrs[symindex].sh_addr;
|
||||
mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
|
||||
@@ -1873,13 +1969,44 @@ static void add_kallsyms(struct module *mod,
|
||||
for (i = 0; i < mod->num_symtab; i++)
|
||||
mod->symtab[i].st_info
|
||||
= elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
|
||||
|
||||
mod->core_symtab = dst = mod->module_core + symoffs;
|
||||
src = mod->symtab;
|
||||
*dst = *src;
|
||||
for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
|
||||
if (!is_core_symbol(src, sechdrs, shnum))
|
||||
continue;
|
||||
dst[ndst] = *src;
|
||||
dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
|
||||
++ndst;
|
||||
}
|
||||
mod->core_num_syms = ndst;
|
||||
|
||||
mod->core_strtab = s = mod->module_core + stroffs;
|
||||
for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
|
||||
if (test_bit(i, strmap))
|
||||
*++s = mod->strtab[i];
|
||||
}
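A small worked example of the strmap scheme may help; the symbol names are invented, everything else follows the marking loop in layout_symtab() and the two copy loops above.

/*
 * Old module strtab:  "\0init_helper\0do_work\0"   (strings at offsets 1 and 13)
 * Suppose only do_work survives into the core symbol table.
 * layout_symtab() sets bit 0 plus bits 13..20 of strmap (the characters of
 * "do_work" and its terminating NUL), so the core string table shrinks to
 * bitmap_weight() over the section size = 9 bytes: "\0do_work\0".
 * add_kallsyms() then relocates the kept symbol with
 *     dst.st_name = bitmap_weight(strmap, 13) = 1,
 * i.e. the number of kept bytes in front of the old offset, which is
 * exactly where "do_work" starts in the compacted table.
 */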
|
||||
#else
|
||||
static inline unsigned long layout_symtab(struct module *mod,
|
||||
Elf_Shdr *sechdrs,
|
||||
unsigned int symindex,
|
||||
unsigned int strindex,
|
||||
const Elf_Ehdr *hdr,
|
||||
const char *secstrings,
|
||||
unsigned long *pstroffs,
|
||||
unsigned long *strmap)
|
||||
{
return 0;
}
|
||||
static inline void add_kallsyms(struct module *mod,
|
||||
Elf_Shdr *sechdrs,
|
||||
unsigned int shnum,
|
||||
unsigned int symindex,
|
||||
unsigned int strindex,
|
||||
const char *secstrings)
|
||||
unsigned long symoffs,
|
||||
unsigned long stroffs,
|
||||
const char *secstrings,
|
||||
const unsigned long *strmap)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_KALLSYMS */
|
||||
@@ -1954,6 +2081,9 @@ static noinline struct module *load_module(void __user *umod,
|
||||
struct module *mod;
|
||||
long err = 0;
|
||||
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
|
||||
#ifdef CONFIG_KALLSYMS
|
||||
unsigned long symoffs, stroffs, *strmap;
|
||||
#endif
|
||||
mm_segment_t old_fs;
|
||||
|
||||
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
|
||||
@@ -2035,11 +2165,6 @@ static noinline struct module *load_module(void __user *umod,
|
||||
/* Don't keep modinfo and version sections. */
|
||||
sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
|
||||
sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
|
||||
#ifdef CONFIG_KALLSYMS
|
||||
/* Keep symbol and string tables for decoding later. */
|
||||
sechdrs[symindex].sh_flags |= SHF_ALLOC;
|
||||
sechdrs[strindex].sh_flags |= SHF_ALLOC;
|
||||
#endif
|
||||
|
||||
/* Check module struct version now, before we try to use module. */
|
||||
if (!check_modstruct_version(sechdrs, versindex, mod)) {
|
||||
@@ -2075,6 +2200,13 @@ static noinline struct module *load_module(void __user *umod,
|
||||
goto free_hdr;
|
||||
}
|
||||
|
||||
strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
|
||||
* sizeof(long), GFP_KERNEL);
|
||||
if (!strmap) {
|
||||
err = -ENOMEM;
|
||||
goto free_mod;
|
||||
}
|
||||
|
||||
if (find_module(mod->name)) {
|
||||
err = -EEXIST;
|
||||
goto free_mod;
|
||||
@@ -2104,6 +2236,8 @@ static noinline struct module *load_module(void __user *umod,
|
||||
this is done generically; there doesn't appear to be any
|
||||
special cases for the architectures. */
|
||||
layout_sections(mod, hdr, sechdrs, secstrings);
|
||||
symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
|
||||
secstrings, &stroffs, strmap);
|
||||
|
||||
/* Do the allocs. */
|
||||
ptr = module_alloc_update_bounds(mod->core_size);
|
||||
@@ -2237,10 +2371,6 @@ static noinline struct module *load_module(void __user *umod,
|
||||
sizeof(*mod->ctors), &mod->num_ctors);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MARKERS
|
||||
mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
|
||||
sizeof(*mod->markers), &mod->num_markers);
|
||||
#endif
|
||||
#ifdef CONFIG_TRACEPOINTS
|
||||
mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
|
||||
"__tracepoints",
|
||||
@@ -2312,7 +2442,10 @@ static noinline struct module *load_module(void __user *umod,
|
||||
percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
|
||||
sechdrs[pcpuindex].sh_size);
|
||||
|
||||
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
|
||||
add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
|
||||
symoffs, stroffs, secstrings, strmap);
|
||||
kfree(strmap);
|
||||
strmap = NULL;
|
||||
|
||||
if (!mod->taints) {
|
||||
struct _ddebug *debug;
|
||||
@@ -2384,13 +2517,14 @@ static noinline struct module *load_module(void __user *umod,
|
||||
synchronize_sched();
|
||||
module_arch_cleanup(mod);
|
||||
cleanup:
|
||||
free_modinfo(mod);
|
||||
kobject_del(&mod->mkobj.kobj);
|
||||
kobject_put(&mod->mkobj.kobj);
|
||||
free_unload:
|
||||
module_unload_free(mod);
|
||||
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
|
||||
free_init:
|
||||
percpu_modfree(mod->refptr);
|
||||
free_init:
|
||||
#endif
|
||||
module_free(mod, mod->module_init);
|
||||
free_core:
|
||||
@@ -2401,6 +2535,7 @@ static noinline struct module *load_module(void __user *umod,
|
||||
percpu_modfree(percpu);
|
||||
free_mod:
|
||||
kfree(args);
|
||||
kfree(strmap);
|
||||
free_hdr:
|
||||
vfree(hdr);
|
||||
return ERR_PTR(err);
|
||||
@@ -2490,6 +2625,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
|
||||
/* Drop initial reference. */
|
||||
module_put(mod);
|
||||
trim_init_extable(mod);
|
||||
#ifdef CONFIG_KALLSYMS
|
||||
mod->num_symtab = mod->core_num_syms;
|
||||
mod->symtab = mod->core_symtab;
|
||||
mod->strtab = mod->core_strtab;
|
||||
#endif
|
||||
module_free(mod, mod->module_init);
|
||||
mod->module_init = NULL;
|
||||
mod->init_size = 0;
|
||||
@@ -2951,27 +3091,12 @@ void module_layout(struct module *mod,
|
||||
struct modversion_info *ver,
|
||||
struct kernel_param *kp,
|
||||
struct kernel_symbol *ks,
|
||||
struct marker *marker,
|
||||
struct tracepoint *tp)
|
||||
{
|
||||
}
|
||||
EXPORT_SYMBOL(module_layout);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MARKERS
|
||||
void module_update_markers(void)
|
||||
{
|
||||
struct module *mod;
|
||||
|
||||
mutex_lock(&module_mutex);
|
||||
list_for_each_entry(mod, &modules, list)
|
||||
if (!mod->taints)
|
||||
marker_update_probe_range(mod->markers,
|
||||
mod->markers + mod->num_markers);
|
||||
mutex_unlock(&module_mutex);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TRACEPOINTS
|
||||
void module_update_tracepoints(void)
|
||||
{
|
||||
|
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
|
||||
* (hence either you are in the same cgroup as task, or in an
|
||||
* ancestor cgroup thereof)
|
||||
*/
|
||||
static int ns_can_attach(struct cgroup_subsys *ss,
|
||||
struct cgroup *new_cgroup, struct task_struct *task)
|
||||
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
|
||||
struct task_struct *task, bool threadgroup)
|
||||
{
|
||||
if (current != task) {
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
|
||||
if (!cgroup_is_descendant(new_cgroup, task))
|
||||
return -EPERM;
|
||||
|
||||
if (threadgroup) {
|
||||
struct task_struct *c;
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
|
||||
if (!cgroup_is_descendant(new_cgroup, c)) {
|
||||
rcu_read_unlock();
|
||||
return -EPERM;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -177,7 +177,7 @@ static const struct tnt tnts[] = {
|
||||
* 'W' - Taint on warning.
|
||||
* 'C' - modules from drivers/staging are loaded.
|
||||
*
|
||||
* The string is overwritten by the next call to print_taint().
|
||||
* The string is overwritten by the next call to print_tainted().
|
||||
*/
|
||||
const char *print_tainted(void)
|
||||
{
|
||||
|
@@ -23,6 +23,7 @@
|
||||
#include <linux/device.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/ctype.h>
|
||||
|
||||
#if 0
|
||||
#define DEBUGP printk
|
||||
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
|
||||
}
|
||||
|
||||
for (i = 0; args[i]; i++) {
|
||||
if (args[i] == ' ' && !in_quote)
|
||||
if (isspace(args[i]) && !in_quote)
|
||||
break;
|
||||
if (equals == 0) {
|
||||
if (args[i] == '=')
|
||||
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
|
||||
next = args + i;
|
||||
|
||||
/* Chew up trailing spaces. */
|
||||
while (*next == ' ')
|
||||
while (isspace(*next))
|
||||
next++;
|
||||
return next;
|
||||
}
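A concrete case the old test mishandled, with a hypothetical command line:

/*
 * With the ' '-only check, a boot command line such as
 *
 *     console=ttyS0<TAB>loglevel=7
 *
 * is parsed as one parameter: the tab neither terminates the "console"
 * argument nor is chewed up as leading/trailing whitespace, so the value
 * becomes "ttyS0<TAB>loglevel=7".  With isspace(), tabs, newlines and
 * carriage returns that a bootloader may emit separate parameters exactly
 * like spaces do.
 */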
|
||||
@@ -138,7 +139,7 @@ int parse_args(const char *name,
|
||||
DEBUGP("Parsing ARGS: %s\n", args);
|
||||
|
||||
/* Chew leading spaces */
|
||||
while (*args == ' ')
|
||||
while (isspace(*args))
|
||||
args++;
|
||||
|
||||
while (*args) {
|
||||
|
kernel/perf_event.c (new file, 5000 lines; diff suppressed because it is too large)

kernel/pid.c (15 lines changed)
@@ -40,7 +40,7 @@
|
||||
#define pid_hashfn(nr, ns) \
|
||||
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
|
||||
static struct hlist_head *pid_hash;
|
||||
static int pidhash_shift;
|
||||
static unsigned int pidhash_shift = 4;
|
||||
struct pid init_struct_pid = INIT_STRUCT_PID;
|
||||
|
||||
int pid_max = PID_MAX_DEFAULT;
|
||||
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
|
||||
void __init pidhash_init(void)
|
||||
{
|
||||
int i, pidhash_size;
|
||||
unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
|
||||
|
||||
pidhash_shift = max(4, fls(megabytes * 4));
|
||||
pidhash_shift = min(12, pidhash_shift);
|
||||
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
|
||||
HASH_EARLY | HASH_SMALL,
|
||||
&pidhash_shift, NULL, 4096);
|
||||
pidhash_size = 1 << pidhash_shift;
|
||||
|
||||
printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
|
||||
pidhash_size, pidhash_shift,
|
||||
pidhash_size * sizeof(struct hlist_head));
|
||||
|
||||
pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
|
||||
if (!pid_hash)
|
||||
panic("Could not alloc pidhash!\n");
|
||||
for (i = 0; i < pidhash_size; i++)
|
||||
INIT_HLIST_HEAD(&pid_hash[i]);
|
||||
}
|
||||
|
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
|
||||
{
|
||||
if (!(flags & CLONE_NEWPID))
|
||||
return get_pid_ns(old_ns);
|
||||
if (flags & CLONE_THREAD)
|
||||
if (flags & (CLONE_THREAD|CLONE_PARENT))
|
||||
return ERR_PTR(-EINVAL);
|
||||
return create_pid_namespace(old_ns);
|
||||
}
|
||||
|
@@ -8,17 +8,18 @@
|
||||
#include <linux/math64.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <trace/events/timer.h>
|
||||
|
||||
/*
|
||||
* Called after updating RLIMIT_CPU to set timer expiration if necessary.
|
||||
*/
|
||||
void update_rlimit_cpu(unsigned long rlim_new)
|
||||
{
|
||||
cputime_t cputime;
|
||||
cputime_t cputime = secs_to_cputime(rlim_new);
|
||||
struct signal_struct *const sig = current->signal;
|
||||
|
||||
cputime = secs_to_cputime(rlim_new);
|
||||
if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
|
||||
cputime_gt(current->signal->it_prof_expires, cputime)) {
|
||||
if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
|
||||
cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
|
||||
spin_lock_irq(¤t->sighand->siglock);
|
||||
set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
|
||||
spin_unlock_irq(¤t->sighand->siglock);
|
||||
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
|
||||
now);
|
||||
}
|
||||
|
||||
static inline int expires_gt(cputime_t expires, cputime_t new_exp)
|
||||
{
|
||||
return cputime_eq(expires, cputime_zero) ||
|
||||
cputime_gt(expires, new_exp);
|
||||
}
|
||||
|
||||
static inline int expires_le(cputime_t expires, cputime_t new_exp)
|
||||
{
|
||||
return !cputime_eq(expires, cputime_zero) &&
|
||||
cputime_le(expires, new_exp);
|
||||
}
|
||||
/*
|
||||
* Insert the timer on the appropriate list before any timers that
|
||||
* expire later. This must be called with the tasklist_lock held
|
||||
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
|
||||
*/
|
||||
|
||||
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
|
||||
union cpu_time_count *exp = &nt->expires;
|
||||
|
||||
switch (CPUCLOCK_WHICH(timer->it_clock)) {
|
||||
default:
|
||||
BUG();
|
||||
case CPUCLOCK_PROF:
|
||||
if (cputime_eq(p->cputime_expires.prof_exp,
|
||||
cputime_zero) ||
|
||||
cputime_gt(p->cputime_expires.prof_exp,
|
||||
nt->expires.cpu))
|
||||
p->cputime_expires.prof_exp =
|
||||
nt->expires.cpu;
|
||||
if (expires_gt(p->cputime_expires.prof_exp,
|
||||
exp->cpu))
|
||||
p->cputime_expires.prof_exp = exp->cpu;
|
||||
break;
|
||||
case CPUCLOCK_VIRT:
|
||||
if (cputime_eq(p->cputime_expires.virt_exp,
|
||||
cputime_zero) ||
|
||||
cputime_gt(p->cputime_expires.virt_exp,
|
||||
nt->expires.cpu))
|
||||
p->cputime_expires.virt_exp =
|
||||
nt->expires.cpu;
|
||||
if (expires_gt(p->cputime_expires.virt_exp,
|
||||
exp->cpu))
|
||||
p->cputime_expires.virt_exp = exp->cpu;
|
||||
break;
|
||||
case CPUCLOCK_SCHED:
|
||||
if (p->cputime_expires.sched_exp == 0 ||
|
||||
p->cputime_expires.sched_exp >
|
||||
nt->expires.sched)
|
||||
p->cputime_expires.sched_exp > exp->sched)
|
||||
p->cputime_expires.sched_exp =
|
||||
nt->expires.sched;
|
||||
exp->sched;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
struct signal_struct *const sig = p->signal;
|
||||
union cpu_time_count *exp = &timer->it.cpu.expires;
|
||||
|
||||
/*
|
||||
* For a process timer, set the cached expiration time.
|
||||
*/
|
||||
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
|
||||
default:
|
||||
BUG();
|
||||
case CPUCLOCK_VIRT:
|
||||
if (!cputime_eq(p->signal->it_virt_expires,
|
||||
cputime_zero) &&
|
||||
cputime_lt(p->signal->it_virt_expires,
|
||||
timer->it.cpu.expires.cpu))
|
||||
if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
|
||||
exp->cpu))
|
||||
break;
|
||||
p->signal->cputime_expires.virt_exp =
|
||||
timer->it.cpu.expires.cpu;
|
||||
sig->cputime_expires.virt_exp = exp->cpu;
|
||||
break;
|
||||
case CPUCLOCK_PROF:
|
||||
if (!cputime_eq(p->signal->it_prof_expires,
|
||||
cputime_zero) &&
|
||||
cputime_lt(p->signal->it_prof_expires,
|
||||
timer->it.cpu.expires.cpu))
|
||||
if (expires_le(sig->it[CPUCLOCK_PROF].expires,
|
||||
exp->cpu))
|
||||
break;
|
||||
i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
|
||||
i = sig->rlim[RLIMIT_CPU].rlim_cur;
|
||||
if (i != RLIM_INFINITY &&
|
||||
i <= cputime_to_secs(timer->it.cpu.expires.cpu))
|
||||
i <= cputime_to_secs(exp->cpu))
|
||||
break;
|
||||
p->signal->cputime_expires.prof_exp =
|
||||
timer->it.cpu.expires.cpu;
|
||||
sig->cputime_expires.prof_exp = exp->cpu;
|
||||
break;
|
||||
case CPUCLOCK_SCHED:
|
||||
p->signal->cputime_expires.sched_exp =
|
||||
timer->it.cpu.expires.sched;
|
||||
sig->cputime_expires.sched_exp = exp->sched;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
|
||||
spin_unlock_irqrestore(&cputimer->lock, flags);
|
||||
}
|
||||
|
||||
static u32 onecputick;
|
||||
|
||||
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
|
||||
cputime_t *expires, cputime_t cur_time, int signo)
|
||||
{
|
||||
if (cputime_eq(it->expires, cputime_zero))
|
||||
return;
|
||||
|
||||
if (cputime_ge(cur_time, it->expires)) {
|
||||
if (!cputime_eq(it->incr, cputime_zero)) {
|
||||
it->expires = cputime_add(it->expires, it->incr);
|
||||
it->error += it->incr_error;
|
||||
if (it->error >= onecputick) {
|
||||
it->expires = cputime_sub(it->expires,
|
||||
cputime_one_jiffy);
|
||||
it->error -= onecputick;
|
||||
}
|
||||
} else {
|
||||
it->expires = cputime_zero;
|
||||
}
|
||||
|
||||
trace_itimer_expire(signo == SIGPROF ?
|
||||
ITIMER_PROF : ITIMER_VIRTUAL,
|
||||
tsk->signal->leader_pid, cur_time);
|
||||
__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
|
||||
}
|
||||
|
||||
if (!cputime_eq(it->expires, cputime_zero) &&
|
||||
(cputime_eq(*expires, cputime_zero) ||
|
||||
cputime_lt(it->expires, *expires))) {
|
||||
*expires = it->expires;
|
||||
}
|
||||
}
|
||||

/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
* Don't sample the current process CPU clocks if there are no timers.
*/
if (list_empty(&timers[CPUCLOCK_PROF]) &&
cputime_eq(sig->it_prof_expires, cputime_zero) &&
cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
list_empty(&timers[CPUCLOCK_VIRT]) &&
cputime_eq(sig->it_virt_expires, cputime_zero) &&
cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
list_empty(&timers[CPUCLOCK_SCHED])) {
stop_process_timers(tsk);
return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
/*
* Check for the special case process timers.
*/
if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
if (cputime_ge(ptime, sig->it_prof_expires)) {
/* ITIMER_PROF fires and reloads. */
sig->it_prof_expires = sig->it_prof_incr;
if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
sig->it_prof_expires = cputime_add(
sig->it_prof_expires, ptime);
}
__group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
}
if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
(cputime_eq(prof_expires, cputime_zero) ||
cputime_lt(sig->it_prof_expires, prof_expires))) {
prof_expires = sig->it_prof_expires;
}
}
if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
if (cputime_ge(utime, sig->it_virt_expires)) {
/* ITIMER_VIRTUAL fires and reloads. */
sig->it_virt_expires = sig->it_virt_incr;
if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
sig->it_virt_expires = cputime_add(
sig->it_virt_expires, utime);
}
__group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
}
if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
(cputime_eq(virt_expires, cputime_zero) ||
cputime_lt(sig->it_virt_expires, virt_expires))) {
virt_expires = sig->it_virt_expires;
}
}
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);

if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
if (!cputime_eq(*oldval, cputime_zero)) {
if (cputime_le(*oldval, now.cpu)) {
/* Just about to fire. */
*oldval = jiffies_to_cputime(1);
*oldval = cputime_one_jiffy;
} else {
*oldval = cputime_sub(*oldval, now.cpu);
}
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
.nsleep = thread_cpu_nsleep,
.nsleep_restart = thread_cpu_nsleep_restart,
};
struct timespec ts;

register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);

cputime_to_timespec(cputime_one_jiffy, &ts);
onecputick = ts.tv_nsec;
WARN_ON(ts.tv_sec != 0);

return 0;
}
__initcall(init_posix_cpu_timers);

@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
return 0;
}

static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
{
*tp = current_kernel_time();
return 0;
}

static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec *tp)
{
*tp = get_monotonic_coarse();
return 0;
}

int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
{
*tp = ktime_to_timespec(KTIME_LOW_RES);
return 0;
}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};
struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_realtime_coarse,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};
struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_monotonic_coarse,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};

register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);

posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
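The coarse clocks registered above are visible from user space through the standard POSIX clock calls. A minimal sketch, assuming a Linux libc that exposes the CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE constants (older glibc may need -lrt at link time):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res, now;

	/* Resolution of the coarse clock is the tick length (KTIME_LOW_RES). */
	if (clock_getres(CLOCK_MONOTONIC_COARSE, &res) == 0)
		printf("monotonic-coarse resolution: %ld ns\n", res.tv_nsec);
	if (clock_gettime(CLOCK_MONOTONIC_COARSE, &now) == 0)
		printf("monotonic-coarse now: %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);
	if (clock_gettime(CLOCK_REALTIME_COARSE, &now) == 0)
		printf("realtime-coarse now: %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);
	return 0;
}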
@@ -14,56 +14,13 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)

static int orig_fgconsole, orig_kmsg;
static int disable_vt_switch;

/*
* Normally during a suspend, we allocate a new console and switch to it.
* When we resume, we switch back to the original console. This switch
* can be slow, so on systems where the framebuffer can handle restoration
* of video registers anyways, there's little point in doing the console
* switch. This function allows you to disable it by passing it '0'.
*/
void pm_set_vt_switch(int do_switch)
{
acquire_console_sem();
disable_vt_switch = !do_switch;
release_console_sem();
}
EXPORT_SYMBOL(pm_set_vt_switch);

int pm_prepare_console(void)
{
acquire_console_sem();

if (disable_vt_switch) {
release_console_sem();
return 0;
}

orig_fgconsole = fg_console;

if (vc_allocate(SUSPEND_CONSOLE)) {
/* we can't have a free VC for now. Too bad,
* we don't want to mess the screen for now. */
release_console_sem();
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
}

if (set_console(SUSPEND_CONSOLE)) {
/*
* We're unable to switch to the SUSPEND_CONSOLE.
* Let the calling function know so it can decide
* what to do.
*/
release_console_sem();
return 1;
}
release_console_sem();

if (vt_waitactive(SUSPEND_CONSOLE)) {
pr_debug("Suspend: Can't switch VCs.");
return 1;
}
orig_kmsg = kmsg_redirect;
kmsg_redirect = SUSPEND_CONSOLE;
return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)

void pm_restore_console(void)
{
acquire_console_sem();
if (disable_vt_switch) {
release_console_sem();
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
kmsg_redirect = orig_kmsg;
}
set_console(orig_fgconsole);
release_console_sem();

if (vt_waitactive(orig_fgconsole)) {
pr_debug("Resume: Can't switch VCs.");
return;
}

kmsg_redirect = orig_kmsg;
}
#endif

@@ -9,6 +9,7 @@
#undef DEBUG

#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/syscalls.h>

@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
BUG_ON(!region);
} else
/* This allocation cannot fail */
region = alloc_bootmem_low(sizeof(struct nosave_region));
region = alloc_bootmem(sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);

@@ -13,7 +13,6 @@

#include <linux/module.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/genhd.h>

@@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup);
#ifdef CONFIG_BOOT_PRINTK_DELAY

static unsigned int boot_delay; /* msecs delay after each printk during bootup */
static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */
static unsigned long long loops_per_msec; /* based on boot_delay */

static int __init boot_delay_setup(char *str)
{
unsigned long lpj;
unsigned long long loops_per_msec;

lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str)
if (boot_delay > 10 * 1000)
boot_delay = 0;

printk_delay_msec = loops_per_msec;
printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, printk_delay_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, loops_per_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
return 1;
}
__setup("boot_delay=", boot_delay_setup);
@@ -236,7 +234,7 @@ static void boot_delay_msec(void)
if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
return;

k = (unsigned long long)printk_delay_msec * boot_delay;
k = (unsigned long long)loops_per_msec * boot_delay;

timeout = jiffies + msecs_to_jiffies(boot_delay);
while (k) {
@@ -655,6 +653,20 @@ static int recursion_bug;
static int new_text_line = 1;
static char printk_buf[1024];

int printk_delay_msec __read_mostly;

static inline void printk_delay(void)
{
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;

while (m--) {
mdelay(1);
touch_nmi_watchdog();
}
}
}

asmlinkage int vprintk(const char *fmt, va_list args)
{
int printed_len = 0;
@@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
char *p;

boot_delay_msec();
printk_delay();

preempt_disable();
/* This stops the holder of console_sem just where we want him */

@@ -442,48 +442,51 @@ void profile_tick(int type)

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <asm/uaccess.h>

static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
int len = cpumask_scnprintf(page, count, data);
if (count - len < 2)
return -EINVAL;
len += sprintf(page + len, "\n");
return len;
seq_cpumask(m, prof_cpu_mask);
seq_putc(m, '\n');
return 0;
}

static int prof_cpu_mask_write_proc(struct file *file,
const char __user *buffer, unsigned long count, void *data)
static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, prof_cpu_mask_proc_show, NULL);
}

static ssize_t prof_cpu_mask_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
struct cpumask *mask = data;
unsigned long full_count = count, err;
cpumask_var_t new_value;
int err;

if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;

err = cpumask_parse_user(buffer, count, new_value);
if (!err) {
cpumask_copy(mask, new_value);
err = full_count;
cpumask_copy(prof_cpu_mask, new_value);
err = count;
}
free_cpumask_var(new_value);
return err;
}

static const struct file_operations prof_cpu_mask_proc_fops = {
.open = prof_cpu_mask_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = prof_cpu_mask_proc_write,
};

void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
struct proc_dir_entry *entry;

/* create /proc/irq/prof_cpu_mask */
entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
if (!entry)
return;
entry->data = prof_cpu_mask;
entry->read_proc = prof_cpu_mask_read_proc;
entry->write_proc = prof_cpu_mask_write_proc;
proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
}

/*

@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
* or self-reaping. Do notification now if it would have happened earlier.
* If it should reap itself, return true.
*
* If it's our own child, there is no notification to do.
* But if our normal children self-reap, then this child
* was prevented by ptrace and we must reap it now.
* If it's our own child, there is no notification to do. But if our normal
* children self-reap, then this child was prevented by ptrace and we must
* reap it now, in that case we must also wake up sub-threads sleeping in
* do_wait().
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
do_notify_parent(p, p->exit_signal);
else if (ignoring_children(tracer->sighand))
else if (ignoring_children(tracer->sighand)) {
__wake_up_parent(p, tracer);
p->exit_signal = -1;
}
}
if (task_detached(p)) {
/* Mark it as in the process of being reaped. */

@@ -19,7 +19,7 @@
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
@@ -27,7 +27,7 @@
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* http://lse.sourceforge.net/locking/rcupdate.html
* http://lse.sourceforge.net/locking/rcupdate.html
*
*/
#include <linux/types.h>
@@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head)
complete(&rcu->completion);
}

#ifdef CONFIG_TREE_PREEMPT_RCU

/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -87,7 +89,7 @@ void synchronize_rcu(void)
{
struct rcu_synchronize rcu;

if (rcu_blocking_is_gp())
if (!rcu_scheduler_active)
return;

init_completion(&rcu.completion);
@@ -98,6 +100,46 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);

#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */

/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
* Control will return to the caller some time after a full rcu-sched
* grace period has elapsed, in other words after all currently executing
* rcu-sched read-side critical sections have completed. These read-side
* critical sections are delimited by rcu_read_lock_sched() and
* rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
* local_irq_disable(), and so on may be used in place of
* rcu_read_lock_sched().
*
* This means that all preempt_disable code sequences, including NMI and
* hardware-interrupt handlers, in progress on entry will have completed
* before this primitive returns. However, this does not guarantee that
* softirq handlers will have completed, since in some kernels, these
* handlers can run in process context, and can block.
*
* This primitive provides the guarantees made by the (now removed)
* synchronize_kernel() API. In contrast, synchronize_rcu() only
* guarantees that rcu_read_lock() sections will have completed.
* In "classic RCU", these two guarantees happen to be one and
* the same, but can differ in realtime RCU implementations.
*/
void synchronize_sched(void)
{
struct rcu_synchronize rcu;

if (rcu_blocking_is_gp())
return;

init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu_sched(&rcu.head, wakeme_after_rcu);
/* Wait for it. */
wait_for_completion(&rcu.completion);
}
EXPORT_SYMBOL_GPL(synchronize_sched);

/**
* synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
*

@@ -18,7 +18,7 @@
* Copyright (C) IBM Corporation, 2005, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
* Josh Triplett <josh@freedesktop.org>
* Josh Triplett <josh@freedesktop.org>
*
* See also: Documentation/RCU/torture.txt
*/
@@ -50,7 +50,7 @@

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
"Josh Triplett <josh@freedesktop.org>");
"Josh Triplett <josh@freedesktop.org>");

static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
};

static LIST_HEAD(rcu_torture_freelist);
static struct rcu_torture *rcu_torture_current = NULL;
static long rcu_torture_current_version = 0;
static struct rcu_torture *rcu_torture_current;
static long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_timers = 0;
static long n_rcu_torture_timers;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;

static int stutter_pause_test = 0;
static int stutter_pause_test;

#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
@@ -267,7 +267,8 @@ struct rcu_torture_ops {
int irq_capable;
char *name;
};
static struct rcu_torture_ops *cur_ops = NULL;

static struct rcu_torture_ops *cur_ops;

/*
* Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)

static void rcu_read_delay(struct rcu_random_state *rrsp)
{
long delay;
const long longdelay = 200;
const unsigned long shortdelay_us = 200;
const unsigned long longdelay_ms = 50;

/* We want there to be long-running readers, but not all the time. */
/* We want a short delay sometimes to make a reader delay the grace
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */

delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
if (!delay)
udelay(longdelay);
if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
}

static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = {
.sync = synchronize_rcu,
.cb_barrier = rcu_barrier,
.stats = NULL,
.irq_capable = 1,
.name = "rcu"
.irq_capable = 1,
.name = "rcu"
};

static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -638,7 +642,8 @@ rcu_torture_writer(void *arg)

do {
schedule_timeout_uninterruptible(1);
if ((rp = rcu_torture_alloc()) == NULL)
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
udelay(rcu_random(&rand) & 0x3ff);
@@ -1110,7 +1115,7 @@ rcu_torture_init(void)
printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
torture_type);
mutex_unlock(&fullstop_mutex);
return (-EINVAL);
return -EINVAL;
}
if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1161,7 +1166,7 @@ rcu_torture_init(void)
goto unwind;
}
fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
VERBOSE_PRINTK_ERRSTRING("out of memory");
firsterr = -ENOMEM;
@@ -1170,7 +1175,7 @@ rcu_torture_init(void)
for (i = 0; i < nfakewriters; i++) {
VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
"rcu_torture_fakewriter");
"rcu_torture_fakewriter");
if (IS_ERR(fakewriter_tasks[i])) {
firsterr = PTR_ERR(fakewriter_tasks[i]);
VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
kernel/rcutree.c
@@ -25,7 +25,7 @@
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
* Documentation/RCU
*/
#include <linux/types.h>
#include <linux/kernel.h>
@@ -107,27 +107,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
*/
void rcu_sched_qs(int cpu)
{
unsigned long flags;
struct rcu_data *rdp;

local_irq_save(flags);
rdp = &per_cpu(rcu_sched_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
rcu_preempt_qs(cpu);
local_irq_restore(flags);
barrier();
rdp->passed_quiesc = 1;
rcu_preempt_note_context_switch(cpu);
}

void rcu_bh_qs(int cpu)
{
unsigned long flags;
struct rcu_data *rdp;

local_irq_save(flags);
rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
local_irq_restore(flags);
barrier();
rdp->passed_quiesc = 1;
}

#ifdef CONFIG_NO_HZ
@@ -605,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
{
struct rcu_data *rdp = rsp->rda[smp_processor_id()];
struct rcu_node *rnp = rcu_get_root(rsp);
struct rcu_node *rnp_cur;
struct rcu_node *rnp_end;

if (!cpu_needs_another_gp(rsp, rdp)) {
spin_unlock_irqrestore(&rnp->lock, flags);
@@ -615,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)

/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
record_gp_stall_check_time(rsp);
@@ -631,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)

/* Special-case the common single-level case. */
if (NUM_RCU_NODES == 1) {
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
spin_unlock_irqrestore(&rnp->lock, flags);
return;
@@ -644,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
spin_lock(&rsp->onofflock); /* irqs already disabled. */

/*
* Set the quiescent-state-needed bits in all the non-leaf RCU
* nodes for all currently online CPUs. This operation relies
* on the layout of the hierarchy within the rsp->node[] array.
* Note that other CPUs will access only the leaves of the
* hierarchy, which still indicate that no grace period is in
* progress. In addition, we have excluded CPU-hotplug operations.
*
* We therefore do not need to hold any locks. Any required
* memory barriers will be supplied by the locks guarding the
* leaf rcu_nodes in the hierarchy.
*/

rnp_end = rsp->level[NUM_RCU_LVLS - 1];
for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
rnp_cur->qsmask = rnp_cur->qsmaskinit;

/*
* Now set up the leaf nodes. Here we must be careful. First,
* we need to hold the lock in order to exclude other CPUs, which
* might be contending for the leaf nodes' locks. Second, as
* soon as we initialize a given leaf node, its CPUs might run
* up the rest of the hierarchy. We must therefore acquire locks
* for each node that we touch during this stage. (But we still
* are excluding CPU-hotplug operations.)
* Set the quiescent-state-needed bits in all the rcu_node
* structures for all currently online CPUs in breadth-first
* order, starting from the root rcu_node structure. This
* operation relies on the layout of the hierarchy within the
* rsp->node[] array. Note that other CPUs will access only
* the leaves of the hierarchy, which still indicate that no
* grace period is in progress, at least until the corresponding
* leaf node has been initialized. In addition, we have excluded
* CPU-hotplug operations.
*
* Note that the grace period cannot complete until we finish
* the initialization process, as there will be at least one
* qsmask bit set in the root node until that time, namely the
* one corresponding to this CPU.
* one corresponding to this CPU, due to the fact that we have
* irqs disabled.
*/
rnp_end = &rsp->node[NUM_RCU_NODES];
rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
for (; rnp_cur < rnp_end; rnp_cur++) {
spin_lock(&rnp_cur->lock); /* irqs already disabled. */
rnp_cur->qsmask = rnp_cur->qsmaskinit;
spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
spin_lock(&rnp->lock); /* irqs already disabled. */
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
spin_unlock(&rnp->lock); /* irqs already disabled. */
}

rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
@@ -722,6 +705,7 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
__releases(rnp->lock)
{
WARN_ON_ONCE(rsp->completed == rsp->gpnum);
rsp->completed = rsp->gpnum;
rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -739,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long flags)
__releases(rnp->lock)
{
struct rcu_node *rnp_c;

/* Walk up the rcu_node hierarchy. */
for (;;) {
if (!(rnp->qsmask & mask)) {
@@ -762,8 +748,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
break;
}
spin_unlock_irqrestore(&rnp->lock, flags);
rnp_c = rnp;
rnp = rnp->parent;
spin_lock_irqsave(&rnp->lock, flags);
WARN_ON_ONCE(rnp_c->qsmask);
}

/*
@@ -776,10 +764,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,

/*
* Record a quiescent state for the specified CPU, which must either be
* the current CPU or an offline CPU. The lastcomp argument is used to
* make sure we are still in the grace period of interest. We don't want
* to end the current grace period based on quiescent states detected in
* an earlier grace period!
* the current CPU. The lastcomp argument is used to make sure we are
* still in the grace period of interest. We don't want to end the current
* grace period based on quiescent states detected in an earlier grace
* period!
*/
static void
cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -814,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
rdp = rsp->rda[smp_processor_id()];
rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];

cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -872,7 +859,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_lock_irqsave(&rsp->onofflock, flags);

/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
rnp = rdp->mynode;
rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
do {
spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -881,7 +868,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_unlock(&rnp->lock); /* irqs remain disabled. */
break;
}
rcu_preempt_offline_tasks(rsp, rnp);
rcu_preempt_offline_tasks(rsp, rnp, rdp);
mask = rnp->grpmask;
spin_unlock(&rnp->lock); /* irqs remain disabled. */
rnp = rnp->parent;
@@ -890,9 +877,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)

spin_unlock(&rsp->onofflock); /* irqs remain disabled. */

/* Being offline is a quiescent state, so go record it. */
cpu_quiet(cpu, rsp, rdp, lastcomp);

/*
* Move callbacks from the outgoing CPU to the running CPU.
* Note that the outgoing CPU is now quiscent, so it is now
@@ -1457,20 +1441,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
rnp = rnp->parent;
} while (rnp != NULL && !(rnp->qsmaskinit & mask));

spin_unlock(&rsp->onofflock); /* irqs remain disabled. */

/*
* A new grace period might start here. If so, we will be part of
* it, and its gpnum will be greater than ours, so we will
* participate. It is also possible for the gpnum to have been
* incremented before this function was called, and the bitmasks
* to not be filled out until now, in which case we will also
* participate due to our gpnum being behind.
*/

/* Since it is coming online, the CPU is in a quiescent state. */
cpu_quiet(cpu, rsp, rdp, lastcomp);
local_irq_restore(flags);
spin_unlock_irqrestore(&rsp->onofflock, flags);
}

static void __cpuinit rcu_online_cpu(int cpu)

@@ -142,7 +142,7 @@ struct rcu_data {
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
long qlen; /* # of queued callbacks */
long qlen; /* # of queued callbacks */
long blimit; /* Upper limit on a processed batch */

#ifdef CONFIG_NO_HZ

@@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
*/
static void rcu_preempt_qs_record(int cpu)
static void rcu_preempt_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
barrier();
rdp->passed_quiesc = 1;
}

/*
* We have entered the scheduler or are between softirqs in ksoftirqd.
* If we are in an RCU read-side critical section, we need to reflect
* that in the state of the rcu_node structure corresponding to this CPU.
* Caller must disable hardirqs.
* We have entered the scheduler, and the current task might soon be
* context-switched away from. If this task is in an RCU read-side
* critical section, we will no longer be able to rely on the CPU to
* record that fact, so we enqueue the task on the appropriate entry
* of the blocked_tasks[] array. The task will dequeue itself when
* it exits the outermost enclosing RCU read-side critical section.
* Therefore, the current grace period cannot be permitted to complete
* until the blocked_tasks[] entry indexed by the low-order bit of
* rnp->gpnum empties.
*
* Caller must disable preemption.
*/
static void rcu_preempt_qs(int cpu)
static void rcu_preempt_note_context_switch(int cpu)
{
struct task_struct *t = current;
unsigned long flags;
int phase;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu)
/* Possibly blocking in an RCU read-side critical section. */
rdp = rcu_preempt_state.rda[cpu];
rnp = rdp->mynode;
spin_lock(&rnp->lock);
spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
t->rcu_blocked_node = rnp;

@@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu)
* state for the current grace period), then as long
* as that task remains queued, the current grace period
* cannot end.
*
* But first, note that the current CPU must still be
* on line!
*/
phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
smp_mb(); /* Ensure later ctxt swtch seen after above. */
spin_unlock(&rnp->lock);
spin_unlock_irqrestore(&rnp->lock, flags);
}

/*
@@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
rcu_preempt_qs_record(cpu);
t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
RCU_READ_UNLOCK_GOT_QS);
rcu_preempt_qs(cpu);
local_irq_save(flags);
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
local_irq_restore(flags);
}

/*
@@ -157,7 +171,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
special = t->rcu_read_unlock_special;
if (special & RCU_READ_UNLOCK_NEED_QS) {
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
rcu_preempt_qs(smp_processor_id());
}

/* Hardware IRQ handlers cannot block. */
@@ -177,10 +191,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
for (;;) {
rnp = t->rcu_blocked_node;
spin_lock(&rnp->lock);
spin_lock(&rnp->lock); /* irqs already disabled. */
if (rnp == t->rcu_blocked_node)
break;
spin_unlock(&rnp->lock);
spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
list_del_init(&t->rcu_node_entry);
@@ -194,9 +208,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
if (!empty && rnp->qsmask == 0 &&
list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
t->rcu_read_unlock_special &=
~(RCU_READ_UNLOCK_NEED_QS |
RCU_READ_UNLOCK_GOT_QS);
struct rcu_node *rnp_p;

if (rnp->parent == NULL) {
/* Only one rcu_node in the tree. */
cpu_quiet_msk_finish(&rcu_preempt_state, flags);
@@ -205,9 +218,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
/* Report up the rest of the hierarchy. */
mask = rnp->grpmask;
spin_unlock_irqrestore(&rnp->lock, flags);
rnp = rnp->parent;
spin_lock_irqsave(&rnp->lock, flags);
cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
rnp_p = rnp->parent;
spin_lock_irqsave(&rnp_p->lock, flags);
WARN_ON_ONCE(rnp->qsmask);
cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
return;
}
spin_unlock(&rnp->lock);
@@ -258,6 +272,19 @@ static void rcu_print_task_stall(struct rcu_node *rnp)

#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
* invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
* must be held by the caller.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
WARN_ON_ONCE(rnp->qsmask);
}

/*
* Check for preempted RCU readers for the specified rcu_node structure.
* If the caller needs a reliable answer, it must hold the rcu_node's
@@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
* The caller must hold rnp->lock with irqs disabled.
*/
static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp)
struct rcu_node *rnp,
struct rcu_data *rdp)
{
int i;
struct list_head *lp;
@@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
WARN_ONCE(1, "Last CPU thought to be offlined?");
return; /* Shouldn't happen: at least one CPU online. */
}
WARN_ON_ONCE(rnp != rdp->mynode &&
(!list_empty(&rnp->blocked_tasks[0]) ||
!list_empty(&rnp->blocked_tasks[1])));

/*
* Move tasks up to root rcu_node. Rely on the fact that the
@@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu)
struct task_struct *t = current;

if (t->rcu_read_lock_nesting == 0) {
t->rcu_read_unlock_special &=
~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
rcu_preempt_qs_record(cpu);
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
rcu_preempt_qs(cpu);
return;
}
if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
rcu_preempt_qs_record(cpu);
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
} else if (!(t->rcu_read_unlock_special &
RCU_READ_UNLOCK_NEED_QS)) {
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
}
if (per_cpu(rcu_preempt_data, cpu).qs_pending)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}

/*
@@ -434,7 +457,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* Because preemptable RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
static void rcu_preempt_qs(int cpu)
static void rcu_preempt_note_context_switch(int cpu)
{
}

@@ -450,6 +473,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)

#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */

/*
* Because there is no preemptable RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
WARN_ON_ONCE(rnp->qsmask);
}

/*
* Because preemptable RCU does not exist, there are never any preempted
* RCU readers.
@@ -466,7 +499,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
* tasks that were blocked within RCU read-side critical sections.
*/
static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp)
struct rcu_node *rnp,
struct rcu_data *rdp)
{
}

@@ -20,7 +20,7 @@
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
* Documentation/RCU
*
*/
#include <linux/types.h>

@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/*
* vm_ops for relay file mappings.
*/
static struct vm_operations_struct relay_file_mmap_ops = {
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
.close = relay_file_mmap_close,
};

@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
counter->limit = RESOURCE_MAX;
counter->soft_limit = RESOURCE_MAX;
counter->parent = parent;
}

@@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
}

int res_counter_charge(struct res_counter *counter, unsigned long val,
struct res_counter **limit_fail_at)
struct res_counter **limit_fail_at,
struct res_counter **soft_limit_fail_at)
{
int ret;
unsigned long flags;
struct res_counter *c, *u;

*limit_fail_at = NULL;
if (soft_limit_fail_at)
*soft_limit_fail_at = NULL;
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
ret = res_counter_charge_locked(c, val);
/*
* With soft limits, we return the highest ancestor
* that exceeds its soft limit
*/
if (soft_limit_fail_at &&
!res_counter_soft_limit_check_locked(c))
*soft_limit_fail_at = c;
spin_unlock(&c->lock);
if (ret < 0) {
*limit_fail_at = c;
@@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
counter->usage -= val;
}

void res_counter_uncharge(struct res_counter *counter, unsigned long val)
void res_counter_uncharge(struct res_counter *counter, unsigned long val,
bool *was_soft_limit_excess)
{
unsigned long flags;
struct res_counter *c;
@@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
if (was_soft_limit_excess)
*was_soft_limit_excess =
!res_counter_soft_limit_check_locked(c);
res_counter_uncharge_locked(c, val);
spin_unlock(&c->lock);
}
@@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member)
return &counter->limit;
case RES_FAILCNT:
return &counter->failcnt;
case RES_SOFT_LIMIT:
return &counter->soft_limit;
};

BUG();
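The hierarchical charge path above can be easier to follow as a stand-alone sketch. The following user-space analogue walks from a counter up through its parents, fails hard on the limit, and reports the highest ancestor over its soft limit; the names, the lack of locking, and the missing rollback on failure are simplifications for illustration, not the kernel interface.

#include <stdio.h>

/* Simplified stand-in for struct res_counter. */
struct counter_sketch {
	unsigned long usage, limit, soft_limit;
	struct counter_sketch *parent;
};

static int charge_sketch(struct counter_sketch *c, unsigned long val,
			 struct counter_sketch **limit_fail_at,
			 struct counter_sketch **soft_limit_fail_at)
{
	struct counter_sketch *cur;

	*limit_fail_at = NULL;
	if (soft_limit_fail_at)
		*soft_limit_fail_at = NULL;

	for (cur = c; cur != NULL; cur = cur->parent) {
		if (cur->usage + val > cur->limit) {
			*limit_fail_at = cur;
			return -1;	/* hard limit: charge refused */
		}
		cur->usage += val;
		/* Overwritten on each level, so the highest offender wins. */
		if (soft_limit_fail_at && cur->usage > cur->soft_limit)
			*soft_limit_fail_at = cur;
	}
	return 0;
}

int main(void)
{
	struct counter_sketch root = { .limit = 1000, .soft_limit = 400 };
	struct counter_sketch child = { .limit = 500, .soft_limit = 100, .parent = &root };
	struct counter_sketch *hard, *soft;

	if (charge_sketch(&child, 300, &hard, &soft) == 0)
		printf("charged; soft limit exceeded at %s\n",
		       soft == &root ? "root" : soft == &child ? "child" : "none");
	return 0;
}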
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)

EXPORT_SYMBOL(release_resource);

#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
* Finds the lowest memory reosurce exists within [res->start.res->end)
* the caller must specify res->start, res->end, res->flags.
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
*/
static int find_next_system_ram(struct resource *res)
static int find_next_system_ram(struct resource *res, char *name)
{
resource_size_t start, end;
struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
/* system ram is just marked as IORESOURCE_MEM */
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
continue;
if (p->start > end) {
p = NULL;
break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
res->end = p->end;
return 0;
}
int
walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
int (*func)(unsigned long, unsigned long, void *))

/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
* Now, this function is only for "System RAM".
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
{
struct resource res;
unsigned long pfn, len;
u64 orig_end;
int ret = -1;

res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
while ((res.start < res.end) &&
(find_next_system_ram(&res, "System RAM") >= 0)) {
pfn = (unsigned long)(res.start >> PAGE_SHIFT);
len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
ret = (*func)(pfn, len, arg);
kernel/sched.c
@@ -39,7 +39,7 @@
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -119,8 +119,6 @@
*/
#define RUNTIME_INF ((u64)~0ULL)

static void double_rq_lock(struct rq *rq1, struct rq *rq2);

static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)

#else

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
return 1;
}
#endif

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
@@ -514,14 +505,6 @@ struct root_domain {
#ifdef CONFIG_SMP
struct cpupri cpupri;
#endif
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/*
* Preferred wake up cpu nominated by sched_mc balance that will be
* used when most cpus are idle in the system indicating overall very
* low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
*/
unsigned int sched_mc_preferred_wakeup_cpu;
#endif
};

/*
@@ -646,9 +629,10 @@ struct rq {

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, sync);
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}

static inline int cpu_of(struct rq *rq)
@@ -697,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
int runqueue_is_locked(void)
int runqueue_is_locked(int cpu)
{
int cpu = get_cpu();
struct rq *rq = cpu_rq(cpu);
int ret;

ret = spin_is_locked(&rq->lock);
put_cpu();
return ret;
return spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
@@ -1509,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data)
#endif

#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->load.weight;
}

/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);

if (type == 0 || !sched_feat(LB_BIAS))
return total;

return min(rq->cpu_load[type-1], total);
}

/*
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);

if (type == 0 || !sched_feat(LB_BIAS))
return total;

return max(rq->cpu_load[type-1], total);
}

static struct sched_group *group_of(int cpu)
{
struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);

if (!sd)
return NULL;

return sd->groups;
}

static unsigned long power_of(int cpu)
{
struct sched_group *group = group_of(cpu);

if (!group)
return SCHED_LOAD_SCALE;

return group->cpu_power;
}

static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);

static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1695,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)

#ifdef CONFIG_PREEMPT

static void double_rq_lock(struct rq *rq1, struct rq *rq2);

/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
* way at the expense of forcing extra atomic operations in all
@@ -1959,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
}

#ifdef CONFIG_SMP

/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->load.weight;
}

/*
* Is this task likely cache-hot:
*/
@@ -2023,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
#endif
perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1, 1, NULL, 0);
}
p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2239,185 +2269,6 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);

if (type == 0 || !sched_feat(LB_BIAS))
return total;

return min(rq->cpu_load[type-1], total);
}

/*
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);

if (type == 0 || !sched_feat(LB_BIAS))
return total;

return max(rq->cpu_load[type-1], total);
}

/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;

do {
unsigned long load, avg_load;
int local_group;
int i;

/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
&p->cpus_allowed))
continue;

local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));

/* Tally up the load of all CPUs in the group */
avg_load = 0;

for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);

avg_load += load;
}

/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;

if (local_group) {
this_load = avg_load;
this = group;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
} while (group = group->next, group != sd->groups);

if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}

/*
* find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;

/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
load = weighted_cpuload(i);

if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
idlest = i;
}
}

return idlest;
}

/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
* SD_BALANCE_EXEC.
*
* Balance, ie. select the least loaded group.
*
* Returns the target CPU number, or the same CPU if no balancing is needed.
*
* preempt must be disabled.
*/
static int sched_balance_self(int cpu, int flag)
{
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;

for_each_domain(cpu, tmp) {
/*
* If power savings logic is enabled for a domain, stop there.
*/
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
break;
if (tmp->flags & flag)
sd = tmp;
}

if (sd)
update_shares(sd);

while (sd) {
struct sched_group *group;
int new_cpu, weight;

if (!(sd->flags & flag)) {
sd = sd->child;
continue;
}

group = find_idlest_group(sd, t, cpu);
if (!group) {
sd = sd->child;
continue;
}

new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}

/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = cpumask_weight(sched_domain_span(sd));
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= cpumask_weight(sched_domain_span(tmp)))
break;
if (tmp->flags & flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}

return cpu;
}

#endif /* CONFIG_SMP */

/**
@@ -2455,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p,
*
* returns failure only if the task is already active.
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
static int try_to_wake_up(struct task_struct *p, unsigned int state,
int wake_flags)
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
struct rq *rq;

if (!sched_feat(SYNC_WAKEUPS))
sync = 0;
wake_flags &= ~WF_SYNC;

#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
struct sched_domain *sd;

this_cpu = raw_smp_processor_id();
cpu = task_cpu(p);

for_each_domain(this_cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
update_shares(sd);
break;
}
}
}
#endif
this_cpu = get_cpu();

smp_wmb();
rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
old_state = p->state;
if (!(old_state & state))
if (!(p->state & state))
goto out;

if (p->se.on_rq)
@@ -2493,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)

cpu = task_cpu(p);
orig_cpu = cpu;
this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
if (unlikely(task_running(rq, p)))
goto out_activate;

cpu = p->sched_class->select_task_rq(p, sync);
if (cpu != orig_cpu) {
set_task_cpu(p, cpu);
task_rq_unlock(rq, &flags);
/* might preempt at this point */
rq = task_rq_lock(p, &flags);
old_state = p->state;
if (!(old_state & state))
goto out;
if (p->se.on_rq)
goto out_running;
/*
* In order to handle concurrent wakeups and release the rq->lock
* we put the task in TASK_WAKING state.
*
* First fix up the nr_uninterruptible count:
*/
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
p->state = TASK_WAKING;
task_rq_unlock(rq, &flags);

this_cpu = smp_processor_id();
cpu = task_cpu(p);
}
cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (cpu != orig_cpu)
set_task_cpu(p, cpu);
|
||||
|
||||
rq = task_rq_lock(p, &flags);
|
||||
WARN_ON(p->state != TASK_WAKING);
|
||||
cpu = task_cpu(p);
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
schedstat_inc(rq, ttwu_count);
|
||||
@@ -2533,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
|
||||
out_activate:
|
||||
#endif /* CONFIG_SMP */
|
||||
schedstat_inc(p, se.nr_wakeups);
|
||||
if (sync)
|
||||
if (wake_flags & WF_SYNC)
|
||||
schedstat_inc(p, se.nr_wakeups_sync);
|
||||
if (orig_cpu != cpu)
|
||||
schedstat_inc(p, se.nr_wakeups_migrate);
|
||||
@@ -2562,7 +2400,7 @@ out_activate:
|
||||
|
||||
out_running:
|
||||
trace_sched_wakeup(rq, p, success);
|
||||
check_preempt_curr(rq, p, sync);
|
||||
check_preempt_curr(rq, p, wake_flags);
|
||||
|
||||
p->state = TASK_RUNNING;
|
||||
#ifdef CONFIG_SMP
|
||||
@@ -2571,6 +2409,7 @@ out_running:
|
||||
#endif
|
||||
out:
|
||||
task_rq_unlock(rq, &flags);
|
||||
put_cpu();
|
||||
|
||||
return success;
|
||||
}
|
||||
@@ -2613,6 +2452,7 @@ static void __sched_fork(struct task_struct *p)
|
||||
p->se.avg_overlap = 0;
|
||||
p->se.start_runtime = 0;
|
||||
p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
|
||||
p->se.avg_running = 0;
|
||||
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
p->se.wait_start = 0;
|
||||
@@ -2674,11 +2514,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
|
||||
|
||||
__sched_fork(p);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
|
||||
#endif
|
||||
set_task_cpu(p, cpu);
|
||||
|
||||
/*
|
||||
* Make sure we do not leak PI boosting priority to the child.
|
||||
*/
|
||||
@@ -2709,6 +2544,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
|
||||
if (!rt_prio(p->prio))
|
||||
p->sched_class = &fair_sched_class;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
|
||||
#endif
|
||||
set_task_cpu(p, cpu);
|
||||
|
||||
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
||||
if (likely(sched_info_on()))
|
||||
memset(&p->sched_info, 0, sizeof(p->sched_info));
|
||||
@@ -2754,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
|
||||
inc_nr_running(rq);
|
||||
}
|
||||
trace_sched_wakeup_new(rq, p, 1);
|
||||
check_preempt_curr(rq, p, 0);
|
||||
check_preempt_curr(rq, p, WF_FORK);
|
||||
#ifdef CONFIG_SMP
|
||||
if (p->sched_class->task_wake_up)
|
||||
p->sched_class->task_wake_up(rq, p);
|
||||
@@ -2878,7 +2718,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
|
||||
*/
|
||||
prev_state = prev->state;
|
||||
finish_arch_switch(prev);
|
||||
perf_counter_task_sched_in(current, cpu_of(rq));
|
||||
perf_event_task_sched_in(current, cpu_of(rq));
|
||||
finish_lock_switch(rq, prev);
|
||||
|
||||
fire_sched_in_preempt_notifiers(current);
|
||||
@@ -3064,6 +2904,19 @@ unsigned long nr_iowait(void)
|
||||
return sum;
|
||||
}
|
||||
|
||||
unsigned long nr_iowait_cpu(void)
|
||||
{
|
||||
struct rq *this = this_rq();
|
||||
return atomic_read(&this->nr_iowait);
|
||||
}
|
||||
|
||||
unsigned long this_cpu_load(void)
|
||||
{
|
||||
struct rq *this = this_rq();
|
||||
return this->cpu_load[0];
|
||||
}
|
||||
|
||||
|
||||
/* Variables and functions for calc_load */
|
||||
static atomic_long_t calc_load_tasks;
|
||||
static unsigned long calc_load_update;
|
||||
@@ -3263,7 +3116,7 @@ out:
|
||||
void sched_exec(void)
|
||||
{
|
||||
int new_cpu, this_cpu = get_cpu();
|
||||
new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
|
||||
new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
|
||||
put_cpu();
|
||||
if (new_cpu != this_cpu)
|
||||
sched_migrate_task(current, new_cpu);
|
||||
@@ -3683,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
|
||||
*imbalance = sds->min_load_per_task;
|
||||
sds->busiest = sds->group_min;
|
||||
|
||||
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
|
||||
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
|
||||
group_first_cpu(sds->group_leader);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
@@ -3711,7 +3559,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
|
||||
}
|
||||
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
|
||||
|
||||
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
|
||||
|
||||
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
|
||||
{
|
||||
return SCHED_LOAD_SCALE;
|
||||
}
|
||||
|
||||
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
|
||||
{
|
||||
return default_scale_freq_power(sd, cpu);
|
||||
}
|
||||
|
||||
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
|
||||
{
|
||||
unsigned long weight = cpumask_weight(sched_domain_span(sd));
|
||||
unsigned long smt_gain = sd->smt_gain;
|
||||
@@ -3721,6 +3580,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
|
||||
return smt_gain;
|
||||
}
|
||||
|
||||
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
|
||||
{
|
||||
return default_scale_smt_power(sd, cpu);
|
||||
}
|
||||
|
||||
unsigned long scale_rt_power(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
@@ -3745,10 +3609,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
|
||||
unsigned long power = SCHED_LOAD_SCALE;
|
||||
struct sched_group *sdg = sd->groups;
|
||||
|
||||
/* here we could scale based on cpufreq */
|
||||
if (sched_feat(ARCH_POWER))
|
||||
power *= arch_scale_freq_power(sd, cpu);
|
||||
else
|
||||
power *= default_scale_freq_power(sd, cpu);
|
||||
|
||||
power >>= SCHED_LOAD_SHIFT;
|
||||
|
||||
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
|
||||
power *= arch_scale_smt_power(sd, cpu);
|
||||
if (sched_feat(ARCH_POWER))
|
||||
power *= arch_scale_smt_power(sd, cpu);
|
||||
else
|
||||
power *= default_scale_smt_power(sd, cpu);
|
||||
|
||||
power >>= SCHED_LOAD_SHIFT;
|
||||
}
|
||||
|
||||
@@ -4161,26 +4034,6 @@ ret:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct sched_group *group_of(int cpu)
|
||||
{
|
||||
struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
|
||||
|
||||
if (!sd)
|
||||
return NULL;
|
||||
|
||||
return sd->groups;
|
||||
}
|
||||
|
||||
static unsigned long power_of(int cpu)
|
||||
{
|
||||
struct sched_group *group = group_of(cpu);
|
||||
|
||||
if (!group)
|
||||
return SCHED_LOAD_SCALE;
|
||||
|
||||
return group->cpu_power;
|
||||
}
|
||||
|
||||
/*
|
||||
* find_busiest_queue - find the busiest runqueue among the cpus in group.
|
||||
*/
|
||||
@@ -5239,17 +5092,16 @@ void account_idle_time(cputime_t cputime)
|
||||
*/
|
||||
void account_process_tick(struct task_struct *p, int user_tick)
|
||||
{
|
||||
cputime_t one_jiffy = jiffies_to_cputime(1);
|
||||
cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
|
||||
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
|
||||
struct rq *rq = this_rq();
|
||||
|
||||
if (user_tick)
|
||||
account_user_time(p, one_jiffy, one_jiffy_scaled);
|
||||
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
|
||||
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
|
||||
account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
|
||||
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
|
||||
one_jiffy_scaled);
|
||||
else
|
||||
account_idle_time(one_jiffy);
|
||||
account_idle_time(cputime_one_jiffy);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -5353,7 +5205,7 @@ void scheduler_tick(void)
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
spin_unlock(&rq->lock);
|
||||
|
||||
perf_counter_task_tick(curr, cpu);
|
||||
perf_event_task_tick(curr, cpu);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
rq->idle_at_tick = idle_cpu(cpu);
|
||||
@@ -5465,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev)
|
||||
#endif
|
||||
}
|
||||
|
||||
static void put_prev_task(struct rq *rq, struct task_struct *prev)
|
||||
static void put_prev_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (prev->state == TASK_RUNNING) {
|
||||
u64 runtime = prev->se.sum_exec_runtime;
|
||||
u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
|
||||
|
||||
runtime -= prev->se.prev_sum_exec_runtime;
|
||||
runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
|
||||
update_avg(&p->se.avg_running, runtime);
|
||||
|
||||
if (p->state == TASK_RUNNING) {
|
||||
/*
|
||||
* In order to avoid avg_overlap growing stale when we are
|
||||
* indeed overlapping and hence not getting put to sleep, grow
|
||||
@@ -5482,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
|
||||
* correlates to the amount of cache footprint a task can
|
||||
* build up.
|
||||
*/
|
||||
update_avg(&prev->se.avg_overlap, runtime);
|
||||
runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
|
||||
update_avg(&p->se.avg_overlap, runtime);
|
||||
} else {
|
||||
update_avg(&p->se.avg_running, 0);
|
||||
}
|
||||
prev->sched_class->put_prev_task(rq, prev);
|
||||
p->sched_class->put_prev_task(rq, p);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -5567,7 +5421,7 @@ need_resched_nonpreemptible:
|
||||
|
||||
if (likely(prev != next)) {
|
||||
sched_info_switch(prev, next);
|
||||
perf_counter_task_sched_out(prev, next, cpu);
|
||||
perf_event_task_sched_out(prev, next, cpu);
|
||||
|
||||
rq->nr_switches++;
|
||||
rq->curr = next;
|
||||
@@ -5716,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
|
||||
|
||||
#endif /* CONFIG_PREEMPT */
|
||||
|
||||
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
|
||||
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
|
||||
void *key)
|
||||
{
|
||||
return try_to_wake_up(curr->private, mode, sync);
|
||||
return try_to_wake_up(curr->private, mode, wake_flags);
|
||||
}
|
||||
EXPORT_SYMBOL(default_wake_function);
|
||||
|
||||
@@ -5733,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function);
|
||||
* zero in this (rare) case, and we handle it by continuing to scan the queue.
|
||||
*/
|
||||
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
|
||||
int nr_exclusive, int sync, void *key)
|
||||
int nr_exclusive, int wake_flags, void *key)
|
||||
{
|
||||
wait_queue_t *curr, *next;
|
||||
|
||||
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
|
||||
unsigned flags = curr->flags;
|
||||
|
||||
if (curr->func(curr, mode, sync, key) &&
|
||||
if (curr->func(curr, mode, wake_flags, key) &&
|
||||
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
|
||||
break;
|
||||
}
|
||||
@@ -5801,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
|
||||
int nr_exclusive, void *key)
|
||||
{
|
||||
unsigned long flags;
|
||||
int sync = 1;
|
||||
int wake_flags = WF_SYNC;
|
||||
|
||||
if (unlikely(!q))
|
||||
return;
|
||||
|
||||
if (unlikely(!nr_exclusive))
|
||||
sync = 0;
|
||||
wake_flags = 0;
|
||||
|
||||
spin_lock_irqsave(&q->lock, flags);
|
||||
__wake_up_common(q, mode, nr_exclusive, sync, key);
|
||||
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
|
||||
spin_unlock_irqrestore(&q->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
|
||||
@@ -6977,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
|
||||
if (retval)
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
|
||||
* tasks that are on an otherwise idle runqueue:
|
||||
*/
|
||||
time_slice = 0;
|
||||
if (p->policy == SCHED_RR) {
|
||||
time_slice = DEF_TIMESLICE;
|
||||
} else if (p->policy != SCHED_FIFO) {
|
||||
struct sched_entity *se = &p->se;
|
||||
unsigned long flags;
|
||||
struct rq *rq;
|
||||
time_slice = p->sched_class->get_rr_interval(p);
|
||||
|
||||
rq = task_rq_lock(p, &flags);
|
||||
if (rq->cfs.load.weight)
|
||||
time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
|
||||
task_rq_unlock(rq, &flags);
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
jiffies_to_timespec(time_slice, &t);
|
||||
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
|
||||
@@ -7844,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
||||
/*
|
||||
* Register at high priority so that task migration (migrate_all_tasks)
|
||||
* happens before everything else. This has to be lower priority than
|
||||
* the notifier in the perf_counter subsystem, though.
|
||||
* the notifier in the perf_event subsystem, though.
|
||||
*/
|
||||
static struct notifier_block __cpuinitdata migration_notifier = {
|
||||
.notifier_call = migration_call,
|
||||
@@ -8000,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd)
|
||||
}
|
||||
|
||||
/* Following flags don't use groups */
|
||||
if (sd->flags & (SD_WAKE_IDLE |
|
||||
SD_WAKE_AFFINE |
|
||||
SD_WAKE_BALANCE))
|
||||
if (sd->flags & (SD_WAKE_AFFINE))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
@@ -8019,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
||||
if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
|
||||
return 0;
|
||||
|
||||
/* Does parent contain flags not in child? */
|
||||
/* WAKE_BALANCE is a subset of WAKE_AFFINE */
|
||||
if (cflags & SD_WAKE_AFFINE)
|
||||
pflags &= ~SD_WAKE_BALANCE;
|
||||
/* Flags needing groups don't count if only 1 group in parent */
|
||||
if (parent->groups == parent->groups->next) {
|
||||
pflags &= ~(SD_LOAD_BALANCE |
|
||||
@@ -8708,10 +8541,10 @@ static void set_domain_attribute(struct sched_domain *sd,
|
||||
request = attr->relax_domain_level;
|
||||
if (request < sd->level) {
|
||||
/* turn off idle balance on this domain */
|
||||
sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
|
||||
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
|
||||
} else {
|
||||
/* turn on idle balance on this domain */
|
||||
sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
|
||||
sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9329,6 +9162,7 @@ void __init sched_init_smp(void)
|
||||
cpumask_var_t non_isolated_cpus;
|
||||
|
||||
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
|
||||
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
|
||||
|
||||
#if defined(CONFIG_NUMA)
|
||||
sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
|
||||
@@ -9360,7 +9194,6 @@ void __init sched_init_smp(void)
|
||||
sched_init_granularity();
|
||||
free_cpumask_var(non_isolated_cpus);
|
||||
|
||||
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
|
||||
init_sched_rt_class();
|
||||
}
|
||||
#else
|
||||
@@ -9707,7 +9540,7 @@ void __init sched_init(void)
|
||||
alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
|
||||
#endif /* SMP */
|
||||
|
||||
perf_counter_init();
|
||||
perf_event_init();
|
||||
|
||||
scheduler_running = 1;
|
||||
}
|
||||
@@ -10479,7 +10312,7 @@ static int sched_rt_global_constraints(void)
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
int sched_rt_handler(struct ctl_table *table, int write,
|
||||
struct file *filp, void __user *buffer, size_t *lenp,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int ret;
|
||||
@@ -10490,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
|
||||
old_period = sysctl_sched_rt_period;
|
||||
old_runtime = sysctl_sched_rt_runtime;
|
||||
|
||||
ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
|
||||
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
||||
|
||||
if (!ret && write) {
|
||||
ret = sched_rt_global_constraints();
|
||||
@@ -10544,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
|
||||
}
|
||||
|
||||
static int
|
||||
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
||||
struct task_struct *tsk)
|
||||
cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
|
||||
{
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
|
||||
@@ -10555,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
||||
if (tsk->sched_class != &fair_sched_class)
|
||||
return -EINVAL;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
||||
struct task_struct *tsk, bool threadgroup)
|
||||
{
|
||||
int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
|
||||
if (retval)
|
||||
return retval;
|
||||
if (threadgroup) {
|
||||
struct task_struct *c;
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
|
||||
retval = cpu_cgroup_can_attach_task(cgrp, c);
|
||||
if (retval) {
|
||||
rcu_read_unlock();
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
||||
struct cgroup *old_cont, struct task_struct *tsk)
|
||||
struct cgroup *old_cont, struct task_struct *tsk,
|
||||
bool threadgroup)
|
||||
{
|
||||
sched_move_task(tsk);
|
||||
if (threadgroup) {
|
||||
struct task_struct *c;
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
|
||||
sched_move_task(c);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
|
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
__read_mostly int sched_clock_stable;

struct sched_clock_data {
	/*
	 * Raw spinlock - this is a special case: this might be called
	 * from within instrumentation code so we dont want to do any
	 * instrumentation ourselves.
	 */
	raw_spinlock_t lock;

	u64 tick_raw;
	u64 tick_gtod;
	u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct sched_clock_data *scd = cpu_sdc(cpu);
|
||||
|
||||
scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
|
||||
scd->tick_raw = 0;
|
||||
scd->tick_gtod = ktime_now;
|
||||
scd->clock = ktime_now;
|
||||
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
|
||||
* - filter out backward motion
|
||||
* - use the GTOD tick value to create a window to filter crazy TSC values
|
||||
*/
|
||||
static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
|
||||
static u64 sched_clock_local(struct sched_clock_data *scd)
|
||||
{
|
||||
s64 delta = now - scd->tick_raw;
|
||||
u64 clock, min_clock, max_clock;
|
||||
u64 now, clock, old_clock, min_clock, max_clock;
|
||||
s64 delta;
|
||||
|
||||
again:
|
||||
now = sched_clock();
|
||||
delta = now - scd->tick_raw;
|
||||
if (unlikely(delta < 0))
|
||||
delta = 0;
|
||||
|
||||
old_clock = scd->clock;
|
||||
|
||||
/*
|
||||
* scd->clock = clamp(scd->tick_gtod + delta,
|
||||
* max(scd->tick_gtod, scd->clock),
|
||||
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
|
||||
*/
|
||||
|
||||
clock = scd->tick_gtod + delta;
|
||||
min_clock = wrap_max(scd->tick_gtod, scd->clock);
|
||||
max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
|
||||
min_clock = wrap_max(scd->tick_gtod, old_clock);
|
||||
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
|
||||
|
||||
clock = wrap_max(clock, min_clock);
|
||||
clock = wrap_min(clock, max_clock);
|
||||
|
||||
scd->clock = clock;
|
||||
if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
|
||||
goto again;
|
||||
|
||||
return scd->clock;
|
||||
return clock;
|
||||
}
|
||||
|
||||
static void lock_double_clock(struct sched_clock_data *data1,
|
||||
struct sched_clock_data *data2)
|
||||
static u64 sched_clock_remote(struct sched_clock_data *scd)
|
||||
{
|
||||
if (data1 < data2) {
|
||||
__raw_spin_lock(&data1->lock);
|
||||
__raw_spin_lock(&data2->lock);
|
||||
struct sched_clock_data *my_scd = this_scd();
|
||||
u64 this_clock, remote_clock;
|
||||
u64 *ptr, old_val, val;
|
||||
|
||||
sched_clock_local(my_scd);
|
||||
again:
|
||||
this_clock = my_scd->clock;
|
||||
remote_clock = scd->clock;
|
||||
|
||||
/*
|
||||
* Use the opportunity that we have both locks
|
||||
* taken to couple the two clocks: we take the
|
||||
* larger time as the latest time for both
|
||||
* runqueues. (this creates monotonic movement)
|
||||
*/
|
||||
if (likely((s64)(remote_clock - this_clock) < 0)) {
|
||||
ptr = &scd->clock;
|
||||
old_val = remote_clock;
|
||||
val = this_clock;
|
||||
} else {
|
||||
__raw_spin_lock(&data2->lock);
|
||||
__raw_spin_lock(&data1->lock);
|
||||
/*
|
||||
* Should be rare, but possible:
|
||||
*/
|
||||
ptr = &my_scd->clock;
|
||||
old_val = this_clock;
|
||||
val = remote_clock;
|
||||
}
|
||||
|
||||
if (cmpxchg64(ptr, old_val, val) != old_val)
|
||||
goto again;
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
u64 sched_clock_cpu(int cpu)
|
||||
{
|
||||
u64 now, clock, this_clock, remote_clock;
|
||||
struct sched_clock_data *scd;
|
||||
u64 clock;
|
||||
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
|
||||
if (sched_clock_stable)
|
||||
return sched_clock();
|
||||
|
||||
scd = cpu_sdc(cpu);
|
||||
|
||||
/*
|
||||
* Normally this is not called in NMI context - but if it is,
|
||||
* trying to do any locking here is totally lethal.
|
||||
*/
|
||||
if (unlikely(in_nmi()))
|
||||
return scd->clock;
|
||||
|
||||
if (unlikely(!sched_clock_running))
|
||||
return 0ull;
|
||||
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
now = sched_clock();
|
||||
scd = cpu_sdc(cpu);
|
||||
|
||||
if (cpu != raw_smp_processor_id()) {
|
||||
struct sched_clock_data *my_scd = this_scd();
|
||||
|
||||
lock_double_clock(scd, my_scd);
|
||||
|
||||
this_clock = __update_sched_clock(my_scd, now);
|
||||
remote_clock = scd->clock;
|
||||
|
||||
/*
|
||||
* Use the opportunity that we have both locks
|
||||
* taken to couple the two clocks: we take the
|
||||
* larger time as the latest time for both
|
||||
* runqueues. (this creates monotonic movement)
|
||||
*/
|
||||
if (likely((s64)(remote_clock - this_clock) < 0)) {
|
||||
clock = this_clock;
|
||||
scd->clock = clock;
|
||||
} else {
|
||||
/*
|
||||
* Should be rare, but possible:
|
||||
*/
|
||||
clock = remote_clock;
|
||||
my_scd->clock = remote_clock;
|
||||
}
|
||||
|
||||
__raw_spin_unlock(&my_scd->lock);
|
||||
} else {
|
||||
__raw_spin_lock(&scd->lock);
|
||||
clock = __update_sched_clock(scd, now);
|
||||
}
|
||||
|
||||
__raw_spin_unlock(&scd->lock);
|
||||
if (cpu != smp_processor_id())
|
||||
clock = sched_clock_remote(scd);
|
||||
else
|
||||
clock = sched_clock_local(scd);
|
||||
|
||||
return clock;
|
||||
}
|
||||
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
|
||||
now_gtod = ktime_to_ns(ktime_get());
|
||||
now = sched_clock();
|
||||
|
||||
__raw_spin_lock(&scd->lock);
|
||||
scd->tick_raw = now;
|
||||
scd->tick_gtod = now_gtod;
|
||||
__update_sched_clock(scd, now);
|
||||
__raw_spin_unlock(&scd->lock);
|
||||
sched_clock_local(scd);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
	PN(se.sum_exec_runtime);
	PN(se.avg_overlap);
	PN(se.avg_wakeup);
	PN(se.avg_running);

	nr_switches = p->nvcsw + p->nivcsw;

@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
int sched_nr_latency_handler(struct ctl_table *table, int write,
|
||||
struct file *filp, void __user *buffer, size_t *lenp,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
|
||||
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
|
||||
if (ret || !write)
|
||||
return ret;
|
||||
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
||||
if (entity_is_task(curr)) {
|
||||
struct task_struct *curtask = task_of(curr);
|
||||
|
||||
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
|
||||
cpuacct_charge(curtask, delta_exec);
|
||||
account_group_exec_runtime(curtask, delta_exec);
|
||||
}
|
||||
@@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
||||
if (initial && sched_feat(START_DEBIT))
|
||||
vruntime += sched_vslice(cfs_rq, se);
|
||||
|
||||
if (!initial) {
|
||||
/* sleeps upto a single latency don't count. */
|
||||
if (sched_feat(NEW_FAIR_SLEEPERS)) {
|
||||
unsigned long thresh = sysctl_sched_latency;
|
||||
/* sleeps up to a single latency don't count. */
|
||||
if (!initial && sched_feat(FAIR_SLEEPERS)) {
|
||||
unsigned long thresh = sysctl_sched_latency;
|
||||
|
||||
/*
|
||||
* Convert the sleeper threshold into virtual time.
|
||||
* SCHED_IDLE is a special sub-class. We care about
|
||||
* fairness only relative to other SCHED_IDLE tasks,
|
||||
* all of which have the same weight.
|
||||
*/
|
||||
if (sched_feat(NORMALIZED_SLEEPER) &&
|
||||
(!entity_is_task(se) ||
|
||||
task_of(se)->policy != SCHED_IDLE))
|
||||
thresh = calc_delta_fair(thresh, se);
|
||||
/*
|
||||
* Convert the sleeper threshold into virtual time.
|
||||
* SCHED_IDLE is a special sub-class. We care about
|
||||
* fairness only relative to other SCHED_IDLE tasks,
|
||||
* all of which have the same weight.
|
||||
*/
|
||||
if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
|
||||
task_of(se)->policy != SCHED_IDLE))
|
||||
thresh = calc_delta_fair(thresh, se);
|
||||
|
||||
vruntime -= thresh;
|
||||
}
|
||||
/*
|
||||
* Halve their sleep time's effect, to allow
|
||||
* for a gentler effect of sleepers:
|
||||
*/
|
||||
if (sched_feat(GENTLE_FAIR_SLEEPERS))
|
||||
thresh >>= 1;
|
||||
|
||||
vruntime -= thresh;
|
||||
}
|
||||
|
||||
/* ensure we never gain time by being placed backwards. */
|
||||
@@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
|
||||
|
||||
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
if (cfs_rq->last == se)
|
||||
if (!se || cfs_rq->last == se)
|
||||
cfs_rq->last = NULL;
|
||||
|
||||
if (cfs_rq->next == se)
|
||||
if (!se || cfs_rq->next == se)
|
||||
cfs_rq->next = NULL;
|
||||
}
|
||||
|
||||
@@ -1062,83 +1067,6 @@ static void yield_task_fair(struct rq *rq)
|
||||
se->vruntime = rightmost->vruntime + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* wake_idle() will wake a task on an idle cpu if task->cpu is
|
||||
* not idle and an idle cpu is available. The span of cpus to
|
||||
* search starts with cpus closest then further out as needed,
|
||||
* so we always favor a closer, idle cpu.
|
||||
* Domains may include CPUs that are not usable for migration,
|
||||
* hence we need to mask them out (rq->rd->online)
|
||||
*
|
||||
* Returns the CPU we should wake onto.
|
||||
*/
|
||||
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
|
||||
|
||||
#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
|
||||
|
||||
static int wake_idle(int cpu, struct task_struct *p)
|
||||
{
|
||||
struct sched_domain *sd;
|
||||
int i;
|
||||
unsigned int chosen_wakeup_cpu;
|
||||
int this_cpu;
|
||||
struct rq *task_rq = task_rq(p);
|
||||
|
||||
/*
|
||||
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
|
||||
* are idle and this is not a kernel thread and this task's affinity
|
||||
* allows it to be moved to preferred cpu, then just move!
|
||||
*/
|
||||
|
||||
this_cpu = smp_processor_id();
|
||||
chosen_wakeup_cpu =
|
||||
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
|
||||
|
||||
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
|
||||
idle_cpu(cpu) && idle_cpu(this_cpu) &&
|
||||
p->mm && !(p->flags & PF_KTHREAD) &&
|
||||
cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
|
||||
return chosen_wakeup_cpu;
|
||||
|
||||
/*
|
||||
* If it is idle, then it is the best cpu to run this task.
|
||||
*
|
||||
* This cpu is also the best, if it has more than one task already.
|
||||
* Siblings must be also busy(in most cases) as they didn't already
|
||||
* pickup the extra load from this cpu and hence we need not check
|
||||
* sibling runqueue info. This will avoid the checks and cache miss
|
||||
* penalities associated with that.
|
||||
*/
|
||||
if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
|
||||
return cpu;
|
||||
|
||||
for_each_domain(cpu, sd) {
|
||||
if ((sd->flags & SD_WAKE_IDLE)
|
||||
|| ((sd->flags & SD_WAKE_IDLE_FAR)
|
||||
&& !task_hot(p, task_rq->clock, sd))) {
|
||||
for_each_cpu_and(i, sched_domain_span(sd),
|
||||
&p->cpus_allowed) {
|
||||
if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
|
||||
if (i != task_cpu(p)) {
|
||||
schedstat_inc(p,
|
||||
se.nr_wakeups_idle);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return cpu;
|
||||
}
|
||||
#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
|
||||
static inline int wake_idle(int cpu, struct task_struct *p)
|
||||
{
|
||||
return cpu;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
@@ -1225,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
|
||||
|
||||
#endif
|
||||
|
||||
static int
|
||||
wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
|
||||
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
|
||||
int idx, unsigned long load, unsigned long this_load,
|
||||
unsigned int imbalance)
|
||||
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
||||
{
|
||||
struct task_struct *curr = this_rq->curr;
|
||||
struct task_group *tg;
|
||||
unsigned long tl = this_load;
|
||||
struct task_struct *curr = current;
|
||||
unsigned long this_load, load;
|
||||
int idx, this_cpu, prev_cpu;
|
||||
unsigned long tl_per_task;
|
||||
unsigned int imbalance;
|
||||
struct task_group *tg;
|
||||
unsigned long weight;
|
||||
int balanced;
|
||||
|
||||
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
|
||||
return 0;
|
||||
idx = sd->wake_idx;
|
||||
this_cpu = smp_processor_id();
|
||||
prev_cpu = task_cpu(p);
|
||||
load = source_load(prev_cpu, idx);
|
||||
this_load = target_load(this_cpu, idx);
|
||||
|
||||
if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
|
||||
p->se.avg_overlap > sysctl_sched_migration_cost))
|
||||
sync = 0;
|
||||
if (sync) {
|
||||
if (sched_feat(SYNC_LESS) &&
|
||||
(curr->se.avg_overlap > sysctl_sched_migration_cost ||
|
||||
p->se.avg_overlap > sysctl_sched_migration_cost))
|
||||
sync = 0;
|
||||
} else {
|
||||
if (sched_feat(SYNC_MORE) &&
|
||||
(curr->se.avg_overlap < sysctl_sched_migration_cost &&
|
||||
p->se.avg_overlap < sysctl_sched_migration_cost))
|
||||
sync = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If sync wakeup then subtract the (maximum possible)
|
||||
@@ -1254,24 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
|
||||
tg = task_group(current);
|
||||
weight = current->se.load.weight;
|
||||
|
||||
tl += effective_load(tg, this_cpu, -weight, -weight);
|
||||
this_load += effective_load(tg, this_cpu, -weight, -weight);
|
||||
load += effective_load(tg, prev_cpu, 0, -weight);
|
||||
}
|
||||
|
||||
tg = task_group(p);
|
||||
weight = p->se.load.weight;
|
||||
|
||||
imbalance = 100 + (sd->imbalance_pct - 100) / 2;
|
||||
|
||||
/*
|
||||
* In low-load situations, where prev_cpu is idle and this_cpu is idle
|
||||
* due to the sync cause above having dropped tl to 0, we'll always have
|
||||
* an imbalance, but there's really nothing you can do about that, so
|
||||
* that's good too.
|
||||
* due to the sync cause above having dropped this_load to 0, we'll
|
||||
* always have an imbalance, but there's really nothing you can do
|
||||
* about that, so that's good too.
|
||||
*
|
||||
* Otherwise check if either cpus are near enough in load to allow this
|
||||
* task to be woken on this_cpu.
|
||||
*/
|
||||
balanced = !tl ||
|
||||
100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
|
||||
balanced = !this_load ||
|
||||
100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
|
||||
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
|
||||
|
||||
/*
|
||||
@@ -1285,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
|
||||
schedstat_inc(p, se.nr_wakeups_affine_attempts);
|
||||
tl_per_task = cpu_avg_load_per_task(this_cpu);
|
||||
|
||||
if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
|
||||
tl_per_task)) {
|
||||
if (balanced ||
|
||||
(this_load <= load &&
|
||||
this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
|
||||
/*
|
||||
* This domain has SD_WAKE_AFFINE and
|
||||
* p is cache cold in this domain, and
|
||||
* there is no bad imbalance.
|
||||
*/
|
||||
schedstat_inc(this_sd, ttwu_move_affine);
|
||||
schedstat_inc(sd, ttwu_move_affine);
|
||||
schedstat_inc(p, se.nr_wakeups_affine);
|
||||
|
||||
return 1;
|
||||
@@ -1300,65 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int select_task_rq_fair(struct task_struct *p, int sync)
|
||||
/*
|
||||
* find_idlest_group finds and returns the least busy CPU group within the
|
||||
* domain.
|
||||
*/
|
||||
static struct sched_group *
|
||||
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
||||
int this_cpu, int load_idx)
|
||||
{
|
||||
struct sched_domain *sd, *this_sd = NULL;
|
||||
int prev_cpu, this_cpu, new_cpu;
|
||||
unsigned long load, this_load;
|
||||
struct rq *this_rq;
|
||||
unsigned int imbalance;
|
||||
int idx;
|
||||
struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
|
||||
unsigned long min_load = ULONG_MAX, this_load = 0;
|
||||
int imbalance = 100 + (sd->imbalance_pct-100)/2;
|
||||
|
||||
prev_cpu = task_cpu(p);
|
||||
this_cpu = smp_processor_id();
|
||||
this_rq = cpu_rq(this_cpu);
|
||||
new_cpu = prev_cpu;
|
||||
do {
|
||||
unsigned long load, avg_load;
|
||||
int local_group;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* 'this_sd' is the first domain that both
|
||||
* this_cpu and prev_cpu are present in:
|
||||
*/
|
||||
for_each_domain(this_cpu, sd) {
|
||||
if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
|
||||
this_sd = sd;
|
||||
break;
|
||||
/* Skip over this group if it has no CPUs allowed */
|
||||
if (!cpumask_intersects(sched_group_cpus(group),
|
||||
&p->cpus_allowed))
|
||||
continue;
|
||||
|
||||
local_group = cpumask_test_cpu(this_cpu,
|
||||
sched_group_cpus(group));
|
||||
|
||||
/* Tally up the load of all CPUs in the group */
|
||||
avg_load = 0;
|
||||
|
||||
for_each_cpu(i, sched_group_cpus(group)) {
|
||||
/* Bias balancing toward cpus of our domain */
|
||||
if (local_group)
|
||||
load = source_load(i, load_idx);
|
||||
else
|
||||
load = target_load(i, load_idx);
|
||||
|
||||
avg_load += load;
|
||||
}
|
||||
|
||||
/* Adjust by relative CPU power of the group */
|
||||
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
|
||||
|
||||
if (local_group) {
|
||||
this_load = avg_load;
|
||||
this = group;
|
||||
} else if (avg_load < min_load) {
|
||||
min_load = avg_load;
|
||||
idlest = group;
|
||||
}
|
||||
} while (group = group->next, group != sd->groups);
|
||||
|
||||
if (!idlest || 100*this_load < imbalance*min_load)
|
||||
return NULL;
|
||||
return idlest;
|
||||
}
|
||||
|
||||
/*
|
||||
* find_idlest_cpu - find the idlest cpu among the cpus in group.
|
||||
*/
|
||||
static int
|
||||
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
||||
{
|
||||
unsigned long load, min_load = ULONG_MAX;
|
||||
int idlest = -1;
|
||||
int i;
|
||||
|
||||
/* Traverse only the allowed CPUs */
|
||||
for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
|
||||
load = weighted_cpuload(i);
|
||||
|
||||
if (load < min_load || (load == min_load && i == this_cpu)) {
|
||||
min_load = load;
|
||||
idlest = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
|
||||
goto out;
|
||||
return idlest;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for affine wakeup and passive balancing possibilities.
|
||||
*/
|
||||
if (!this_sd)
|
||||
goto out;
|
||||
/*
|
||||
* sched_balance_self: balance the current task (running on cpu) in domains
|
||||
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
|
||||
* SD_BALANCE_EXEC.
|
||||
*
|
||||
* Balance, ie. select the least loaded group.
|
||||
*
|
||||
* Returns the target CPU number, or the same CPU if no balancing is needed.
|
||||
*
|
||||
* preempt must be disabled.
|
||||
*/
|
||||
static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
|
||||
{
|
||||
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
|
||||
int cpu = smp_processor_id();
|
||||
int prev_cpu = task_cpu(p);
|
||||
int new_cpu = cpu;
|
||||
int want_affine = 0;
|
||||
int want_sd = 1;
|
||||
int sync = wake_flags & WF_SYNC;
|
||||
|
||||
idx = this_sd->wake_idx;
|
||||
if (sd_flag & SD_BALANCE_WAKE) {
|
||||
if (sched_feat(AFFINE_WAKEUPS) &&
|
||||
cpumask_test_cpu(cpu, &p->cpus_allowed))
|
||||
want_affine = 1;
|
||||
new_cpu = prev_cpu;
|
||||
}
|
||||
|
||||
imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
|
||||
rcu_read_lock();
|
||||
for_each_domain(cpu, tmp) {
|
||||
/*
|
||||
* If power savings logic is enabled for a domain, see if we
|
||||
* are not overloaded, if so, don't balance wider.
|
||||
*/
|
||||
if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
|
||||
unsigned long power = 0;
|
||||
unsigned long nr_running = 0;
|
||||
unsigned long capacity;
|
||||
int i;
|
||||
|
||||
load = source_load(prev_cpu, idx);
|
||||
this_load = target_load(this_cpu, idx);
|
||||
for_each_cpu(i, sched_domain_span(tmp)) {
|
||||
power += power_of(i);
|
||||
nr_running += cpu_rq(i)->cfs.nr_running;
|
||||
}
|
||||
|
||||
if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
|
||||
load, this_load, imbalance))
|
||||
return this_cpu;
|
||||
capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
|
||||
|
||||
/*
|
||||
* Start passive balancing when half the imbalance_pct
|
||||
* limit is reached.
|
||||
*/
|
||||
if (this_sd->flags & SD_WAKE_BALANCE) {
|
||||
if (imbalance*this_load <= 100*load) {
|
||||
schedstat_inc(this_sd, ttwu_move_balance);
|
||||
schedstat_inc(p, se.nr_wakeups_passive);
|
||||
return this_cpu;
|
||||
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
|
||||
nr_running /= 2;
|
||||
|
||||
if (nr_running < capacity)
|
||||
want_sd = 0;
|
||||
}
|
||||
|
||||
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
|
||||
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
|
||||
|
||||
affine_sd = tmp;
|
||||
want_affine = 0;
|
||||
}
|
||||
|
||||
if (!want_sd && !want_affine)
|
||||
break;
|
||||
|
||||
if (!(tmp->flags & sd_flag))
|
||||
continue;
|
||||
|
||||
if (want_sd)
|
||||
sd = tmp;
|
||||
}
|
||||
|
||||
if (sched_feat(LB_SHARES_UPDATE)) {
|
||||
/*
|
||||
* Pick the largest domain to update shares over
|
||||
*/
|
||||
tmp = sd;
|
||||
if (affine_sd && (!tmp ||
|
||||
cpumask_weight(sched_domain_span(affine_sd)) >
|
||||
cpumask_weight(sched_domain_span(sd))))
|
||||
tmp = affine_sd;
|
||||
|
||||
if (tmp)
|
||||
update_shares(tmp);
|
||||
}
|
||||
|
||||
if (affine_sd && wake_affine(affine_sd, p, sync)) {
|
||||
new_cpu = cpu;
|
||||
goto out;
|
||||
}
|
||||
|
||||
while (sd) {
|
||||
int load_idx = sd->forkexec_idx;
|
||||
struct sched_group *group;
|
||||
int weight;
|
||||
|
||||
if (!(sd->flags & sd_flag)) {
|
||||
sd = sd->child;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (sd_flag & SD_BALANCE_WAKE)
|
||||
load_idx = sd->wake_idx;
|
||||
|
||||
group = find_idlest_group(sd, p, cpu, load_idx);
|
||||
if (!group) {
|
||||
sd = sd->child;
|
||||
continue;
|
||||
}
|
||||
|
||||
new_cpu = find_idlest_cpu(group, p, cpu);
|
||||
if (new_cpu == -1 || new_cpu == cpu) {
|
||||
/* Now try balancing at a lower domain level of cpu */
|
||||
sd = sd->child;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Now try balancing at a lower domain level of new_cpu */
|
||||
cpu = new_cpu;
|
||||
weight = cpumask_weight(sched_domain_span(sd));
|
||||
sd = NULL;
|
||||
for_each_domain(cpu, tmp) {
|
||||
if (weight <= cpumask_weight(sched_domain_span(tmp)))
|
||||
break;
|
||||
if (tmp->flags & sd_flag)
|
||||
sd = tmp;
|
||||
}
|
||||
/* while loop will break here if sd == NULL */
|
||||
}
|
||||
|
||||
out:
|
||||
return wake_idle(new_cpu, p);
|
||||
rcu_read_unlock();
|
||||
return new_cpu;
|
||||
}
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
@@ -1471,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
|
||||
/*
|
||||
* Preempt the current task with a newly woken task if needed:
|
||||
*/
|
||||
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
|
||||
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
|
||||
{
|
||||
struct task_struct *curr = rq->curr;
|
||||
struct sched_entity *se = &curr->se, *pse = &p->se;
|
||||
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
||||
int sync = wake_flags & WF_SYNC;
|
||||
|
||||
update_curr(cfs_rq);
|
||||
|
||||
@@ -1501,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
|
||||
*/
|
||||
if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
|
||||
set_last_buddy(se);
|
||||
set_next_buddy(pse);
|
||||
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
|
||||
set_next_buddy(pse);
|
||||
|
||||
/*
|
||||
* We can come here with TIF_NEED_RESCHED already set from new task
|
||||
@@ -1523,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
|
||||
return;
|
||||
}
|
||||
|
||||
if (!sched_feat(WAKEUP_PREEMPT))
|
||||
return;
|
||||
|
||||
if (sched_feat(WAKEUP_OVERLAP) && (sync ||
|
||||
(se->avg_overlap < sysctl_sched_migration_cost &&
|
||||
pse->avg_overlap < sysctl_sched_migration_cost))) {
|
||||
if ((sched_feat(WAKEUP_SYNC) && sync) ||
|
||||
(sched_feat(WAKEUP_OVERLAP) &&
|
||||
(se->avg_overlap < sysctl_sched_migration_cost &&
|
||||
pse->avg_overlap < sysctl_sched_migration_cost))) {
|
||||
resched_task(curr);
|
||||
return;
|
||||
}
|
||||
|
||||
if (sched_feat(WAKEUP_RUNNING)) {
|
||||
if (pse->avg_running < se->avg_running) {
|
||||
set_next_buddy(pse);
|
||||
resched_task(curr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!sched_feat(WAKEUP_PREEMPT))
|
||||
return;
|
||||
|
||||
find_matching_se(&se, &pse);
|
||||
|
||||
BUG_ON(!pse);
|
||||
@@ -1555,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
|
||||
/*
|
||||
* If se was a buddy, clear it so that it will have to earn
|
||||
* the favour again.
|
||||
*
|
||||
* If se was not a buddy, clear the buddies because neither
|
||||
* was elegible to run, let them earn it again.
|
||||
*
|
||||
* IOW. unconditionally clear buddies.
|
||||
*/
|
||||
__clear_buddies(cfs_rq, se);
|
||||
__clear_buddies(cfs_rq, NULL);
|
||||
set_next_entity(cfs_rq, se);
|
||||
cfs_rq = group_cfs_rq(se);
|
||||
} while (cfs_rq);
|
||||
@@ -1832,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
|
||||
}
|
||||
#endif
|
||||
|
||||
unsigned int get_rr_interval_fair(struct task_struct *task)
|
||||
{
|
||||
struct sched_entity *se = &task->se;
|
||||
unsigned long flags;
|
||||
struct rq *rq;
|
||||
unsigned int rr_interval = 0;
|
||||
|
||||
/*
|
||||
* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
|
||||
* idle runqueue:
|
||||
*/
|
||||
rq = task_rq_lock(task, &flags);
|
||||
if (rq->cfs.load.weight)
|
||||
rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
|
||||
task_rq_unlock(rq, &flags);
|
||||
|
||||
return rr_interval;
|
||||
}
|
||||
|
||||
/*
|
||||
* All the scheduling class methods:
|
||||
*/
|
||||
@@ -1860,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
|
||||
.prio_changed = prio_changed_fair,
|
||||
.switched_to = switched_to_fair,
|
||||
|
||||
.get_rr_interval = get_rr_interval_fair,
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
.moved_group = moved_group_fair,
|
||||
#endif
|
||||
|
@@ -1,17 +1,123 @@
SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
/*
 * Disregards a certain amount of sleep time (sched_latency_ns) and
 * considers the task to be running during that period. This gives it
 * a service deficit on wakeup, allowing it to run sooner.
 */
SCHED_FEAT(FAIR_SLEEPERS, 1)

/*
 * Only give sleepers 50% of their service deficit. This allows
 * them to run sooner, but does not allow tons of sleepers to
 * rip the spread apart.
 */
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)

/*
 * By not normalizing the sleep time, heavy tasks get an effective
 * longer period, and lighter task an effective shorter period they
 * are considered running.
 */
SCHED_FEAT(NORMALIZED_SLEEPER, 0)
SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)

/*
 * Place new tasks ahead so that they do not starve already running
 * tasks
 */
SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
SCHED_FEAT(CACHE_HOT_BUDDY, 1)

/*
 * Should wakeups try to preempt running tasks.
 */
SCHED_FEAT(WAKEUP_PREEMPT, 1)

/*
 * Compute wakeup_gran based on task behaviour, clipped to
 * [0, sched_wakeup_gran_ns]
 */
SCHED_FEAT(ADAPTIVE_GRAN, 1)

/*
 * When converting the wakeup granularity to virtual time, do it such
 * that heavier tasks preempting a lighter task have an edge.
 */
SCHED_FEAT(ASYM_GRAN, 1)

/*
 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
 */
SCHED_FEAT(WAKEUP_SYNC, 0)

/*
 * Wakeup preempt based on task behaviour. Tasks that do not overlap
 * don't get preempted.
 */
SCHED_FEAT(WAKEUP_OVERLAP, 0)

/*
 * Wakeup preemption towards tasks that run short
 */
SCHED_FEAT(WAKEUP_RUNNING, 0)

/*
 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
 * the remote end is likely to consume the data we just wrote, and
 * therefore has cache benefit from being placed on the same cpu, see
 * also AFFINE_WAKEUPS.
 */
SCHED_FEAT(SYNC_WAKEUPS, 1)

/*
 * Based on load and program behaviour, see if it makes sense to place
 * a newly woken task on the same cpu as the task that woke it --
 * improve cache locality. Typically used with SYNC wakeups as
 * generated by pipes and the like, see also SYNC_WAKEUPS.
 */
SCHED_FEAT(AFFINE_WAKEUPS, 1)

/*
 * Weaken SYNC hint based on overlap
 */
SCHED_FEAT(SYNC_LESS, 1)

/*
 * Add SYNC hint based on overlap
 */
SCHED_FEAT(SYNC_MORE, 0)

/*
 * Prefer to schedule the task we woke last (assuming it failed
 * wakeup-preemption), since its likely going to consume data we
 * touched, increases cache locality.
 */
SCHED_FEAT(NEXT_BUDDY, 0)

/*
 * Prefer to schedule the task that ran last (when we did
 * wake-preempt) as that likely will touch the same data, increases
 * cache locality.
 */
SCHED_FEAT(LAST_BUDDY, 1)

/*
 * Consider buddies to be cache hot, decreases the likelyness of a
 * cache buddy being migrated away, increases cache locality.
 */
SCHED_FEAT(CACHE_HOT_BUDDY, 1)

/*
 * Use arch dependent cpu power functions
 */
SCHED_FEAT(ARCH_POWER, 0)

SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(LB_SHARES_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
SCHED_FEAT(LAST_BUDDY, 1)

/*
 * Spin-wait on mutex acquisition when the mutex owner is running on
 * another cpu -- assumes that when the owner is running, it will soon
 * release the lock. Decreases scheduling overhead.
 */
SCHED_FEAT(OWNER_SPIN, 1)

@@ -6,7 +6,7 @@
 */

#ifdef CONFIG_SMP
static int select_task_rq_idle(struct task_struct *p, int sync)
static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks as never migrated */
}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
/*
 * Idle tasks are unconditionally rescheduled:
 */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_task(rq->idle);
}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
	check_preempt_curr(rq, p, 0);
}

unsigned int get_rr_interval_idle(struct task_struct *task)
{
	return 0;
}

/*
 * Simple, special scheduling class for the per-CPU idle tasks:
 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
	.set_curr_task = set_curr_task_idle,
	.task_tick = task_tick_idle,

	.get_rr_interval = get_rr_interval_idle,

	.prio_changed = prio_changed_idle,
	.switched_to = switched_to_idle,

@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int select_task_rq_rt(struct task_struct *p, int sync)
static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
	struct rq *rq = task_rq(p);

	if (sd_flag != SD_BALANCE_WAKE)
		return smp_processor_id();

	/*
	 * If the current task is an RT task, then
	 * try to see if we can wake this RT task up on another
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_task(rq->curr);
@@ -1731,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
	dequeue_pushable_task(rq, p);
}

unsigned int get_rr_interval_rt(struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return DEF_TIMESLICE;
	else
		return 0;
}

static const struct sched_class rt_sched_class = {
	.next = &fair_sched_class,
	.enqueue_task = enqueue_task_rt,
@@ -1759,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
	.set_curr_task = set_curr_task_rt,
	.task_tick = task_tick_rt,

	.get_rr_interval = get_rr_interval_rt,

	.prio_changed = prio_changed_rt,
	.switched_to = switched_to_rt,
};

kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)

	if (why) {
		/*
		 * The first thread which returns from finish_stop()
		 * The first thread which returns from do_signal_stop()
		 * will take ->siglock, notice SIGNAL_CLD_MASK, and
		 * notify its parent. See get_signal_to_deliver().
		 */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
	return send_signal(sig, info, t, 0);
}

int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
			bool group)
{
	unsigned long flags;
	int ret = -ESRCH;

	if (lock_task_sighand(p, &flags)) {
		ret = send_signal(sig, info, p, group);
		unlock_task_sighand(p, &flags);
	}

	return ret;
}

/*
|
||||
* Force a signal that the process can't ignore: if necessary
|
||||
* we unblock the signal and change any SIG_IGN to SIG_DFL.
|
||||
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
|
||||
}
|
||||
}
|
||||
|
||||
int __fatal_signal_pending(struct task_struct *tsk)
|
||||
{
|
||||
return sigismember(&tsk->pending.signal, SIGKILL);
|
||||
}
|
||||
EXPORT_SYMBOL(__fatal_signal_pending);
|
||||
|
||||
struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
|
||||
{
|
||||
struct sighand_struct *sighand;
|
||||
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
|
||||
*/
|
||||
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
int ret = check_kill_permission(sig, info, p);
|
||||
|
||||
ret = check_kill_permission(sig, info, p);
|
||||
|
||||
if (!ret && sig) {
|
||||
ret = -ESRCH;
|
||||
if (lock_task_sighand(p, &flags)) {
|
||||
ret = __group_send_sig_info(sig, info, p);
|
||||
unlock_task_sighand(p, &flags);
|
||||
}
|
||||
}
|
||||
if (!ret && sig)
|
||||
ret = do_send_sig_info(sig, info, p, true);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
|
||||
* These are for backward compatibility with the rest of the kernel source.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The caller must ensure the task can't exit.
|
||||
*/
|
||||
int
|
||||
send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
|
||||
{
|
||||
int ret;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* Make sure legacy kernel users don't send in bad values
|
||||
* (normal paths check this in check_kill_permission).
|
||||
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
|
||||
if (!valid_signal(sig))
|
||||
return -EINVAL;
|
||||
|
||||
spin_lock_irqsave(&p->sighand->siglock, flags);
|
||||
ret = specific_send_sig_info(sig, info, p);
|
||||
spin_unlock_irqrestore(&p->sighand->siglock, flags);
|
||||
return ret;
|
||||
return do_send_sig_info(sig, info, p, false);
|
||||
}
|
||||
|
||||
#define __si_special(priv) \
|
||||
@@ -1382,15 +1373,6 @@ ret:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wake up any threads in the parent blocked in wait* syscalls.
|
||||
*/
|
||||
static inline void __wake_up_parent(struct task_struct *p,
|
||||
struct task_struct *parent)
|
||||
{
|
||||
wake_up_interruptible_sync(&parent->signal->wait_chldexit);
|
||||
}
|
||||
|
||||
/*
|
||||
* Let a parent know about the death of a child.
|
||||
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
|
||||
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
|
||||
spin_unlock_irq(&current->sighand->siglock);
|
||||
}
|
||||
|
||||
static void
|
||||
finish_stop(int stop_count)
|
||||
{
|
||||
/*
|
||||
* If there are no other threads in the group, or if there is
|
||||
* a group stop in progress and we are the last to stop,
|
||||
* report to the parent. When ptraced, every thread reports itself.
|
||||
*/
|
||||
if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
|
||||
read_lock(&tasklist_lock);
|
||||
do_notify_parent_cldstop(current, CLD_STOPPED);
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
|
||||
do {
|
||||
schedule();
|
||||
} while (try_to_freeze());
|
||||
/*
|
||||
* Now we don't run again until continued.
|
||||
*/
|
||||
current->exit_code = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This performs the stopping for SIGSTOP and other stop signals.
|
||||
* We have to stop all threads in the thread group.
|
||||
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
|
||||
static int do_signal_stop(int signr)
|
||||
{
|
||||
struct signal_struct *sig = current->signal;
|
||||
int stop_count;
|
||||
int notify;
|
||||
|
||||
if (sig->group_stop_count > 0) {
|
||||
/*
|
||||
* There is a group stop in progress. We don't need to
|
||||
* start another one.
|
||||
*/
|
||||
stop_count = --sig->group_stop_count;
|
||||
} else {
|
||||
if (!sig->group_stop_count) {
|
||||
struct task_struct *t;
|
||||
|
||||
if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
|
||||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
|
||||
*/
|
||||
sig->group_exit_code = signr;
|
||||
|
||||
stop_count = 0;
|
||||
sig->group_stop_count = 1;
|
||||
for (t = next_thread(current); t != current; t = next_thread(t))
|
||||
/*
|
||||
* Setting state to TASK_STOPPED for a group
|
||||
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
|
||||
*/
|
||||
if (!(t->flags & PF_EXITING) &&
|
||||
!task_is_stopped_or_traced(t)) {
|
||||
stop_count++;
|
||||
sig->group_stop_count++;
|
||||
signal_wake_up(t, 0);
|
||||
}
|
||||
sig->group_stop_count = stop_count;
|
||||
}
|
||||
/*
|
||||
* If there are no other threads in the group, or if there is
|
||||
* a group stop in progress and we are the last to stop, report
|
||||
* to the parent. When ptraced, every thread reports itself.
|
||||
*/
|
||||
notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
|
||||
notify = tracehook_notify_jctl(notify, CLD_STOPPED);
|
||||
/*
|
||||
* tracehook_notify_jctl() can drop and reacquire siglock, so
|
||||
* we keep ->group_stop_count != 0 before the call. If SIGCONT
|
||||
* or SIGKILL comes in between ->group_stop_count == 0.
|
||||
*/
|
||||
if (sig->group_stop_count) {
|
||||
if (!--sig->group_stop_count)
|
||||
sig->flags = SIGNAL_STOP_STOPPED;
|
||||
current->exit_code = sig->group_exit_code;
|
||||
__set_current_state(TASK_STOPPED);
|
||||
}
|
||||
spin_unlock_irq(&current->sighand->siglock);
|
||||
|
||||
if (notify) {
|
||||
read_lock(&tasklist_lock);
|
||||
do_notify_parent_cldstop(current, notify);
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
|
||||
if (stop_count == 0)
|
||||
sig->flags = SIGNAL_STOP_STOPPED;
|
||||
current->exit_code = sig->group_exit_code;
|
||||
__set_current_state(TASK_STOPPED);
|
||||
/* Now we don't run again until woken by SIGCONT or SIGKILL */
|
||||
do {
|
||||
schedule();
|
||||
} while (try_to_freeze());
|
||||
|
||||
tracehook_finish_jctl();
|
||||
current->exit_code = 0;
|
||||
|
||||
spin_unlock_irq(&current->sighand->siglock);
|
||||
finish_stop(stop_count);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1815,14 +1793,15 @@ relock:
|
||||
int why = (signal->flags & SIGNAL_STOP_CONTINUED)
|
||||
? CLD_CONTINUED : CLD_STOPPED;
|
||||
signal->flags &= ~SIGNAL_CLD_MASK;
|
||||
|
||||
why = tracehook_notify_jctl(why, CLD_CONTINUED);
|
||||
spin_unlock_irq(&sighand->siglock);
|
||||
|
||||
if (unlikely(!tracehook_notify_jctl(1, why)))
|
||||
goto relock;
|
||||
|
||||
read_lock(&tasklist_lock);
|
||||
do_notify_parent_cldstop(current->group_leader, why);
|
||||
read_unlock(&tasklist_lock);
|
||||
if (why) {
|
||||
read_lock(&tasklist_lock);
|
||||
do_notify_parent_cldstop(current->group_leader, why);
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
goto relock;
|
||||
}
|
||||
|
||||
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
|
||||
if (unlikely(tsk->signal->group_stop_count) &&
|
||||
!--tsk->signal->group_stop_count) {
|
||||
tsk->signal->flags = SIGNAL_STOP_STOPPED;
|
||||
group_stop = 1;
|
||||
group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
|
||||
}
|
||||
out:
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
|
||||
if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
|
||||
if (unlikely(group_stop)) {
|
||||
read_lock(&tasklist_lock);
|
||||
do_notify_parent_cldstop(tsk, CLD_STOPPED);
|
||||
do_notify_parent_cldstop(tsk, group_stop);
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
}
|
||||
@@ -2290,7 +2269,6 @@ static int
|
||||
do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
|
||||
{
|
||||
struct task_struct *p;
|
||||
unsigned long flags;
|
||||
int error = -ESRCH;
|
||||
|
||||
rcu_read_lock();
|
||||
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
|
||||
/*
|
||||
* The null signal is a permissions and process existence
|
||||
* probe. No signal is actually delivered.
|
||||
*
|
||||
* If lock_task_sighand() fails we pretend the task dies
|
||||
* after receiving the signal. The window is tiny, and the
|
||||
* signal is private anyway.
|
||||
*/
|
||||
if (!error && sig && lock_task_sighand(p, &flags)) {
|
||||
error = specific_send_sig_info(sig, info, p);
|
||||
unlock_task_sighand(p, &flags);
|
||||
if (!error && sig) {
|
||||
error = do_send_sig_info(sig, info, p, false);
|
||||
/*
|
||||
* If lock_task_sighand() failed we pretend the task
|
||||
* dies after receiving the signal. The window is tiny,
|
||||
* and the signal is private anyway.
|
||||
*/
|
||||
if (unlikely(error == -ESRCH))
|
||||
error = 0;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
|
||||
static void slow_work_oom_timeout(unsigned long);
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
|
||||
static int slow_work_min_threads_sysctl(struct ctl_table *, int,
|
||||
void __user *, size_t *, loff_t *);
|
||||
|
||||
static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
|
||||
static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
|
||||
void __user *, size_t *, loff_t *);
|
||||
#endif
|
||||
|
||||
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
|
||||
* Handle adjustment of the minimum number of threads
|
||||
*/
|
||||
static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
|
||||
struct file *filp, void __user *buffer,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
|
||||
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
int n;
|
||||
|
||||
if (ret == 0) {
|
||||
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
|
||||
* Handle adjustment of the maximum number of threads
|
||||
*/
|
||||
static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
|
||||
struct file *filp, void __user *buffer,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
|
||||
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
int n;
|
||||
|
||||
if (ret == 0) {
|
||||
|
36
kernel/smp.c
@@ -29,8 +29,7 @@ enum {
|
||||
|
||||
struct call_function_data {
|
||||
struct call_single_data csd;
|
||||
spinlock_t lock;
|
||||
unsigned int refs;
|
||||
atomic_t refs;
|
||||
cpumask_var_t cpumask;
|
||||
};
|
||||
|
||||
@@ -39,9 +38,7 @@ struct call_single_queue {
|
||||
spinlock_t lock;
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
|
||||
.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
|
||||
};
|
||||
static DEFINE_PER_CPU(struct call_function_data, cfd_data);
|
||||
|
||||
static int
|
||||
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
||||
@@ -196,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
|
||||
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
|
||||
int refs;
|
||||
|
||||
spin_lock(&data->lock);
|
||||
if (!cpumask_test_cpu(cpu, data->cpumask)) {
|
||||
spin_unlock(&data->lock);
|
||||
if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
|
||||
continue;
|
||||
}
|
||||
cpumask_clear_cpu(cpu, data->cpumask);
|
||||
spin_unlock(&data->lock);
|
||||
|
||||
data->csd.func(data->csd.info);
|
||||
|
||||
spin_lock(&data->lock);
|
||||
WARN_ON(data->refs == 0);
|
||||
refs = --data->refs;
|
||||
refs = atomic_dec_return(&data->refs);
|
||||
WARN_ON(refs < 0);
|
||||
if (!refs) {
|
||||
spin_lock(&call_function.lock);
|
||||
list_del_rcu(&data->csd.list);
|
||||
spin_unlock(&call_function.lock);
|
||||
}
|
||||
spin_unlock(&data->lock);
|
||||
|
||||
if (refs)
|
||||
continue;
|
||||
@@ -357,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
|
||||
generic_exec_single(cpu, data, wait);
|
||||
}
|
||||
|
||||
/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
|
||||
|
||||
#ifndef arch_send_call_function_ipi_mask
|
||||
# define arch_send_call_function_ipi_mask(maskp) \
|
||||
arch_send_call_function_ipi(*(maskp))
|
||||
#endif
|
||||
|
||||
/**
|
||||
* smp_call_function_many(): Run a function on a set of other CPUs.
|
||||
* @mask: The set of cpus to run on (only runs on online subset).
|
||||
@@ -419,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask,
|
||||
data = &__get_cpu_var(cfd_data);
|
||||
csd_lock(&data->csd);
|
||||
|
||||
spin_lock_irqsave(&data->lock, flags);
|
||||
data->csd.func = func;
|
||||
data->csd.info = info;
|
||||
cpumask_and(data->cpumask, mask, cpu_online_mask);
|
||||
cpumask_clear_cpu(this_cpu, data->cpumask);
|
||||
data->refs = cpumask_weight(data->cpumask);
|
||||
atomic_set(&data->refs, cpumask_weight(data->cpumask));
|
||||
|
||||
spin_lock(&call_function.lock);
|
||||
spin_lock_irqsave(&call_function.lock, flags);
|
||||
/*
|
||||
* Place entry at the _HEAD_ of the list, so that any cpu still
|
||||
* observing the entry in generic_smp_call_function_interrupt()
|
||||
* will not miss any other list entries:
|
||||
*/
|
||||
list_add_rcu(&data->csd.list, &call_function.queue);
|
||||
spin_unlock(&call_function.lock);
|
||||
|
||||
spin_unlock_irqrestore(&data->lock, flags);
|
||||
spin_unlock_irqrestore(&call_function.lock, flags);
|
||||
|
||||
/*
|
||||
* Make the list addition visible before sending the ipi.
|
||||
|
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
|
||||
EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
|
||||
|
||||
int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
|
||||
struct file *filp, void __user *buffer,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
touch_all_softlockup_watchdogs();
|
||||
return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
|
||||
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
}
|
||||
|
||||
/*
|
||||
|
46
kernel/sys.c
@@ -14,7 +14,7 @@
|
||||
#include <linux/prctl.h>
|
||||
#include <linux/highuid.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/perf_counter.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <linux/resource.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/kexec.h>
|
||||
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
|
||||
unsigned long flags;
|
||||
cputime_t utime, stime;
|
||||
struct task_cputime cputime;
|
||||
unsigned long maxrss = 0;
|
||||
|
||||
memset((char *) r, 0, sizeof *r);
|
||||
utime = stime = cputime_zero;
|
||||
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
|
||||
utime = task_utime(current);
|
||||
stime = task_stime(current);
|
||||
accumulate_thread_rusage(p, r);
|
||||
maxrss = p->signal->maxrss;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
|
||||
r->ru_majflt = p->signal->cmaj_flt;
|
||||
r->ru_inblock = p->signal->cinblock;
|
||||
r->ru_oublock = p->signal->coublock;
|
||||
maxrss = p->signal->cmaxrss;
|
||||
|
||||
if (who == RUSAGE_CHILDREN)
|
||||
break;
|
||||
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
|
||||
r->ru_majflt += p->signal->maj_flt;
|
||||
r->ru_inblock += p->signal->inblock;
|
||||
r->ru_oublock += p->signal->oublock;
|
||||
if (maxrss < p->signal->maxrss)
|
||||
maxrss = p->signal->maxrss;
|
||||
t = p;
|
||||
do {
|
||||
accumulate_thread_rusage(t, r);
|
||||
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
|
||||
out:
|
||||
cputime_to_timeval(utime, &r->ru_utime);
|
||||
cputime_to_timeval(stime, &r->ru_stime);
|
||||
|
||||
if (who != RUSAGE_CHILDREN) {
|
||||
struct mm_struct *mm = get_task_mm(p);
|
||||
if (mm) {
|
||||
setmax_mm_hiwater_rss(&maxrss, mm);
|
||||
mmput(mm);
|
||||
}
|
||||
}
|
||||
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
|
||||
}
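The maxrss plumbing above surfaces in userspace as rusage.ru_maxrss, reported in kilobytes. A small self-contained example of reading it (assumed typical usage, not part of the patch):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_SELF, &ru) == 0)
		printf("peak RSS: %ld kB\n", ru.ru_maxrss);
	return 0;
}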
|
||||
|
||||
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
|
||||
@@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
|
||||
case PR_SET_TSC:
|
||||
error = SET_TSC_CTL(arg2);
|
||||
break;
|
||||
case PR_TASK_PERF_COUNTERS_DISABLE:
|
||||
error = perf_counter_task_disable();
|
||||
case PR_TASK_PERF_EVENTS_DISABLE:
|
||||
error = perf_event_task_disable();
|
||||
break;
|
||||
case PR_TASK_PERF_COUNTERS_ENABLE:
|
||||
error = perf_counter_task_enable();
|
||||
case PR_TASK_PERF_EVENTS_ENABLE:
|
||||
error = perf_event_task_enable();
|
||||
break;
|
||||
case PR_GET_TIMERSLACK:
|
||||
error = current->timer_slack_ns;
|
||||
@@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
|
||||
current->timer_slack_ns = arg2;
|
||||
error = 0;
|
||||
break;
|
||||
case PR_MCE_KILL:
|
||||
if (arg4 | arg5)
|
||||
return -EINVAL;
|
||||
switch (arg2) {
|
||||
case 0:
|
||||
if (arg3 != 0)
|
||||
return -EINVAL;
|
||||
current->flags &= ~PF_MCE_PROCESS;
|
||||
break;
|
||||
case 1:
|
||||
current->flags |= PF_MCE_PROCESS;
|
||||
if (arg3 != 0)
|
||||
current->flags |= PF_MCE_EARLY;
|
||||
else
|
||||
current->flags &= ~PF_MCE_EARLY;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
error = 0;
|
||||
break;
|
||||
|
||||
default:
|
||||
error = -EINVAL;
|
||||
break;
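A hedged userspace example of the PR_MCE_KILL option added above; the raw argument values follow the switch in this hunk (arg2 = 1 sets the per-process policy, arg3 != 0 selects early kill, arg4/arg5 must be 0) and assume the build system's <sys/prctl.h> exposes PR_MCE_KILL.

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Opt this process into early kill on hardware memory failures. */
	if (prctl(PR_MCE_KILL, 1, 1, 0, 0) != 0)
		perror("prctl(PR_MCE_KILL)");
	return 0;
}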
|
||||
|
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
|
||||
cond_syscall(compat_sys_sendmsg);
|
||||
cond_syscall(sys_recvmsg);
|
||||
cond_syscall(compat_sys_recvmsg);
|
||||
cond_syscall(compat_sys_recvfrom);
|
||||
cond_syscall(sys_socketcall);
|
||||
cond_syscall(sys_futex);
|
||||
cond_syscall(compat_sys_futex);
|
||||
@@ -177,4 +178,4 @@ cond_syscall(sys_eventfd);
|
||||
cond_syscall(sys_eventfd2);
|
||||
|
||||
/* performance counters: */
|
||||
cond_syscall(sys_perf_counter_open);
|
||||
cond_syscall(sys_perf_event_open);
|
||||
|
149
kernel/sysctl.c
@@ -26,7 +26,6 @@
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/kmemcheck.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/fs.h>
|
||||
@@ -50,7 +49,7 @@
|
||||
#include <linux/reboot.h>
|
||||
#include <linux/ftrace.h>
|
||||
#include <linux/slow-work.h>
|
||||
#include <linux/perf_counter.h>
|
||||
#include <linux/perf_event.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/processor.h>
|
||||
@@ -77,6 +76,7 @@ extern int max_threads;
|
||||
extern int core_uses_pid;
|
||||
extern int suid_dumpable;
|
||||
extern char core_pattern[];
|
||||
extern unsigned int core_pipe_limit;
|
||||
extern int pid_max;
|
||||
extern int min_free_kbytes;
|
||||
extern int pid_max_min, pid_max_max;
|
||||
@@ -106,6 +106,9 @@ static int __maybe_unused one = 1;
|
||||
static int __maybe_unused two = 2;
|
||||
static unsigned long one_ul = 1;
|
||||
static int one_hundred = 100;
|
||||
#ifdef CONFIG_PRINTK
|
||||
static int ten_thousand = 10000;
|
||||
#endif
|
||||
|
||||
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
|
||||
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
|
||||
@@ -160,9 +163,9 @@ extern int max_lock_depth;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PROC_SYSCTL
|
||||
static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
|
||||
static int proc_do_cad_pid(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos);
|
||||
static int proc_taint(struct ctl_table *table, int write, struct file *filp,
|
||||
static int proc_taint(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos);
|
||||
#endif
|
||||
|
||||
@@ -421,6 +424,14 @@ static struct ctl_table kern_table[] = {
|
||||
.proc_handler = &proc_dostring,
|
||||
.strategy = &sysctl_string,
|
||||
},
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "core_pipe_limit",
|
||||
.data = &core_pipe_limit,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec,
|
||||
},
|
||||
#ifdef CONFIG_PROC_SYSCTL
|
||||
{
|
||||
.procname = "tainted",
|
||||
@@ -722,6 +733,17 @@ static struct ctl_table kern_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec,
|
||||
},
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "printk_delay",
|
||||
.data = &printk_delay_msec,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec_minmax,
|
||||
.strategy = &sysctl_intvec,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &ten_thousand,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.ctl_name = KERN_NGROUPS_MAX,
|
||||
@@ -964,28 +986,28 @@ static struct ctl_table kern_table[] = {
|
||||
.child = slow_work_sysctls,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_PERF_COUNTERS
|
||||
#ifdef CONFIG_PERF_EVENTS
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "perf_counter_paranoid",
|
||||
.data = &sysctl_perf_counter_paranoid,
|
||||
.maxlen = sizeof(sysctl_perf_counter_paranoid),
|
||||
.procname = "perf_event_paranoid",
|
||||
.data = &sysctl_perf_event_paranoid,
|
||||
.maxlen = sizeof(sysctl_perf_event_paranoid),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec,
|
||||
},
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "perf_counter_mlock_kb",
|
||||
.data = &sysctl_perf_counter_mlock,
|
||||
.maxlen = sizeof(sysctl_perf_counter_mlock),
|
||||
.procname = "perf_event_mlock_kb",
|
||||
.data = &sysctl_perf_event_mlock,
|
||||
.maxlen = sizeof(sysctl_perf_event_mlock),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec,
|
||||
},
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "perf_counter_max_sample_rate",
|
||||
.data = &sysctl_perf_counter_sample_rate,
|
||||
.maxlen = sizeof(sysctl_perf_counter_sample_rate),
|
||||
.procname = "perf_event_max_sample_rate",
|
||||
.data = &sysctl_perf_event_sample_rate,
|
||||
.maxlen = sizeof(sysctl_perf_event_sample_rate),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec,
|
||||
},
|
||||
@@ -1376,6 +1398,31 @@ static struct ctl_table vm_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = &scan_unevictable_handler,
|
||||
},
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "memory_failure_early_kill",
|
||||
.data = &sysctl_memory_failure_early_kill,
|
||||
.maxlen = sizeof(sysctl_memory_failure_early_kill),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec_minmax,
|
||||
.strategy = &sysctl_intvec,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
{
|
||||
.ctl_name = CTL_UNNUMBERED,
|
||||
.procname = "memory_failure_recovery",
|
||||
.data = &sysctl_memory_failure_recovery,
|
||||
.maxlen = sizeof(sysctl_memory_failure_recovery),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec_minmax,
|
||||
.strategy = &sysctl_intvec,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
#endif
|
||||
|
||||
/*
|
||||
* NOTE: do not add new entries to this table unless you have read
|
||||
* Documentation/sysctl/ctl_unnumbered.txt
|
||||
@@ -2204,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
|
||||
#ifdef CONFIG_PROC_SYSCTL
|
||||
|
||||
static int _proc_do_string(void* data, int maxlen, int write,
|
||||
struct file *filp, void __user *buffer,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
size_t len;
|
||||
@@ -2265,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
|
||||
* proc_dostring - read a string sysctl
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2279,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_dostring(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dostring(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return _proc_do_string(table->data, table->maxlen, write, filp,
|
||||
return _proc_do_string(table->data, table->maxlen, write,
|
||||
buffer, lenp, ppos);
|
||||
}
|
||||
|
||||
@@ -2307,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
|
||||
}
|
||||
|
||||
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
|
||||
int write, struct file *filp, void __user *buffer,
|
||||
int write, void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos,
|
||||
int (*conv)(int *negp, unsigned long *lvalp, int *valp,
|
||||
int write, void *data),
|
||||
@@ -2414,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
|
||||
#undef TMPBUFLEN
|
||||
}
|
||||
|
||||
static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
|
||||
static int do_proc_dointvec(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos,
|
||||
int (*conv)(int *negp, unsigned long *lvalp, int *valp,
|
||||
int write, void *data),
|
||||
void *data)
|
||||
{
|
||||
return __do_proc_dointvec(table->data, table, write, filp,
|
||||
return __do_proc_dointvec(table->data, table, write,
|
||||
buffer, lenp, ppos, conv, data);
|
||||
}
|
||||
|
||||
@@ -2428,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
|
||||
* proc_dointvec - read a vector of integers
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2438,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
|
||||
return do_proc_dointvec(table,write,buffer,lenp,ppos,
|
||||
NULL,NULL);
|
||||
}
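With the struct file * argument dropped throughout, every proc handler now uses the shorter prototype seen above. A minimal sketch of a custom handler written against the new signature; the my_value variable and handler name are made up for illustration.

static int my_value;

static int my_value_sysctl(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	/* React only to a successful write. */
	if (ret == 0 && write)
		printk(KERN_INFO "my_value is now %d\n", my_value);
	return ret;
}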
|
||||
|
||||
@@ -2449,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
|
||||
* Taint values can only be increased
|
||||
* This means we can safely use a temporary.
|
||||
*/
|
||||
static int proc_taint(struct ctl_table *table, int write, struct file *filp,
|
||||
static int proc_taint(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
struct ctl_table t;
|
||||
@@ -2461,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
|
||||
|
||||
t = *table;
|
||||
t.data = &tmptaint;
|
||||
err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
|
||||
err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
@@ -2513,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
|
||||
* proc_dointvec_minmax - read a vector of integers with min/max values
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2526,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_minmax(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
struct do_proc_dointvec_minmax_conv_param param = {
|
||||
.min = (int *) table->extra1,
|
||||
.max = (int *) table->extra2,
|
||||
};
|
||||
return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
|
||||
return do_proc_dointvec(table, write, buffer, lenp, ppos,
|
||||
do_proc_dointvec_minmax_conv, &param);
|
||||
}
|
||||
|
||||
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
|
||||
struct file *filp,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos,
|
||||
unsigned long convmul,
|
||||
@@ -2643,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
|
||||
}
|
||||
|
||||
static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
|
||||
struct file *filp,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos,
|
||||
unsigned long convmul,
|
||||
unsigned long convdiv)
|
||||
{
|
||||
return __do_proc_doulongvec_minmax(table->data, table, write,
|
||||
filp, buffer, lenp, ppos, convmul, convdiv);
|
||||
buffer, lenp, ppos, convmul, convdiv);
|
||||
}
|
||||
|
||||
/**
|
||||
* proc_doulongvec_minmax - read a vector of long integers with min/max values
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2670,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_doulongvec_minmax(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
|
||||
return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
|
||||
}
|
||||
|
||||
/**
|
||||
* proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2695,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
|
||||
struct file *filp,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_doulongvec_minmax(table, write, filp, buffer,
|
||||
return do_proc_doulongvec_minmax(table, write, buffer,
|
||||
lenp, ppos, HZ, 1000l);
|
||||
}
|
||||
|
||||
@@ -2775,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
|
||||
* proc_dointvec_jiffies - read a vector of integers as seconds
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2787,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_jiffies(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
|
||||
return do_proc_dointvec(table,write,buffer,lenp,ppos,
|
||||
do_proc_dointvec_jiffies_conv,NULL);
|
||||
}
|
||||
|
||||
@@ -2798,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
* proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: pointer to the file position
|
||||
@@ -2810,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
|
||||
return do_proc_dointvec(table,write,buffer,lenp,ppos,
|
||||
do_proc_dointvec_userhz_jiffies_conv,NULL);
|
||||
}
|
||||
|
||||
@@ -2821,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
|
||||
* proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
|
||||
* @table: the sysctl table
|
||||
* @write: %TRUE if this is a write to the sysctl file
|
||||
* @filp: the file structure
|
||||
* @buffer: the user buffer
|
||||
* @lenp: the size of the user buffer
|
||||
* @ppos: file position
|
||||
@@ -2834,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*/
|
||||
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
|
||||
return do_proc_dointvec(table, write, buffer, lenp, ppos,
|
||||
do_proc_dointvec_ms_jiffies_conv, NULL);
|
||||
}
|
||||
|
||||
static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
|
||||
static int proc_do_cad_pid(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
struct pid *new_pid;
|
||||
@@ -2850,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
|
||||
|
||||
tmp = pid_vnr(cad_pid);
|
||||
|
||||
r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
|
||||
r = __do_proc_dointvec(&tmp, table, write, buffer,
|
||||
lenp, ppos, NULL, NULL);
|
||||
if (r || !write)
|
||||
return r;
|
||||
@@ -2865,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
|
||||
|
||||
#else /* CONFIG_PROC_FS */
|
||||
|
||||
int proc_dostring(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dostring(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_minmax(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_jiffies(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
|
||||
int proc_doulongvec_minmax(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
|
||||
struct file *filp,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
|
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
|
||||
* 0 <= tv_nsec < NSEC_PER_SEC
|
||||
* For negative values only the tv_sec field is negative !
|
||||
*/
|
||||
void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
|
||||
void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
|
||||
{
|
||||
while (nsec >= NSEC_PER_SEC) {
|
||||
/*
|
||||
* The following asm() prevents the compiler from
|
||||
* optimising this loop into a modulo operation. See
|
||||
* also __iter_div_u64_rem() in include/linux/time.h
|
||||
*/
|
||||
asm("" : "+rm"(nsec));
|
||||
nsec -= NSEC_PER_SEC;
|
||||
++sec;
|
||||
}
|
||||
while (nsec < 0) {
|
||||
asm("" : "+rm"(nsec));
|
||||
nsec += NSEC_PER_SEC;
|
||||
--sec;
|
||||
}
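The empty asm() above is only a compiler barrier: with nsec widened to s64, letting the compiler collapse the loop into a divide/modulo would pull in an expensive 64-bit division on 32-bit targets, while nsec is normally already close to range. For reference, the loop is semantically equivalent to the closed form below (a sketch using div_s64_rem() from linux/math64.h, not code from this patch):

/* One-division normalization that the barrier deliberately avoids. */
s32 rem;

sec += div_s64_rem(nsec, NSEC_PER_SEC, &rem);
if (rem < 0) {
	rem += NSEC_PER_SEC;
	sec--;
}
nsec = rem;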
|
||||
|
@@ -1,4 +1,4 @@
|
||||
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
|
||||
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
|
||||
|
||||
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
|
||||
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
|
||||
|
@@ -21,7 +21,6 @@
|
||||
*
|
||||
* TODO WishList:
|
||||
* o Allow clocksource drivers to be unregistered
|
||||
* o get rid of clocksource_jiffies extern
|
||||
*/
|
||||
|
||||
#include <linux/clocksource.h>
|
||||
@@ -30,6 +29,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
|
||||
#include <linux/tick.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
void timecounter_init(struct timecounter *tc,
|
||||
const struct cyclecounter *cc,
|
||||
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
|
||||
}
|
||||
EXPORT_SYMBOL(timecounter_cyc2time);
|
||||
|
||||
/* XXX - Would like a better way for initializing curr_clocksource */
|
||||
extern struct clocksource clocksource_jiffies;
|
||||
|
||||
/*[Clocksource internal variables]---------
|
||||
* curr_clocksource:
|
||||
* currently selected clocksource. Initialized to clocksource_jiffies.
|
||||
* next_clocksource:
|
||||
* pending next selected clocksource.
|
||||
* currently selected clocksource.
|
||||
* clocksource_list:
|
||||
* linked list with the registered clocksources
|
||||
* clocksource_lock:
|
||||
* protects manipulations to curr_clocksource and next_clocksource
|
||||
* and the clocksource_list
|
||||
* clocksource_mutex:
|
||||
* protects manipulations to curr_clocksource and the clocksource_list
|
||||
* override_name:
|
||||
* Name of the user-specified clocksource.
|
||||
*/
|
||||
static struct clocksource *curr_clocksource = &clocksource_jiffies;
|
||||
static struct clocksource *next_clocksource;
|
||||
static struct clocksource *clocksource_override;
|
||||
static struct clocksource *curr_clocksource;
|
||||
static LIST_HEAD(clocksource_list);
|
||||
static DEFINE_SPINLOCK(clocksource_lock);
|
||||
static DEFINE_MUTEX(clocksource_mutex);
|
||||
static char override_name[32];
|
||||
static int finished_booting;
|
||||
|
||||
/* clocksource_done_booting - Called near the end of core bootup
|
||||
*
|
||||
* Hack to avoid lots of clocksource churn at boot time.
|
||||
* We use fs_initcall because we want this to start before
|
||||
* device_initcall but after subsys_initcall.
|
||||
*/
|
||||
static int __init clocksource_done_booting(void)
|
||||
{
|
||||
finished_booting = 1;
|
||||
return 0;
|
||||
}
|
||||
fs_initcall(clocksource_done_booting);
|
||||
|
||||
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
|
||||
static void clocksource_watchdog_work(struct work_struct *work);
|
||||
|
||||
static LIST_HEAD(watchdog_list);
|
||||
static struct clocksource *watchdog;
|
||||
static struct timer_list watchdog_timer;
|
||||
static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
|
||||
static DEFINE_SPINLOCK(watchdog_lock);
|
||||
static cycle_t watchdog_last;
|
||||
static unsigned long watchdog_resumed;
|
||||
static int watchdog_running;
|
||||
|
||||
static int clocksource_watchdog_kthread(void *data);
|
||||
static void __clocksource_change_rating(struct clocksource *cs, int rating);
|
||||
|
||||
/*
|
||||
* Interval: 0.5sec Threshold: 0.0625s
|
||||
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
|
||||
#define WATCHDOG_INTERVAL (HZ >> 1)
|
||||
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
|
||||
|
||||
static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
|
||||
static void clocksource_watchdog_work(struct work_struct *work)
|
||||
{
|
||||
if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
|
||||
return;
|
||||
/*
|
||||
* If kthread_run fails the next watchdog scan over the
|
||||
* watchdog_list will find the unstable clock again.
|
||||
*/
|
||||
kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
|
||||
}
|
||||
|
||||
static void __clocksource_unstable(struct clocksource *cs)
|
||||
{
|
||||
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
|
||||
cs->flags |= CLOCK_SOURCE_UNSTABLE;
|
||||
if (finished_booting)
|
||||
schedule_work(&watchdog_work);
|
||||
}
|
||||
|
||||
static void clocksource_unstable(struct clocksource *cs, int64_t delta)
|
||||
{
|
||||
printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
|
||||
cs->name, delta);
|
||||
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
|
||||
clocksource_change_rating(cs, 0);
|
||||
list_del(&cs->wd_list);
|
||||
__clocksource_unstable(cs);
|
||||
}
|
||||
|
||||
/**
|
||||
* clocksource_mark_unstable - mark clocksource unstable via watchdog
|
||||
* @cs: clocksource to be marked unstable
|
||||
*
|
||||
* This function is called instead of clocksource_change_rating from
|
||||
* cpu hotplug code to avoid a deadlock between the clocksource mutex
|
||||
* and the cpu hotplug mutex. It defers the update of the clocksource
|
||||
* to the watchdog thread.
|
||||
*/
|
||||
void clocksource_mark_unstable(struct clocksource *cs)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&watchdog_lock, flags);
|
||||
if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
|
||||
if (list_empty(&cs->wd_list))
|
||||
list_add(&cs->wd_list, &watchdog_list);
|
||||
__clocksource_unstable(cs);
|
||||
}
|
||||
spin_unlock_irqrestore(&watchdog_lock, flags);
|
||||
}
|
||||
|
||||
static void clocksource_watchdog(unsigned long data)
|
||||
{
|
||||
struct clocksource *cs, *tmp;
|
||||
struct clocksource *cs;
|
||||
cycle_t csnow, wdnow;
|
||||
int64_t wd_nsec, cs_nsec;
|
||||
int resumed;
|
||||
int next_cpu;
|
||||
|
||||
spin_lock(&watchdog_lock);
|
||||
|
||||
resumed = test_and_clear_bit(0, &watchdog_resumed);
|
||||
if (!watchdog_running)
|
||||
goto out;
|
||||
|
||||
wdnow = watchdog->read(watchdog);
|
||||
wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
|
||||
wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
|
||||
watchdog->mult, watchdog->shift);
|
||||
watchdog_last = wdnow;
|
||||
|
||||
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
|
||||
list_for_each_entry(cs, &watchdog_list, wd_list) {
|
||||
|
||||
/* Clocksource already marked unstable? */
|
||||
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
|
||||
if (finished_booting)
|
||||
schedule_work(&watchdog_work);
|
||||
continue;
|
||||
}
|
||||
|
||||
csnow = cs->read(cs);
|
||||
|
||||
if (unlikely(resumed)) {
|
||||
/* Clocksource initialized ? */
|
||||
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
|
||||
cs->flags |= CLOCK_SOURCE_WATCHDOG;
|
||||
cs->wd_last = csnow;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Initialized ? */
|
||||
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
|
||||
if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
|
||||
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
|
||||
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
|
||||
/*
|
||||
* We just marked the clocksource as
|
||||
* highres-capable, notify the rest of the
|
||||
* system as well so that we transition
|
||||
* into high-res mode:
|
||||
*/
|
||||
tick_clock_notify();
|
||||
}
|
||||
cs->flags |= CLOCK_SOURCE_WATCHDOG;
|
||||
cs->wd_last = csnow;
|
||||
} else {
|
||||
cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
|
||||
cs->wd_last = csnow;
|
||||
/* Check the delta. Might remove from the list ! */
|
||||
clocksource_ratewd(cs, cs_nsec - wd_nsec);
|
||||
/* Check the deviation from the watchdog clocksource. */
|
||||
cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
|
||||
cs->mask, cs->mult, cs->shift);
|
||||
cs->wd_last = csnow;
|
||||
if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
|
||||
clocksource_unstable(cs, cs_nsec - wd_nsec);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
|
||||
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
|
||||
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
|
||||
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
|
||||
/*
|
||||
* We just marked the clocksource as highres-capable,
|
||||
* notify the rest of the system as well so that we
|
||||
* transition into high-res mode:
|
||||
*/
|
||||
tick_clock_notify();
|
||||
}
|
||||
}
|
||||
|
||||
if (!list_empty(&watchdog_list)) {
|
||||
/*
|
||||
* Cycle through CPUs to check if the CPUs stay
|
||||
* synchronized to each other.
|
||||
*/
|
||||
int next_cpu = cpumask_next(raw_smp_processor_id(),
|
||||
cpu_online_mask);
|
||||
|
||||
if (next_cpu >= nr_cpu_ids)
|
||||
next_cpu = cpumask_first(cpu_online_mask);
|
||||
watchdog_timer.expires += WATCHDOG_INTERVAL;
|
||||
add_timer_on(&watchdog_timer, next_cpu);
|
||||
}
|
||||
/*
|
||||
* Cycle through CPUs to check if the CPUs stay synchronized
|
||||
* to each other.
|
||||
*/
|
||||
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
|
||||
if (next_cpu >= nr_cpu_ids)
|
||||
next_cpu = cpumask_first(cpu_online_mask);
|
||||
watchdog_timer.expires += WATCHDOG_INTERVAL;
|
||||
add_timer_on(&watchdog_timer, next_cpu);
|
||||
out:
|
||||
spin_unlock(&watchdog_lock);
|
||||
}
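For reference, clocksource_cyc2ns() used in the watchdog above is the standard fixed-point conversion from counter cycles to nanoseconds; the sketch below (helper name is illustrative) shows the conversion it performs, with mult and shift calibrated per clocksource:

static inline s64 cyc2ns_sketch(cycle_t cycles, u32 mult, u32 shift)
{
	return ((u64) cycles * mult) >> shift;
}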
|
||||
static void clocksource_resume_watchdog(void)
|
||||
|
||||
static inline void clocksource_start_watchdog(void)
|
||||
{
|
||||
set_bit(0, &watchdog_resumed);
|
||||
if (watchdog_running || !watchdog || list_empty(&watchdog_list))
|
||||
return;
|
||||
init_timer(&watchdog_timer);
|
||||
watchdog_timer.function = clocksource_watchdog;
|
||||
watchdog_last = watchdog->read(watchdog);
|
||||
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
|
||||
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
|
||||
watchdog_running = 1;
|
||||
}
|
||||
|
||||
static void clocksource_check_watchdog(struct clocksource *cs)
|
||||
static inline void clocksource_stop_watchdog(void)
|
||||
{
|
||||
if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
|
||||
return;
|
||||
del_timer(&watchdog_timer);
|
||||
watchdog_running = 0;
|
||||
}
|
||||
|
||||
static inline void clocksource_reset_watchdog(void)
|
||||
{
|
||||
struct clocksource *cs;
|
||||
|
||||
list_for_each_entry(cs, &watchdog_list, wd_list)
|
||||
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
|
||||
}
|
||||
|
||||
static void clocksource_resume_watchdog(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&watchdog_lock, flags);
|
||||
clocksource_reset_watchdog();
|
||||
spin_unlock_irqrestore(&watchdog_lock, flags);
|
||||
}
|
||||
|
||||
static void clocksource_enqueue_watchdog(struct clocksource *cs)
|
||||
{
|
||||
struct clocksource *cse;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&watchdog_lock, flags);
|
||||
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
|
||||
int started = !list_empty(&watchdog_list);
|
||||
|
||||
/* cs is a clocksource to be watched. */
|
||||
list_add(&cs->wd_list, &watchdog_list);
|
||||
if (!started && watchdog) {
|
||||
watchdog_last = watchdog->read(watchdog);
|
||||
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
|
||||
add_timer_on(&watchdog_timer,
|
||||
cpumask_first(cpu_online_mask));
|
||||
}
|
||||
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
|
||||
} else {
|
||||
/* cs is a watchdog. */
|
||||
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
|
||||
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
|
||||
|
||||
/* Pick the best watchdog. */
|
||||
if (!watchdog || cs->rating > watchdog->rating) {
|
||||
if (watchdog)
|
||||
del_timer(&watchdog_timer);
|
||||
watchdog = cs;
|
||||
init_timer(&watchdog_timer);
|
||||
watchdog_timer.function = clocksource_watchdog;
|
||||
|
||||
/* Reset watchdog cycles */
|
||||
list_for_each_entry(cse, &watchdog_list, wd_list)
|
||||
cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
|
||||
/* Start if list is not empty */
|
||||
if (!list_empty(&watchdog_list)) {
|
||||
watchdog_last = watchdog->read(watchdog);
|
||||
watchdog_timer.expires =
|
||||
jiffies + WATCHDOG_INTERVAL;
|
||||
add_timer_on(&watchdog_timer,
|
||||
cpumask_first(cpu_online_mask));
|
||||
}
|
||||
clocksource_reset_watchdog();
|
||||
}
|
||||
}
|
||||
/* Check if the watchdog timer needs to be started. */
|
||||
clocksource_start_watchdog();
|
||||
spin_unlock_irqrestore(&watchdog_lock, flags);
|
||||
}
|
||||
#else
|
||||
static void clocksource_check_watchdog(struct clocksource *cs)
|
||||
|
||||
static void clocksource_dequeue_watchdog(struct clocksource *cs)
|
||||
{
|
||||
struct clocksource *tmp;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&watchdog_lock, flags);
|
||||
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
|
||||
/* cs is a watched clocksource. */
|
||||
list_del_init(&cs->wd_list);
|
||||
} else if (cs == watchdog) {
|
||||
/* Reset watchdog cycles */
|
||||
clocksource_reset_watchdog();
|
||||
/* Current watchdog is removed. Find an alternative. */
|
||||
watchdog = NULL;
|
||||
list_for_each_entry(tmp, &clocksource_list, list) {
|
||||
if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
|
||||
continue;
|
||||
if (!watchdog || tmp->rating > watchdog->rating)
|
||||
watchdog = tmp;
|
||||
}
|
||||
}
|
||||
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
|
||||
/* Check if the watchdog timer needs to be stopped. */
|
||||
clocksource_stop_watchdog();
|
||||
spin_unlock_irqrestore(&watchdog_lock, flags);
|
||||
}
|
||||
|
||||
static int clocksource_watchdog_kthread(void *data)
|
||||
{
|
||||
struct clocksource *cs, *tmp;
|
||||
unsigned long flags;
|
||||
LIST_HEAD(unstable);
|
||||
|
||||
mutex_lock(&clocksource_mutex);
|
||||
spin_lock_irqsave(&watchdog_lock, flags);
|
||||
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
|
||||
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
|
||||
list_del_init(&cs->wd_list);
|
||||
list_add(&cs->wd_list, &unstable);
|
||||
}
|
||||
/* Check if the watchdog timer needs to be stopped. */
|
||||
clocksource_stop_watchdog();
|
||||
spin_unlock_irqrestore(&watchdog_lock, flags);
|
||||
|
||||
/* Needs to be done outside of watchdog lock */
|
||||
list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
|
||||
list_del_init(&cs->wd_list);
|
||||
__clocksource_change_rating(cs, 0);
|
||||
}
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
|
||||
|
||||
static void clocksource_enqueue_watchdog(struct clocksource *cs)
|
||||
{
|
||||
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
|
||||
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
|
||||
}
|
||||
|
||||
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
|
||||
static inline void clocksource_resume_watchdog(void) { }
|
||||
#endif
|
||||
static inline int clocksource_watchdog_kthread(void *data) { return 0; }
|
||||
|
||||
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
|
||||
|
||||
/**
|
||||
* clocksource_resume - resume the clocksource(s)
|
||||
@@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { }
|
||||
void clocksource_resume(void)
|
||||
{
|
||||
struct clocksource *cs;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&clocksource_lock, flags);
|
||||
|
||||
list_for_each_entry(cs, &clocksource_list, list) {
|
||||
list_for_each_entry(cs, &clocksource_list, list)
|
||||
if (cs->resume)
|
||||
cs->resume();
|
||||
}
|
||||
|
||||
clocksource_resume_watchdog();
|
||||
|
||||
spin_unlock_irqrestore(&clocksource_lock, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void)
|
||||
clocksource_resume_watchdog();
|
||||
}
|
||||
|
||||
/**
|
||||
* clocksource_get_next - Returns the selected clocksource
|
||||
*
|
||||
*/
|
||||
struct clocksource *clocksource_get_next(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&clocksource_lock, flags);
|
||||
if (next_clocksource && finished_booting) {
|
||||
curr_clocksource = next_clocksource;
|
||||
next_clocksource = NULL;
|
||||
}
|
||||
spin_unlock_irqrestore(&clocksource_lock, flags);
|
||||
|
||||
return curr_clocksource;
|
||||
}
|
||||
#ifdef CONFIG_GENERIC_TIME
|
||||
|
||||
/**
|
||||
* select_clocksource - Selects the best registered clocksource.
|
||||
* clocksource_select - Select the best clocksource available
|
||||
*
|
||||
* Private function. Must hold clocksource_lock when called.
|
||||
* Private function. Must hold clocksource_mutex when called.
|
||||
*
|
||||
* Select the clocksource with the best rating, or the clocksource,
|
||||
* which is selected by userspace override.
|
||||
*/
|
||||
static struct clocksource *select_clocksource(void)
|
||||
static void clocksource_select(void)
|
||||
{
|
||||
struct clocksource *next;
|
||||
struct clocksource *best, *cs;
|
||||
|
||||
if (list_empty(&clocksource_list))
|
||||
return NULL;
|
||||
|
||||
if (clocksource_override)
|
||||
next = clocksource_override;
|
||||
else
|
||||
next = list_entry(clocksource_list.next, struct clocksource,
|
||||
list);
|
||||
|
||||
if (next == curr_clocksource)
|
||||
return NULL;
|
||||
|
||||
return next;
|
||||
if (!finished_booting || list_empty(&clocksource_list))
|
||||
return;
|
||||
/* First clocksource on the list has the best rating. */
|
||||
best = list_first_entry(&clocksource_list, struct clocksource, list);
|
||||
/* Check for the override clocksource. */
|
||||
list_for_each_entry(cs, &clocksource_list, list) {
|
||||
if (strcmp(cs->name, override_name) != 0)
|
||||
continue;
|
||||
/*
|
||||
* Check to make sure we don't switch to a non-highres
|
||||
* capable clocksource if the tick code is in oneshot
|
||||
* mode (highres or nohz)
|
||||
*/
|
||||
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
|
||||
tick_oneshot_mode_active()) {
|
||||
/* Override clocksource cannot be used. */
|
||||
printk(KERN_WARNING "Override clocksource %s is not "
|
||||
"HRT compatible. Cannot switch while in "
|
||||
"HRT/NOHZ mode\n", cs->name);
|
||||
override_name[0] = 0;
|
||||
} else
|
||||
/* Override clocksource can be used. */
|
||||
best = cs;
|
||||
break;
|
||||
}
|
||||
if (curr_clocksource != best) {
|
||||
printk(KERN_INFO "Switching to clocksource %s\n", best->name);
|
||||
curr_clocksource = best;
|
||||
timekeeping_notify(curr_clocksource);
|
||||
}
|
||||
}
|
||||
|
||||
#else /* CONFIG_GENERIC_TIME */
|
||||
|
||||
static inline void clocksource_select(void) { }
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* clocksource_done_booting - Called near the end of core bootup
|
||||
*
|
||||
* Hack to avoid lots of clocksource churn at boot time.
|
||||
* We use fs_initcall because we want this to start before
|
||||
* device_initcall but after subsys_initcall.
|
||||
*/
|
||||
static int __init clocksource_done_booting(void)
|
||||
{
|
||||
finished_booting = 1;
|
||||
|
||||
/*
|
||||
* Run the watchdog first to eliminate unstable clock sources
|
||||
*/
|
||||
clocksource_watchdog_kthread(NULL);
|
||||
|
||||
mutex_lock(&clocksource_mutex);
|
||||
clocksource_select();
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
return 0;
|
||||
}
|
||||
fs_initcall(clocksource_done_booting);
|
||||
|
||||
/*
|
||||
* Enqueue the clocksource sorted by rating
|
||||
*/
|
||||
static int clocksource_enqueue(struct clocksource *c)
|
||||
static void clocksource_enqueue(struct clocksource *cs)
|
||||
{
|
||||
struct list_head *tmp, *entry = &clocksource_list;
|
||||
struct list_head *entry = &clocksource_list;
|
||||
struct clocksource *tmp;
|
||||
|
||||
list_for_each(tmp, &clocksource_list) {
|
||||
struct clocksource *cs;
|
||||
|
||||
cs = list_entry(tmp, struct clocksource, list);
|
||||
if (cs == c)
|
||||
return -EBUSY;
|
||||
list_for_each_entry(tmp, &clocksource_list, list)
|
||||
/* Keep track of the place, where to insert */
|
||||
if (cs->rating >= c->rating)
|
||||
entry = tmp;
|
||||
}
|
||||
list_add(&c->list, entry);
|
||||
|
||||
if (strlen(c->name) == strlen(override_name) &&
|
||||
!strcmp(c->name, override_name))
|
||||
clocksource_override = c;
|
||||
|
||||
return 0;
|
||||
if (tmp->rating >= cs->rating)
|
||||
entry = &tmp->list;
|
||||
list_add(&cs->list, entry);
|
||||
}
|
||||
|
||||
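clocksource_enqueue() keeps the list ordered by descending rating by remembering the last entry whose rating is still greater than or equal to the new one and inserting after it. A minimal array-based sketch of the same ordering rule, illustrative only:

#include <stdio.h>

/* Return the index after the last entry whose rating is >= new_rating,
 * mirroring how clocksource_enqueue() walks the list. */
static int insert_pos(const int *ratings, int n, int new_rating)
{
        int pos = 0;

        for (int i = 0; i < n; i++)
                if (ratings[i] >= new_rating)
                        pos = i + 1;
        return pos;
}

int main(void)
{
        int ratings[] = { 400, 300, 100 };

        printf("%d\n", insert_pos(ratings, 3, 250));    /* 2: lands between 300 and 100 */
        return 0;
}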
/**
|
||||
@@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c)
|
||||
*
|
||||
* Returns -EBUSY if registration fails, zero otherwise.
|
||||
*/
|
||||
int clocksource_register(struct clocksource *c)
|
||||
int clocksource_register(struct clocksource *cs)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
spin_lock_irqsave(&clocksource_lock, flags);
|
||||
ret = clocksource_enqueue(c);
|
||||
if (!ret)
|
||||
next_clocksource = select_clocksource();
|
||||
spin_unlock_irqrestore(&clocksource_lock, flags);
|
||||
if (!ret)
|
||||
clocksource_check_watchdog(c);
|
||||
return ret;
|
||||
mutex_lock(&clocksource_mutex);
|
||||
clocksource_enqueue(cs);
|
||||
clocksource_select();
|
||||
clocksource_enqueue_watchdog(cs);
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(clocksource_register);
|
||||
|
||||
/**
|
||||
* clocksource_change_rating - Change the rating of a registered clocksource
|
||||
*
|
||||
*/
|
||||
void clocksource_change_rating(struct clocksource *cs, int rating)
|
||||
static void __clocksource_change_rating(struct clocksource *cs, int rating)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&clocksource_lock, flags);
|
||||
list_del(&cs->list);
|
||||
cs->rating = rating;
|
||||
clocksource_enqueue(cs);
|
||||
next_clocksource = select_clocksource();
|
||||
spin_unlock_irqrestore(&clocksource_lock, flags);
|
||||
clocksource_select();
|
||||
}
|
||||
|
||||
/**
|
||||
* clocksource_change_rating - Change the rating of a registered clocksource
|
||||
*/
|
||||
void clocksource_change_rating(struct clocksource *cs, int rating)
|
||||
{
|
||||
mutex_lock(&clocksource_mutex);
|
||||
__clocksource_change_rating(cs, rating);
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(clocksource_change_rating);
|
||||
|
||||
/**
|
||||
* clocksource_unregister - remove a registered clocksource
|
||||
*/
|
||||
void clocksource_unregister(struct clocksource *cs)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&clocksource_lock, flags);
|
||||
mutex_lock(&clocksource_mutex);
|
||||
clocksource_dequeue_watchdog(cs);
|
||||
list_del(&cs->list);
|
||||
if (clocksource_override == cs)
|
||||
clocksource_override = NULL;
|
||||
next_clocksource = select_clocksource();
|
||||
spin_unlock_irqrestore(&clocksource_lock, flags);
|
||||
clocksource_select();
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(clocksource_unregister);
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
/**
|
||||
@@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
|
||||
{
|
||||
ssize_t count = 0;
|
||||
|
||||
spin_lock_irq(&clocksource_lock);
|
||||
mutex_lock(&clocksource_mutex);
|
||||
count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
|
||||
spin_unlock_irq(&clocksource_lock);
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
|
||||
return count;
|
||||
}
|
||||
@@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
|
||||
struct sysdev_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct clocksource *ovr = NULL;
|
||||
size_t ret = count;
|
||||
int len;
|
||||
|
||||
/* strings from sysfs write are not 0 terminated! */
|
||||
if (count >= sizeof(override_name))
|
||||
@@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
|
||||
if (buf[count-1] == '\n')
|
||||
count--;
|
||||
|
||||
spin_lock_irq(&clocksource_lock);
|
||||
mutex_lock(&clocksource_mutex);
|
||||
|
||||
if (count > 0)
|
||||
memcpy(override_name, buf, count);
|
||||
override_name[count] = 0;
|
||||
clocksource_select();
|
||||
|
||||
len = strlen(override_name);
|
||||
if (len) {
|
||||
struct clocksource *cs;
|
||||
|
||||
ovr = clocksource_override;
|
||||
/* try to select it: */
|
||||
list_for_each_entry(cs, &clocksource_list, list) {
|
||||
if (strlen(cs->name) == len &&
|
||||
!strcmp(cs->name, override_name))
|
||||
ovr = cs;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to make sure we don't switch to a non-highres capable
|
||||
* clocksource if the tick code is in oneshot mode (highres or nohz)
|
||||
*/
|
||||
if (tick_oneshot_mode_active() && ovr &&
|
||||
!(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
|
||||
printk(KERN_WARNING "%s clocksource is not HRT compatible. "
|
||||
"Cannot switch while in HRT/NOHZ mode\n", ovr->name);
|
||||
ovr = NULL;
|
||||
override_name[0] = 0;
|
||||
}
|
||||
|
||||
/* Reselect, when the override name has changed */
|
||||
if (ovr != clocksource_override) {
|
||||
clocksource_override = ovr;
|
||||
next_clocksource = select_clocksource();
|
||||
}
|
||||
|
||||
spin_unlock_irq(&clocksource_lock);
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
|
||||
struct clocksource *src;
|
||||
ssize_t count = 0;
|
||||
|
||||
spin_lock_irq(&clocksource_lock);
|
||||
mutex_lock(&clocksource_mutex);
|
||||
list_for_each_entry(src, &clocksource_list, list) {
|
||||
/*
|
||||
* Don't show non-HRES clocksource if the tick code is
|
||||
@@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
|
||||
max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
|
||||
"%s ", src->name);
|
||||
}
|
||||
spin_unlock_irq(&clocksource_lock);
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
|
||||
count += snprintf(buf + count,
|
||||
max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
|
||||
@@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs);
|
||||
*/
|
||||
static int __init boot_override_clocksource(char* str)
|
||||
{
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&clocksource_lock, flags);
|
||||
mutex_lock(&clocksource_mutex);
|
||||
if (str)
|
||||
strlcpy(override_name, str, sizeof(override_name));
|
||||
spin_unlock_irqrestore(&clocksource_lock, flags);
|
||||
mutex_unlock(&clocksource_mutex);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
|
||||
.read = jiffies_read,
|
||||
.mask = 0xffffffff, /*32bits*/
|
||||
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
|
||||
.mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
|
||||
.shift = JIFFIES_SHIFT,
|
||||
};
|
||||
|
||||
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
|
||||
}
|
||||
|
||||
core_initcall(init_jiffies_clocksource);
|
||||
|
||||
struct clocksource * __init __weak clocksource_default_clock(void)
|
||||
{
|
||||
return &clocksource_jiffies;
|
||||
}
|
||||
|
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
|
||||
case TIME_OK:
|
||||
break;
|
||||
case TIME_INS:
|
||||
xtime.tv_sec--;
|
||||
wall_to_monotonic.tv_sec++;
|
||||
timekeeping_leap_insert(-1);
|
||||
time_state = TIME_OOP;
|
||||
printk(KERN_NOTICE
|
||||
"Clock: inserting leap second 23:59:60 UTC\n");
|
||||
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
|
||||
res = HRTIMER_RESTART;
|
||||
break;
|
||||
case TIME_DEL:
|
||||
xtime.tv_sec++;
|
||||
timekeeping_leap_insert(1);
|
||||
time_tai--;
|
||||
wall_to_monotonic.tv_sec--;
|
||||
time_state = TIME_WAIT;
|
||||
printk(KERN_NOTICE
|
||||
"Clock: deleting leap second 23:59:59 UTC\n");
|
||||
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
|
||||
time_state = TIME_OK;
|
||||
break;
|
||||
}
|
||||
update_vsyscall(&xtime, clock);
|
||||
|
||||
write_sequnlock(&xtime_lock);
|
||||
|
||||
|
kernel/time/timeconv.c (new file, 127 lines)
@@ -0,0 +1,127 @@
|
||||
/*
|
||||
* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
|
||||
* This file is part of the GNU C Library.
|
||||
* Contributed by Paul Eggert (eggert@twinsun.com).
|
||||
*
|
||||
* The GNU C Library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Library General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* The GNU C Library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Library General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Library General Public
|
||||
* License along with the GNU C Library; see the file COPYING.LIB. If not,
|
||||
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Converts the calendar time to broken-down time representation
|
||||
* Based on code from glibc-2.6
|
||||
*
|
||||
* 2009-7-14:
|
||||
* Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
|
||||
*/
|
||||
|
||||
#include <linux/time.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/*
|
||||
* Nonzero if YEAR is a leap year (every 4 years,
|
||||
* except every 100th isn't, and every 400th is).
|
||||
*/
|
||||
static int __isleap(long year)
|
||||
{
|
||||
return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
|
||||
}
|
||||
|
||||
/* do a mathdiv for long type */
|
||||
static long math_div(long a, long b)
|
||||
{
|
||||
return a / b - (a % b < 0);
|
||||
}
|
||||
|
||||
/* How many leap years between y1 and y2, y1 must less or equal to y2 */
|
||||
static long leaps_between(long y1, long y2)
|
||||
{
|
||||
long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
|
||||
+ math_div(y1 - 1, 400);
|
||||
long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
|
||||
+ math_div(y2 - 1, 400);
|
||||
return leaps2 - leaps1;
|
||||
}
|
||||
|
||||
/* How many days come before each month (0-12). */
|
||||
static const unsigned short __mon_yday[2][13] = {
|
||||
/* Normal years. */
|
||||
{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
|
||||
/* Leap years. */
|
||||
{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
|
||||
};
|
||||
|
||||
#define SECS_PER_HOUR (60 * 60)
|
||||
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
|
||||
|
||||
/**
|
||||
* time_to_tm - converts the calendar time to local broken-down time
|
||||
*
|
||||
* @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
|
||||
* Coordinated Universal Time (UTC).
|
||||
* @offset offset seconds adding to totalsecs.
|
||||
* @result pointer to struct tm variable to receive broken-down time
|
||||
*/
|
||||
void time_to_tm(time_t totalsecs, int offset, struct tm *result)
|
||||
{
|
||||
long days, rem, y;
|
||||
const unsigned short *ip;
|
||||
|
||||
days = totalsecs / SECS_PER_DAY;
|
||||
rem = totalsecs % SECS_PER_DAY;
|
||||
rem += offset;
|
||||
while (rem < 0) {
|
||||
rem += SECS_PER_DAY;
|
||||
--days;
|
||||
}
|
||||
while (rem >= SECS_PER_DAY) {
|
||||
rem -= SECS_PER_DAY;
|
||||
++days;
|
||||
}
|
||||
|
||||
result->tm_hour = rem / SECS_PER_HOUR;
|
||||
rem %= SECS_PER_HOUR;
|
||||
result->tm_min = rem / 60;
|
||||
result->tm_sec = rem % 60;
|
||||
|
||||
/* January 1, 1970 was a Thursday. */
|
||||
result->tm_wday = (4 + days) % 7;
|
||||
if (result->tm_wday < 0)
|
||||
result->tm_wday += 7;
|
||||
|
||||
y = 1970;
|
||||
|
||||
while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
|
||||
/* Guess a corrected year, assuming 365 days per year. */
|
||||
long yg = y + math_div(days, 365);
|
||||
|
||||
/* Adjust DAYS and Y to match the guessed year. */
|
||||
days -= (yg - y) * 365 + leaps_between(y, yg);
|
||||
y = yg;
|
||||
}
|
||||
|
||||
result->tm_year = y - 1900;
|
||||
|
||||
result->tm_yday = days;
|
||||
|
||||
ip = __mon_yday[__isleap(y)];
|
||||
for (y = 11; days < ip[y]; y--)
|
||||
continue;
|
||||
days -= ip[y];
|
||||
|
||||
result->tm_mon = y;
|
||||
result->tm_mday = days + 1;
|
||||
}
|
||||
EXPORT_SYMBOL(time_to_tm);
|
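A quick illustration of how the new helper is meant to be called from kernel code, assuming the declaration is picked up via linux/time.h; the expected results follow directly from the algorithm above: totalsecs 0 is Thursday 1970-01-01 00:00:00 UTC (tm_wday == 4), and 1234567890 is Friday 2009-02-13 23:31:30 UTC.

#include <linux/kernel.h>
#include <linux/time.h>

/* Illustrative use only: print a timestamp as UTC broken-down time. */
static void print_utc(time_t stamp)
{
        struct tm tm;

        time_to_tm(stamp, 0, &tm);
        pr_info("%04ld-%02d-%02d %02d:%02d:%02d UTC\n",
                tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
                tm.tm_hour, tm.tm_min, tm.tm_sec);
}

/* print_utc(0)          -> 1970-01-01 00:00:00 UTC
 * print_utc(1234567890) -> 2009-02-13 23:31:30 UTC */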
@@ -18,7 +18,117 @@
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/stop_machine.h>
|
||||
|
||||
/* Structure holding internal timekeeping values. */
|
||||
struct timekeeper {
|
||||
/* Current clocksource used for timekeeping. */
|
||||
struct clocksource *clock;
|
||||
/* The shift value of the current clocksource. */
|
||||
int shift;
|
||||
|
||||
/* Number of clock cycles in one NTP interval. */
|
||||
cycle_t cycle_interval;
|
||||
/* Number of clock shifted nano seconds in one NTP interval. */
|
||||
u64 xtime_interval;
|
||||
/* Raw nano seconds accumulated per NTP interval. */
|
||||
u32 raw_interval;
|
||||
|
||||
/* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
|
||||
u64 xtime_nsec;
|
||||
/* Difference between accumulated time and NTP time in ntp
|
||||
* shifted nano seconds. */
|
||||
s64 ntp_error;
|
||||
/* Shift conversion between clock shifted nano seconds and
|
||||
* ntp shifted nano seconds. */
|
||||
int ntp_error_shift;
|
||||
/* NTP adjusted clock multiplier */
|
||||
u32 mult;
|
||||
};
|
||||
|
||||
struct timekeeper timekeeper;
|
||||
|
||||
/**
|
||||
* timekeeper_setup_internals - Set up internals to use clocksource clock.
|
||||
*
|
||||
* @clock: Pointer to clocksource.
|
||||
*
|
||||
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
|
||||
* pair and interval request.
|
||||
*
|
||||
* Unless you're the timekeeping code, you should not be using this!
|
||||
*/
|
||||
static void timekeeper_setup_internals(struct clocksource *clock)
|
||||
{
|
||||
cycle_t interval;
|
||||
u64 tmp;
|
||||
|
||||
timekeeper.clock = clock;
|
||||
clock->cycle_last = clock->read(clock);
|
||||
|
||||
/* Do the ns -> cycle conversion first, using original mult */
|
||||
tmp = NTP_INTERVAL_LENGTH;
|
||||
tmp <<= clock->shift;
|
||||
tmp += clock->mult/2;
|
||||
do_div(tmp, clock->mult);
|
||||
if (tmp == 0)
|
||||
tmp = 1;
|
||||
|
||||
interval = (cycle_t) tmp;
|
||||
timekeeper.cycle_interval = interval;
|
||||
|
||||
/* Go back from cycles -> shifted ns */
|
||||
timekeeper.xtime_interval = (u64) interval * clock->mult;
|
||||
timekeeper.raw_interval =
|
||||
((u64) interval * clock->mult) >> clock->shift;
|
||||
|
||||
timekeeper.xtime_nsec = 0;
|
||||
timekeeper.shift = clock->shift;
|
||||
|
||||
timekeeper.ntp_error = 0;
|
||||
timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
|
||||
|
||||
/*
|
||||
* The timekeeper keeps its own mult values for the currently
|
||||
* active clocksource. These value will be adjusted via NTP
|
||||
* to counteract clock drifting.
|
||||
*/
|
||||
timekeeper.mult = clock->mult;
|
||||
}
|
||||
|
||||
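The interval math in timekeeper_setup_internals() is a rounded division, cycle_interval = ((NTP_INTERVAL_LENGTH << shift) + mult/2) / mult, with xtime_interval and raw_interval going back the other way. A standalone sketch with made-up clocksource numbers (a 1 MHz clock with shift 20 and a 10 ms NTP interval; the values are illustrative, not from any real clocksource) shows the round trip:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical 1 MHz clocksource: ns = (cycles * mult) >> shift,
         * so mult = 1000 << 20 and shift = 20 give 1000 ns per cycle. */
        uint32_t shift = 20;
        uint32_t mult  = 1000u << 20;
        uint64_t ntp_interval_length = 10000000;        /* 10 ms, e.g. HZ=100 */

        /* ns -> cycles, rounded, exactly as timekeeper_setup_internals() does */
        uint64_t tmp = (ntp_interval_length << shift) + mult / 2;
        uint64_t cycle_interval = tmp / mult;

        /* cycles -> shifted ns and raw ns */
        uint64_t xtime_interval = cycle_interval * mult;
        uint64_t raw_interval   = xtime_interval >> shift;

        printf("cycle_interval = %llu cycles\n", (unsigned long long)cycle_interval); /* 10000    */
        printf("raw_interval   = %llu ns\n", (unsigned long long)raw_interval);       /* 10000000 */
        return 0;
}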
/* Timekeeper helper functions. */
|
||||
static inline s64 timekeeping_get_ns(void)
|
||||
{
|
||||
cycle_t cycle_now, cycle_delta;
|
||||
struct clocksource *clock;
|
||||
|
||||
/* read clocksource: */
|
||||
clock = timekeeper.clock;
|
||||
cycle_now = clock->read(clock);
|
||||
|
||||
/* calculate the delta since the last update_wall_time: */
|
||||
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
|
||||
|
||||
/* return delta convert to nanoseconds using ntp adjusted mult. */
|
||||
return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
|
||||
timekeeper.shift);
|
||||
}
|
||||
|
||||
static inline s64 timekeeping_get_ns_raw(void)
|
||||
{
|
||||
cycle_t cycle_now, cycle_delta;
|
||||
struct clocksource *clock;
|
||||
|
||||
/* read clocksource: */
|
||||
clock = timekeeper.clock;
|
||||
cycle_now = clock->read(clock);
|
||||
|
||||
/* calculate the delta since the last update_wall_time: */
|
||||
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
|
||||
|
||||
/* return delta convert to nanoseconds using ntp adjusted mult. */
|
||||
return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
|
||||
}
|
||||
|
||||
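Both helpers above reduce to the same fixed-point conversion, ns = (cycle_delta * mult) >> shift; the only difference is whether the NTP-adjusted timekeeper.mult or the clocksource's own unadjusted mult is used. A short illustration with hypothetical numbers:

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as clocksource_cyc2ns(): fixed-point multiply, then shift. */
static inline uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* Hypothetical 2.5 GHz TSC-like source: 0.4 ns/cycle, mult ~= 0.4 * 2^22 */
        uint32_t shift = 22, mult = 1677722;

        printf("%llu ns\n", (unsigned long long)cyc2ns(2500000, mult, shift)); /* ~1000000 ns */
        return 0;
}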
/*
|
||||
* This read-write spinlock protects us from races in SMP while
|
||||
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
|
||||
*/
|
||||
struct timespec xtime __attribute__ ((aligned (16)));
|
||||
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
|
||||
static unsigned long total_sleep_time; /* seconds */
|
||||
static struct timespec total_sleep_time;
|
||||
|
||||
/*
|
||||
* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
|
||||
*/
|
||||
struct timespec raw_time;
|
||||
|
||||
/* flag for if timekeeping is suspended */
|
||||
int __read_mostly timekeeping_suspended;
|
||||
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
|
||||
timespec_add_ns(&xtime_cache, nsec);
|
||||
}
|
||||
|
||||
struct clocksource *clock;
|
||||
|
||||
/* must hold xtime_lock */
|
||||
void timekeeping_leap_insert(int leapsecond)
|
||||
{
|
||||
xtime.tv_sec += leapsecond;
|
||||
wall_to_monotonic.tv_sec -= leapsecond;
|
||||
update_vsyscall(&xtime, timekeeper.clock);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_GENERIC_TIME
|
||||
|
||||
/**
|
||||
* clocksource_forward_now - update clock to the current time
|
||||
* timekeeping_forward_now - update clock to the current time
|
||||
*
|
||||
* Forward the current clock to update its state since the last call to
|
||||
* update_wall_time(). This is useful before significant clock changes,
|
||||
* as it avoids having to deal with this time offset explicitly.
|
||||
*/
|
||||
static void clocksource_forward_now(void)
|
||||
static void timekeeping_forward_now(void)
|
||||
{
|
||||
cycle_t cycle_now, cycle_delta;
|
||||
struct clocksource *clock;
|
||||
s64 nsec;
|
||||
|
||||
cycle_now = clocksource_read(clock);
|
||||
clock = timekeeper.clock;
|
||||
cycle_now = clock->read(clock);
|
||||
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
|
||||
clock->cycle_last = cycle_now;
|
||||
|
||||
nsec = cyc2ns(clock, cycle_delta);
|
||||
nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
|
||||
timekeeper.shift);
|
||||
|
||||
/* If arch requires, add in gettimeoffset() */
|
||||
nsec += arch_gettimeoffset();
|
||||
|
||||
timespec_add_ns(&xtime, nsec);
|
||||
|
||||
nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
|
||||
clock->raw_time.tv_nsec += nsec;
|
||||
nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
|
||||
timespec_add_ns(&raw_time, nsec);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
|
||||
*/
|
||||
void getnstimeofday(struct timespec *ts)
|
||||
{
|
||||
cycle_t cycle_now, cycle_delta;
|
||||
unsigned long seq;
|
||||
s64 nsecs;
|
||||
|
||||
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
|
||||
*ts = xtime;
|
||||
|
||||
/* read clocksource: */
|
||||
cycle_now = clocksource_read(clock);
|
||||
|
||||
/* calculate the delta since the last update_wall_time: */
|
||||
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
|
||||
|
||||
/* convert to nanoseconds: */
|
||||
nsecs = cyc2ns(clock, cycle_delta);
|
||||
nsecs = timekeeping_get_ns();
|
||||
|
||||
/* If arch requires, add in gettimeoffset() */
|
||||
nsecs += arch_gettimeoffset();
|
||||
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
|
||||
|
||||
EXPORT_SYMBOL(getnstimeofday);
|
||||
|
||||
ktime_t ktime_get(void)
|
||||
{
|
||||
unsigned int seq;
|
||||
s64 secs, nsecs;
|
||||
|
||||
WARN_ON(timekeeping_suspended);
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
|
||||
nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
|
||||
nsecs += timekeeping_get_ns();
|
||||
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
/*
|
||||
* Use ktime_set/ktime_add_ns to create a proper ktime on
|
||||
* 32-bit architectures without CONFIG_KTIME_SCALAR.
|
||||
*/
|
||||
return ktime_add_ns(ktime_set(secs, 0), nsecs);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get);
|
||||
|
||||
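ktime_get() and the other readers above all follow the same lockless pattern: sample the sequence counter, read the shared values, and retry if a writer ran in between. A simplified userspace sketch of that retry loop follows; it omits the memory-barrier and writer-side details the kernel seqlock provides, so it is an illustration of the shape, not a replacement.

#include <stdio.h>
#include <stdatomic.h>

/* Simplified seqlock-style reader (illustrative only). */
struct seq_time {
        atomic_uint seq;        /* even: stable, odd: writer in progress */
        long long   sec, nsec;
};

static void read_time(struct seq_time *t, long long *sec, long long *nsec)
{
        unsigned int start;

        do {
                start = atomic_load(&t->seq);   /* like read_seqbegin() */
                *sec  = t->sec;
                *nsec = t->nsec;
        } while ((start & 1) || atomic_load(&t->seq) != start); /* like read_seqretry() */
}

int main(void)
{
        struct seq_time t = { 0, 1254123456, 500000000 };
        long long sec, nsec;

        read_time(&t, &sec, &nsec);
        printf("%lld.%09lld\n", sec, nsec);
        return 0;
}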
/**
|
||||
* ktime_get_ts - get the monotonic clock in timespec format
|
||||
* @ts: pointer to timespec variable
|
||||
*
|
||||
* The function calculates the monotonic clock from the realtime
|
||||
* clock and the wall_to_monotonic offset and stores the result
|
||||
* in normalized timespec format in the variable pointed to by @ts.
|
||||
*/
|
||||
void ktime_get_ts(struct timespec *ts)
|
||||
{
|
||||
struct timespec tomono;
|
||||
unsigned int seq;
|
||||
s64 nsecs;
|
||||
|
||||
WARN_ON(timekeeping_suspended);
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
*ts = xtime;
|
||||
tomono = wall_to_monotonic;
|
||||
nsecs = timekeeping_get_ns();
|
||||
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
|
||||
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
|
||||
ts->tv_nsec + tomono.tv_nsec + nsecs);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get_ts);
|
||||
|
||||
/**
|
||||
* do_gettimeofday - Returns the time of day in a timeval
|
||||
* @tv: pointer to the timeval to be set
|
||||
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
|
||||
|
||||
write_seqlock_irqsave(&xtime_lock, flags);
|
||||
|
||||
clocksource_forward_now();
|
||||
timekeeping_forward_now();
|
||||
|
||||
ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
|
||||
ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
|
||||
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
|
||||
|
||||
update_xtime_cache(0);
|
||||
|
||||
clock->error = 0;
|
||||
timekeeper.ntp_error = 0;
|
||||
ntp_clear();
|
||||
|
||||
update_vsyscall(&xtime, clock);
|
||||
update_vsyscall(&xtime, timekeeper.clock);
|
||||
|
||||
write_sequnlock_irqrestore(&xtime_lock, flags);
|
||||
|
||||
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
|
||||
*
|
||||
* Accumulates current time interval and initializes new clocksource
|
||||
*/
|
||||
static void change_clocksource(void)
|
||||
static int change_clocksource(void *data)
|
||||
{
|
||||
struct clocksource *new, *old;
|
||||
|
||||
new = clocksource_get_next();
|
||||
new = (struct clocksource *) data;
|
||||
|
||||
if (clock == new)
|
||||
return;
|
||||
|
||||
clocksource_forward_now();
|
||||
|
||||
if (clocksource_enable(new))
|
||||
return;
|
||||
|
||||
new->raw_time = clock->raw_time;
|
||||
old = clock;
|
||||
clock = new;
|
||||
clocksource_disable(old);
|
||||
|
||||
clock->cycle_last = 0;
|
||||
clock->cycle_last = clocksource_read(clock);
|
||||
clock->error = 0;
|
||||
clock->xtime_nsec = 0;
|
||||
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
|
||||
|
||||
tick_clock_notify();
|
||||
|
||||
/*
|
||||
* We're holding xtime lock and waking up klogd would deadlock
|
||||
* us on enqueue. So no printing!
|
||||
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
|
||||
clock->name);
|
||||
*/
|
||||
timekeeping_forward_now();
|
||||
if (!new->enable || new->enable(new) == 0) {
|
||||
old = timekeeper.clock;
|
||||
timekeeper_setup_internals(new);
|
||||
if (old->disable)
|
||||
old->disable(old);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline void clocksource_forward_now(void) { }
|
||||
static inline void change_clocksource(void) { }
|
||||
#endif
|
||||
|
||||
/**
|
||||
* timekeeping_notify - Install a new clock source
|
||||
* @clock: pointer to the clock source
|
||||
*
|
||||
* This function is called from clocksource.c after a new, better clock
|
||||
* source has been registered. The caller holds the clocksource_mutex.
|
||||
*/
|
||||
void timekeeping_notify(struct clocksource *clock)
|
||||
{
|
||||
if (timekeeper.clock == clock)
|
||||
return;
|
||||
stop_machine(change_clocksource, clock, NULL);
|
||||
tick_clock_notify();
|
||||
}
|
||||
|
||||
#else /* GENERIC_TIME */
|
||||
|
||||
static inline void timekeeping_forward_now(void) { }
|
||||
|
||||
/**
|
||||
* ktime_get - get the monotonic time in ktime_t format
|
||||
*
|
||||
* returns the time in ktime_t format
|
||||
*/
|
||||
ktime_t ktime_get(void)
|
||||
{
|
||||
struct timespec now;
|
||||
|
||||
ktime_get_ts(&now);
|
||||
|
||||
return timespec_to_ktime(now);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get);
|
||||
|
||||
/**
|
||||
* ktime_get_ts - get the monotonic clock in timespec format
|
||||
* @ts: pointer to timespec variable
|
||||
*
|
||||
* The function calculates the monotonic clock from the realtime
|
||||
* clock and the wall_to_monotonic offset and stores the result
|
||||
* in normalized timespec format in the variable pointed to by @ts.
|
||||
*/
|
||||
void ktime_get_ts(struct timespec *ts)
|
||||
{
|
||||
struct timespec tomono;
|
||||
unsigned long seq;
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
getnstimeofday(ts);
|
||||
tomono = wall_to_monotonic;
|
||||
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
|
||||
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
|
||||
ts->tv_nsec + tomono.tv_nsec);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get_ts);
|
||||
|
||||
#endif /* !GENERIC_TIME */
|
||||
|
||||
/**
|
||||
* ktime_get_real - get the real (wall-) time in ktime_t format
|
||||
*
|
||||
* returns the time in ktime_t format
|
||||
*/
|
||||
ktime_t ktime_get_real(void)
|
||||
{
|
||||
struct timespec now;
|
||||
|
||||
getnstimeofday(&now);
|
||||
|
||||
return timespec_to_ktime(now);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ktime_get_real);
|
||||
|
||||
/**
|
||||
* getrawmonotonic - Returns the raw monotonic time in a timespec
|
||||
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
|
||||
{
|
||||
unsigned long seq;
|
||||
s64 nsecs;
|
||||
cycle_t cycle_now, cycle_delta;
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
|
||||
/* read clocksource: */
|
||||
cycle_now = clocksource_read(clock);
|
||||
|
||||
/* calculate the delta since the last update_wall_time: */
|
||||
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
|
||||
|
||||
/* convert to nanoseconds: */
|
||||
nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
|
||||
|
||||
*ts = clock->raw_time;
|
||||
nsecs = timekeeping_get_ns_raw();
|
||||
*ts = raw_time;
|
||||
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
|
||||
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
|
||||
ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
|
||||
ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
|
||||
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
|
||||
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
|
||||
}
|
||||
|
||||
/**
|
||||
* read_persistent_clock - Return time in seconds from the persistent clock.
|
||||
* read_persistent_clock - Return time from the persistent clock.
|
||||
*
|
||||
* Weak dummy function for arches that do not yet support it.
|
||||
* Returns seconds from epoch using the battery backed persistent clock.
|
||||
* Returns zero if unsupported.
|
||||
* Reads the time from the battery backed persistent clock.
|
||||
* Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
|
||||
*
|
||||
* XXX - Do be sure to remove it once all arches implement it.
|
||||
*/
|
||||
unsigned long __attribute__((weak)) read_persistent_clock(void)
|
||||
void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
|
||||
{
|
||||
return 0;
|
||||
ts->tv_sec = 0;
|
||||
ts->tv_nsec = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* read_boot_clock - Return time of the system start.
|
||||
*
|
||||
* Weak dummy function for arches that do not yet support it.
|
||||
* Function to read the exact time the system has been started.
|
||||
* Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
|
||||
*
|
||||
* XXX - Do be sure to remove it once all arches implement it.
|
||||
*/
|
||||
void __attribute__((weak)) read_boot_clock(struct timespec *ts)
|
||||
{
|
||||
ts->tv_sec = 0;
|
||||
ts->tv_nsec = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
|
||||
*/
|
||||
void __init timekeeping_init(void)
|
||||
{
|
||||
struct clocksource *clock;
|
||||
unsigned long flags;
|
||||
unsigned long sec = read_persistent_clock();
|
||||
struct timespec now, boot;
|
||||
|
||||
read_persistent_clock(&now);
|
||||
read_boot_clock(&boot);
|
||||
|
||||
write_seqlock_irqsave(&xtime_lock, flags);
|
||||
|
||||
ntp_init();
|
||||
|
||||
clock = clocksource_get_next();
|
||||
clocksource_enable(clock);
|
||||
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
|
||||
clock->cycle_last = clocksource_read(clock);
|
||||
clock = clocksource_default_clock();
|
||||
if (clock->enable)
|
||||
clock->enable(clock);
|
||||
timekeeper_setup_internals(clock);
|
||||
|
||||
xtime.tv_sec = sec;
|
||||
xtime.tv_nsec = 0;
|
||||
xtime.tv_sec = now.tv_sec;
|
||||
xtime.tv_nsec = now.tv_nsec;
|
||||
raw_time.tv_sec = 0;
|
||||
raw_time.tv_nsec = 0;
|
||||
if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
|
||||
boot.tv_sec = xtime.tv_sec;
|
||||
boot.tv_nsec = xtime.tv_nsec;
|
||||
}
|
||||
set_normalized_timespec(&wall_to_monotonic,
|
||||
-xtime.tv_sec, -xtime.tv_nsec);
|
||||
-boot.tv_sec, -boot.tv_nsec);
|
||||
update_xtime_cache(0);
|
||||
total_sleep_time = 0;
|
||||
total_sleep_time.tv_sec = 0;
|
||||
total_sleep_time.tv_nsec = 0;
|
||||
write_sequnlock_irqrestore(&xtime_lock, flags);
|
||||
}
|
||||
|
||||
/* time in seconds when suspend began */
|
||||
static unsigned long timekeeping_suspend_time;
|
||||
static struct timespec timekeeping_suspend_time;
|
||||
|
||||
/**
|
||||
* timekeeping_resume - Resumes the generic timekeeping subsystem.
|
||||
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
|
||||
static int timekeeping_resume(struct sys_device *dev)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned long now = read_persistent_clock();
|
||||
struct timespec ts;
|
||||
|
||||
read_persistent_clock(&ts);
|
||||
|
||||
clocksource_resume();
|
||||
|
||||
write_seqlock_irqsave(&xtime_lock, flags);
|
||||
|
||||
if (now && (now > timekeeping_suspend_time)) {
|
||||
unsigned long sleep_length = now - timekeeping_suspend_time;
|
||||
|
||||
xtime.tv_sec += sleep_length;
|
||||
wall_to_monotonic.tv_sec -= sleep_length;
|
||||
total_sleep_time += sleep_length;
|
||||
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
|
||||
ts = timespec_sub(ts, timekeeping_suspend_time);
|
||||
xtime = timespec_add_safe(xtime, ts);
|
||||
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
|
||||
total_sleep_time = timespec_add_safe(total_sleep_time, ts);
|
||||
}
|
||||
update_xtime_cache(0);
|
||||
/* re-base the last cycle value */
|
||||
clock->cycle_last = 0;
|
||||
clock->cycle_last = clocksource_read(clock);
|
||||
clock->error = 0;
|
||||
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
|
||||
timekeeper.ntp_error = 0;
|
||||
timekeeping_suspended = 0;
|
||||
write_sequnlock_irqrestore(&xtime_lock, flags);
|
||||
|
||||
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
timekeeping_suspend_time = read_persistent_clock();
|
||||
read_persistent_clock(&timekeeping_suspend_time);
|
||||
|
||||
write_seqlock_irqsave(&xtime_lock, flags);
|
||||
clocksource_forward_now();
|
||||
timekeeping_forward_now();
|
||||
timekeeping_suspended = 1;
|
||||
write_sequnlock_irqrestore(&xtime_lock, flags);
|
||||
|
||||
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
|
||||
* If the error is already larger, we look ahead even further
|
||||
* to compensate for late or lost adjustments.
|
||||
*/
|
||||
static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
|
||||
static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
|
||||
s64 *offset)
|
||||
{
|
||||
s64 tick_error, i;
|
||||
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
|
||||
* here. This is tuned so that an error of about 1 msec is adjusted
|
||||
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
|
||||
*/
|
||||
error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
|
||||
error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
|
||||
error2 = abs(error2);
|
||||
for (look_ahead = 0; error2 > 0; look_ahead++)
|
||||
error2 >>= 2;
|
||||
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
|
||||
* Now calculate the error in (1 << look_ahead) ticks, but first
|
||||
* remove the single look ahead already included in the error.
|
||||
*/
|
||||
tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
|
||||
tick_error -= clock->xtime_interval >> 1;
|
||||
tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
|
||||
tick_error -= timekeeper.xtime_interval >> 1;
|
||||
error = ((error - tick_error) >> look_ahead) + tick_error;
|
||||
|
||||
/* Finally calculate the adjustment shift value. */
|
||||
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
|
||||
* this is optimized for the most common adjustments of -1,0,1,
|
||||
* for other values we can do a bit more work.
|
||||
*/
|
||||
static void clocksource_adjust(s64 offset)
|
||||
static void timekeeping_adjust(s64 offset)
|
||||
{
|
||||
s64 error, interval = clock->cycle_interval;
|
||||
s64 error, interval = timekeeper.cycle_interval;
|
||||
int adj;
|
||||
|
||||
error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
|
||||
error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
|
||||
if (error > interval) {
|
||||
error >>= 2;
|
||||
if (likely(error <= interval))
|
||||
adj = 1;
|
||||
else
|
||||
adj = clocksource_bigadjust(error, &interval, &offset);
|
||||
adj = timekeeping_bigadjust(error, &interval, &offset);
|
||||
} else if (error < -interval) {
|
||||
error >>= 2;
|
||||
if (likely(error >= -interval)) {
|
||||
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
|
||||
interval = -interval;
|
||||
offset = -offset;
|
||||
} else
|
||||
adj = clocksource_bigadjust(error, &interval, &offset);
|
||||
adj = timekeeping_bigadjust(error, &interval, &offset);
|
||||
} else
|
||||
return;
|
||||
|
||||
clock->mult += adj;
|
||||
clock->xtime_interval += interval;
|
||||
clock->xtime_nsec -= offset;
|
||||
clock->error -= (interval - offset) <<
|
||||
(NTP_SCALE_SHIFT - clock->shift);
|
||||
timekeeper.mult += adj;
|
||||
timekeeper.xtime_interval += interval;
|
||||
timekeeper.xtime_nsec -= offset;
|
||||
timekeeper.ntp_error -= (interval - offset) <<
|
||||
timekeeper.ntp_error_shift;
|
||||
}
|
||||
|
||||
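The granularity of timekeeping_adjust() is easy to quantify: changing timekeeper.mult by one step changes each accumulation interval by cycle_interval >> shift nanoseconds. A rough standalone illustration using the same hypothetical 1 MHz numbers as earlier (ignoring bigadjust and the ntp_error bookkeeping):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical values: 10000 cycles per 10 ms interval, shift 20. */
        uint64_t cycle_interval = 10000;
        uint32_t shift = 20;

        /* One mult step changes each interval by cycle_interval / 2^shift ns. */
        double ns_per_step = (double)cycle_interval / (1u << shift);
        double ppm = ns_per_step / 10000000.0 * 1e6;

        /* ~0.0095 ns per 10 ms interval, i.e. ~0.00095 ppm per step */
        printf("one mult step ~= %.4f ns per interval (~%.5f ppm)\n", ns_per_step, ppm);
        return 0;
}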
/**
|
||||
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
|
||||
*/
|
||||
void update_wall_time(void)
|
||||
{
|
||||
struct clocksource *clock;
|
||||
cycle_t offset;
|
||||
u64 nsecs;
|
||||
|
||||
/* Make sure we're fully resumed: */
|
||||
if (unlikely(timekeeping_suspended))
|
||||
return;
|
||||
|
||||
clock = timekeeper.clock;
|
||||
#ifdef CONFIG_GENERIC_TIME
|
||||
offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
|
||||
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
|
||||
#else
|
||||
offset = clock->cycle_interval;
|
||||
offset = timekeeper.cycle_interval;
|
||||
#endif
|
||||
clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
|
||||
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
|
||||
|
||||
/* normally this loop will run just once, however in the
|
||||
* case of lost or late ticks, it will accumulate correctly.
|
||||
*/
|
||||
while (offset >= clock->cycle_interval) {
|
||||
/* accumulate one interval */
|
||||
offset -= clock->cycle_interval;
|
||||
clock->cycle_last += clock->cycle_interval;
|
||||
while (offset >= timekeeper.cycle_interval) {
|
||||
u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
|
||||
|
||||
clock->xtime_nsec += clock->xtime_interval;
|
||||
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
|
||||
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
|
||||
/* accumulate one interval */
|
||||
offset -= timekeeper.cycle_interval;
|
||||
clock->cycle_last += timekeeper.cycle_interval;
|
||||
|
||||
timekeeper.xtime_nsec += timekeeper.xtime_interval;
|
||||
if (timekeeper.xtime_nsec >= nsecps) {
|
||||
timekeeper.xtime_nsec -= nsecps;
|
||||
xtime.tv_sec++;
|
||||
second_overflow();
|
||||
}
|
||||
|
||||
clock->raw_time.tv_nsec += clock->raw_interval;
|
||||
if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
|
||||
clock->raw_time.tv_nsec -= NSEC_PER_SEC;
|
||||
clock->raw_time.tv_sec++;
|
||||
raw_time.tv_nsec += timekeeper.raw_interval;
|
||||
if (raw_time.tv_nsec >= NSEC_PER_SEC) {
|
||||
raw_time.tv_nsec -= NSEC_PER_SEC;
|
||||
raw_time.tv_sec++;
|
||||
}
|
||||
|
||||
/* accumulate error between NTP and clock interval */
|
||||
clock->error += tick_length;
|
||||
clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
|
||||
timekeeper.ntp_error += tick_length;
|
||||
timekeeper.ntp_error -= timekeeper.xtime_interval <<
|
||||
timekeeper.ntp_error_shift;
|
||||
}
|
||||
|
||||
/* correct the clock when NTP error is too big */
|
||||
clocksource_adjust(offset);
|
||||
timekeeping_adjust(offset);
|
||||
|
||||
/*
|
||||
* Since in the loop above, we accumulate any amount of time
|
||||
* in xtime_nsec over a second into xtime.tv_sec, its possible for
|
||||
* xtime_nsec to be fairly small after the loop. Further, if we're
|
||||
* slightly speeding the clocksource up in clocksource_adjust(),
|
||||
* slightly speeding the clocksource up in timekeeping_adjust(),
|
||||
* its possible the required corrective factor to xtime_nsec could
|
||||
* cause it to underflow.
|
||||
*
|
||||
@@ -550,24 +792,25 @@ void update_wall_time(void)
|
||||
* We'll correct this error next time through this function, when
|
||||
* xtime_nsec is not as small.
|
||||
*/
|
||||
if (unlikely((s64)clock->xtime_nsec < 0)) {
|
||||
s64 neg = -(s64)clock->xtime_nsec;
|
||||
clock->xtime_nsec = 0;
|
||||
clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
|
||||
if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
|
||||
s64 neg = -(s64)timekeeper.xtime_nsec;
|
||||
timekeeper.xtime_nsec = 0;
|
||||
timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
|
||||
}
|
||||
|
||||
/* store full nanoseconds into xtime after rounding it up and
|
||||
* add the remainder to the error difference.
|
||||
*/
|
||||
xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
|
||||
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
|
||||
clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
|
||||
xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
|
||||
timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
|
||||
timekeeper.ntp_error += timekeeper.xtime_nsec <<
|
||||
timekeeper.ntp_error_shift;
|
||||
|
||||
update_xtime_cache(cyc2ns(clock, offset));
|
||||
nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
|
||||
update_xtime_cache(nsecs);
|
||||
|
||||
/* check to see if there is a new clocksource to use */
|
||||
change_clocksource();
|
||||
update_vsyscall(&xtime, clock);
|
||||
update_vsyscall(&xtime, timekeeper.clock);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -583,9 +826,12 @@ void update_wall_time(void)
|
||||
*/
|
||||
void getboottime(struct timespec *ts)
|
||||
{
|
||||
set_normalized_timespec(ts,
|
||||
- (wall_to_monotonic.tv_sec + total_sleep_time),
|
||||
- wall_to_monotonic.tv_nsec);
|
||||
struct timespec boottime = {
|
||||
.tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
|
||||
.tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
|
||||
};
|
||||
|
||||
set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
|
||||
}
|
||||
|
||||
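getboottime() now sums the two offsets as timespecs before negating, but the arithmetic is unchanged: boot time = -(wall_to_monotonic + total_sleep_time). A worked example with made-up values:

#include <stdio.h>
#include <time.h>

int main(void)
{
        /* Hypothetical offsets: wall_to_monotonic is the negated wall time of boot,
         * total_sleep_time is how long the machine was suspended. */
        struct timespec wall_to_monotonic = { .tv_sec = -1000000, .tv_nsec = -500000000 };
        struct timespec total_sleep_time  = { .tv_sec = 120,      .tv_nsec = 0 };

        long long boot_ns = -((wall_to_monotonic.tv_sec + total_sleep_time.tv_sec) * 1000000000LL
                              + wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec);

        /* 999880.500000000: the wall-clock time at which the system booted */
        printf("boot happened at %lld.%09lld\n",
               boot_ns / 1000000000LL, boot_ns % 1000000000LL);
        return 0;
}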
/**
|
||||
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
|
||||
*/
|
||||
void monotonic_to_bootbased(struct timespec *ts)
|
||||
{
|
||||
ts->tv_sec += total_sleep_time;
|
||||
*ts = timespec_add_safe(*ts, total_sleep_time);
|
||||
}
|
||||
|
||||
unsigned long get_seconds(void)
|
||||
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
|
||||
}
|
||||
EXPORT_SYMBOL(get_seconds);
|
||||
|
||||
struct timespec __current_kernel_time(void)
|
||||
{
|
||||
return xtime_cache;
|
||||
}
|
||||
|
||||
struct timespec current_kernel_time(void)
|
||||
{
|
||||
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
|
||||
return now;
|
||||
}
|
||||
EXPORT_SYMBOL(current_kernel_time);
|
||||
|
||||
struct timespec get_monotonic_coarse(void)
|
||||
{
|
||||
struct timespec now, mono;
|
||||
unsigned long seq;
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&xtime_lock);
|
||||
|
||||
now = xtime_cache;
|
||||
mono = wall_to_monotonic;
|
||||
} while (read_seqretry(&xtime_lock, seq));
|
||||
|
||||
set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
|
||||
now.tv_nsec + mono.tv_nsec);
|
||||
return now;
|
||||
}
|
||||
|
@@ -37,7 +37,7 @@
|
||||
#include <linux/delay.h>
|
||||
#include <linux/tick.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/perf_counter.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
@@ -46,6 +46,9 @@
|
||||
#include <asm/timex.h>
|
||||
#include <asm/io.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/timer.h>
|
||||
|
||||
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
|
||||
|
||||
EXPORT_SYMBOL(jiffies_64);
|
||||
@@ -72,6 +75,7 @@ struct tvec_base {
|
||||
spinlock_t lock;
|
||||
struct timer_list *running_timer;
|
||||
unsigned long timer_jiffies;
|
||||
unsigned long next_timer;
|
||||
struct tvec_root tv1;
|
||||
struct tvec tv2;
|
||||
struct tvec tv3;
|
||||
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
|
||||
static inline void debug_timer_deactivate(struct timer_list *timer) { }
|
||||
#endif
|
||||
|
||||
static inline void debug_init(struct timer_list *timer)
|
||||
{
|
||||
debug_timer_init(timer);
|
||||
trace_timer_init(timer);
|
||||
}
|
||||
|
||||
static inline void
|
||||
debug_activate(struct timer_list *timer, unsigned long expires)
|
||||
{
|
||||
debug_timer_activate(timer);
|
||||
trace_timer_start(timer, expires);
|
||||
}
|
||||
|
||||
static inline void debug_deactivate(struct timer_list *timer)
|
||||
{
|
||||
debug_timer_deactivate(timer);
|
||||
trace_timer_cancel(timer);
|
||||
}
|
||||
|
||||
static void __init_timer(struct timer_list *timer,
|
||||
const char *name,
|
||||
struct lock_class_key *key)
|
||||
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
|
||||
const char *name,
|
||||
struct lock_class_key *key)
|
||||
{
|
||||
debug_timer_init(timer);
|
||||
debug_init(timer);
|
||||
__init_timer(timer, name, key);
|
||||
}
|
||||
EXPORT_SYMBOL(init_timer_key);
|
||||
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
|
||||
{
|
||||
struct list_head *entry = &timer->entry;
|
||||
|
||||
debug_timer_deactivate(timer);
|
||||
debug_deactivate(timer);
|
||||
|
||||
__list_del(entry->prev, entry->next);
|
||||
if (clear_pending)
|
||||
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
|
||||
|
||||
if (timer_pending(timer)) {
|
||||
detach_timer(timer, 0);
|
||||
if (timer->expires == base->next_timer &&
|
||||
!tbase_get_deferrable(timer->base))
|
||||
base->next_timer = base->timer_jiffies;
|
||||
ret = 1;
|
||||
} else {
|
||||
if (pending_only)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
debug_timer_activate(timer);
|
||||
debug_activate(timer, expires);
|
||||
|
||||
new_base = __get_cpu_var(tvec_bases);
|
||||
|
||||
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
|
||||
}
|
||||
|
||||
timer->expires = expires;
|
||||
if (time_before(timer->expires, base->next_timer) &&
|
||||
!tbase_get_deferrable(timer->base))
|
||||
base->next_timer = timer->expires;
|
||||
internal_add_timer(base, timer);
|
||||
|
||||
out_unlock:
|
||||
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
|
||||
BUG_ON(timer_pending(timer) || !timer->function);
|
||||
spin_lock_irqsave(&base->lock, flags);
|
||||
timer_set_base(timer, base);
|
||||
debug_timer_activate(timer);
|
||||
debug_activate(timer, timer->expires);
|
||||
if (time_before(timer->expires, base->next_timer) &&
|
||||
!tbase_get_deferrable(timer->base))
|
||||
base->next_timer = timer->expires;
|
||||
internal_add_timer(base, timer);
|
||||
/*
|
||||
* Check whether the other CPU is idle and needs to be
|
||||
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
|
||||
base = lock_timer_base(timer, &flags);
|
||||
if (timer_pending(timer)) {
|
||||
detach_timer(timer, 1);
|
||||
if (timer->expires == base->next_timer &&
|
||||
!tbase_get_deferrable(timer->base))
|
||||
base->next_timer = base->timer_jiffies;
|
||||
ret = 1;
|
||||
}
|
||||
spin_unlock_irqrestore(&base->lock, flags);
|
||||
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
|
||||
ret = 0;
|
||||
if (timer_pending(timer)) {
|
||||
detach_timer(timer, 1);
|
||||
if (timer->expires == base->next_timer &&
|
||||
!tbase_get_deferrable(timer->base))
|
||||
base->next_timer = base->timer_jiffies;
|
||||
ret = 1;
|
||||
}
|
||||
out:
|
||||
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
|
||||
*/
|
||||
lock_map_acquire(&lockdep_map);
|
||||
|
||||
trace_timer_expire_entry(timer);
|
||||
fn(data);
|
||||
trace_timer_expire_exit(timer);
|
||||
|
||||
lock_map_release(&lockdep_map);
|
||||
|
||||
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
|
||||
#ifdef CONFIG_NO_HZ
|
||||
/*
|
||||
* Find out when the next timer event is due to happen. This
|
||||
* is used on S/390 to stop all activity when a cpus is idle.
|
||||
* This functions needs to be called disabled.
|
||||
* is used on S/390 to stop all activity when a CPU is idle.
|
||||
* This function needs to be called with interrupts disabled.
|
||||
*/
|
||||
static unsigned long __next_timer_interrupt(struct tvec_base *base)
|
||||
{
|
||||
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
|
||||
unsigned long expires;
|
||||
|
||||
spin_lock(&base->lock);
|
||||
expires = __next_timer_interrupt(base);
|
||||
if (time_before_eq(base->next_timer, base->timer_jiffies))
|
||||
base->next_timer = __next_timer_interrupt(base);
|
||||
expires = base->next_timer;
|
||||
spin_unlock(&base->lock);
|
||||
|
||||
if (time_before_eq(expires, now))
|
||||
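This hunk, together with the matching updates in __mod_timer(), del_timer() and migrate_timer_list(), caches the earliest pending expiry in base->next_timer so get_next_timer_interrupt() only rescans the timer wheel once the cached value is no longer in the future. A self-contained miniature of that caching rule, with a hypothetical wheel that is not the kernel data structure:

#include <stdio.h>

struct wheel { unsigned long timer_jiffies, next_timer; };

/* Stands in for __next_timer_interrupt(): pretend the nearest timer is 7 ticks away. */
static unsigned long scan_wheel(struct wheel *w)
{
        return w->timer_jiffies + 7;
}

static unsigned long next_expiry(struct wheel *w)
{
        if (w->next_timer <= w->timer_jiffies)  /* cache stale: recompute */
                w->next_timer = scan_wheel(w);
        return w->next_timer;                   /* otherwise O(1) */
}

int main(void)
{
        struct wheel w = { .timer_jiffies = 100, .next_timer = 100 };

        printf("%lu\n", next_expiry(&w));       /* 107: scanned once          */
        printf("%lu\n", next_expiry(&w));       /* 107: served from the cache */
        return 0;
}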
@@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
|
||||
{
|
||||
struct tvec_base *base = __get_cpu_var(tvec_bases);
|
||||
|
||||
perf_counter_do_pending();
|
||||
perf_event_do_pending();
|
||||
|
||||
hrtimer_run_pending();
|
||||
|
||||
@@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
|
||||
INIT_LIST_HEAD(base->tv1.vec + j);
|
||||
|
||||
base->timer_jiffies = jiffies;
|
||||
base->next_timer = base->timer_jiffies;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
|
||||
timer = list_first_entry(head, struct timer_list, entry);
|
||||
detach_timer(timer, 0);
|
||||
timer_set_base(timer, new_base);
|
||||
if (time_before(timer->expires, new_base->next_timer) &&
|
||||
!tbase_get_deferrable(timer->base))
|
||||
new_base->next_timer = timer->expires;
|
||||
internal_add_timer(new_base, timer);
|
||||
}
|
||||
}
|
||||
|
@@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
|
||||
# This allows those options to appear when no other tracer is selected. But the
|
||||
# options do not appear when something else selects it. We need the two options
|
||||
# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
|
||||
# hidding of the automatic options options.
|
||||
# hidding of the automatic options.
|
||||
|
||||
config TRACING
|
||||
bool
|
||||
|
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
|
||||
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
|
||||
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
|
||||
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
|
||||
obj-$(CONFIG_POWER_TRACER) += trace_power.o
|
||||
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
|
||||
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
|
||||
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
|
||||
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
|
||||
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
|
||||
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
|
||||
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
|
||||
obj-$(CONFIG_EVENT_TRACING) += power-traces.o
|
||||
|
||||
libftrace-y := ftrace.o
|
||||
|
@@ -225,7 +225,11 @@ static void ftrace_update_pid_func(void)
|
||||
if (ftrace_trace_function == ftrace_stub)
|
||||
return;
|
||||
|
||||
#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
|
||||
func = ftrace_trace_function;
|
||||
#else
|
||||
func = __ftrace_trace_function;
|
||||
#endif
|
||||
|
||||
if (ftrace_pid_trace) {
|
||||
set_ftrace_pid_function(func);
|
||||
@@ -1520,7 +1524,7 @@ static int t_show(struct seq_file *m, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct seq_operations show_ftrace_seq_ops = {
|
||||
static const struct seq_operations show_ftrace_seq_ops = {
|
||||
.start = t_start,
|
||||
.next = t_next,
|
||||
.stop = t_stop,
|
||||
@@ -1621,8 +1625,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
|
||||
if (!ret) {
|
||||
struct seq_file *m = file->private_data;
|
||||
m->private = iter;
|
||||
} else
|
||||
} else {
|
||||
trace_parser_put(&iter->parser);
|
||||
kfree(iter);
|
||||
}
|
||||
} else
|
||||
file->private_data = iter;
|
||||
mutex_unlock(&ftrace_regex_lock);
|
||||
@@ -2148,7 +2154,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
|
||||
struct trace_parser *parser;
|
||||
ssize_t ret, read;
|
||||
|
||||
if (!cnt || cnt < 0)
|
||||
if (!cnt)
|
||||
return 0;
|
||||
|
||||
mutex_lock(&ftrace_regex_lock);
|
||||
@@ -2162,7 +2168,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
|
||||
parser = &iter->parser;
|
||||
read = trace_get_user(parser, ubuf, cnt, ppos);
|
||||
|
||||
if (trace_parser_loaded(parser) &&
|
||||
if (read >= 0 && trace_parser_loaded(parser) &&
|
||||
!trace_parser_cont(parser)) {
|
||||
ret = ftrace_process_regex(parser->buffer,
|
||||
parser->idx, enable);
|
||||
@@ -2360,11 +2366,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
|
||||
static void *
|
||||
__g_next(struct seq_file *m, loff_t *pos)
|
||||
{
|
||||
unsigned long *array = m->private;
|
||||
|
||||
if (*pos >= ftrace_graph_count)
|
||||
return NULL;
|
||||
return &array[*pos];
|
||||
return &ftrace_graph_funcs[*pos];
|
||||
}
|
||||
|
||||
static void *
|
||||
@@ -2407,7 +2411,7 @@ static int g_show(struct seq_file *m, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct seq_operations ftrace_graph_seq_ops = {
|
||||
static const struct seq_operations ftrace_graph_seq_ops = {
|
||||
.start = g_start,
|
||||
.next = g_next,
|
||||
.stop = g_stop,
|
||||
@@ -2428,17 +2432,11 @@ ftrace_graph_open(struct inode *inode, struct file *file)
|
||||
ftrace_graph_count = 0;
|
||||
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
|
||||
}
|
||||
|
||||
if (file->f_mode & FMODE_READ) {
|
||||
ret = seq_open(file, &ftrace_graph_seq_ops);
|
||||
if (!ret) {
|
||||
struct seq_file *m = file->private_data;
|
||||
m->private = ftrace_graph_funcs;
|
||||
}
|
||||
} else
|
||||
file->private_data = ftrace_graph_funcs;
|
||||
mutex_unlock(&graph_lock);
|
||||
|
||||
if (file->f_mode & FMODE_READ)
|
||||
ret = seq_open(file, &ftrace_graph_seq_ops);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -2506,9 +2504,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
|
||||
size_t cnt, loff_t *ppos)
|
||||
{
|
||||
struct trace_parser parser;
|
||||
unsigned long *array;
|
||||
size_t read = 0;
|
||||
ssize_t ret;
|
||||
ssize_t read, ret;
|
||||
|
||||
if (!cnt || cnt < 0)
|
||||
return 0;
|
||||
@@ -2517,35 +2513,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
|
||||
|
||||
if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (file->f_mode & FMODE_READ) {
|
||||
struct seq_file *m = file->private_data;
|
||||
array = m->private;
|
||||
} else
|
||||
array = file->private_data;
|
||||
|
||||
if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
read = trace_get_user(&parser, ubuf, cnt, ppos);
|
||||
|
||||
if (trace_parser_loaded((&parser))) {
|
||||
if (read >= 0 && trace_parser_loaded((&parser))) {
|
||||
parser.buffer[parser.idx] = 0;
|
||||
|
||||
/* we allow only one expression at a time */
|
||||
ret = ftrace_set_func(array, &ftrace_graph_count,
|
||||
ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
|
||||
parser.buffer);
|
||||
if (ret)
|
||||
goto out;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
ret = read;
|
||||
out:
|
||||
|
||||
out_free:
|
||||
trace_parser_put(&parser);
|
||||
out_unlock:
|
||||
mutex_unlock(&graph_lock);
|
||||
|
||||
return ret;
|
||||
@@ -2976,7 +2968,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
|
||||
|
||||
int
|
||||
ftrace_enable_sysctl(struct ctl_table *table, int write,
|
||||
struct file *file, void __user *buffer, size_t *lenp,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int ret;
|
||||
@@ -2986,7 +2978,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
|
||||
|
||||
mutex_lock(&ftrace_lock);
|
||||
|
||||
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
|
||||
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
||||
|
||||
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
|
||||
goto out;
|
||||
|
20
kernel/trace/power-traces.c
Normal file
20
kernel/trace/power-traces.c
Normal file
@@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Power trace points
|
||||
*
|
||||
* Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
|
||||
*/
|
||||
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/power.h>
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
|
||||
|
@@ -201,8 +201,6 @@ int tracing_is_on(void)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(tracing_is_on);
|
||||
|
||||
#include "trace.h"
|
||||
|
||||
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
|
||||
#define RB_ALIGNMENT 4U
|
||||
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
|
||||
|
@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;

static int tracing_set_tracer(const char *buf);

#define BOOTUP_TRACER_SIZE 100
static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;

static int __init set_ftrace(char *str)
{
    strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
    strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
    default_bootup_tracer = bootup_tracer_buf;
    /* We are using ftrace early, expand it */
    ring_buffer_expanded = 1;
@@ -241,13 +241,6 @@ static struct tracer *trace_types __read_mostly;
/* current_trace points to the tracer that is currently active */
static struct tracer *current_trace __read_mostly;

/*
 * max_tracer_type_len is used to simplify the allocating of
 * buffers to read userspace tracer names. We keep track of
 * the longest tracer name registered.
 */
static int max_tracer_type_len;

/*
 * trace_types_lock is used to protect the trace_types list.
 * This lock is also used to keep user access serialized.
@@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
 */
void trace_wake_up(void)
{
    int cpu;

    if (trace_flags & TRACE_ITER_BLOCK)
        return;
    /*
     * The runqueue_is_locked() can fail, but this is the best we
     * have for now:
     */
    if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
    cpu = get_cpu();
    if (!runqueue_is_locked(cpu))
        wake_up(&trace_wait);
    put_cpu();
}

static int __init set_buf_size(char *str)
@@ -416,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,

    /* read the non-space input */
    while (cnt && !isspace(ch)) {
        if (parser->idx < parser->size)
        if (parser->idx < parser->size - 1)
            parser->buffer[parser->idx++] = ch;
        else {
            ret = -EINVAL;
@@ -619,7 +618,6 @@ __releases(kernel_lock)
__acquires(kernel_lock)
{
    struct tracer *t;
    int len;
    int ret = 0;

    if (!type->name) {
@@ -627,6 +625,11 @@ __acquires(kernel_lock)
        return -1;
    }

    if (strlen(type->name) > MAX_TRACER_SIZE) {
        pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
        return -1;
    }

    /*
     * When this gets called we hold the BKL which means that
     * preemption is disabled. Various trace selftests however
@@ -641,7 +644,7 @@ __acquires(kernel_lock)
    for (t = trace_types; t; t = t->next) {
        if (strcmp(type->name, t->name) == 0) {
            /* already found */
            pr_info("Trace %s already registered\n",
            pr_info("Tracer %s already registered\n",
                type->name);
            ret = -1;
            goto out;
@@ -692,9 +695,6 @@ __acquires(kernel_lock)

    type->next = trace_types;
    trace_types = type;
    len = strlen(type->name);
    if (len > max_tracer_type_len)
        max_tracer_type_len = len;

 out:
    tracing_selftest_running = false;
@@ -703,7 +703,7 @@ __acquires(kernel_lock)
    if (ret || !default_bootup_tracer)
        goto out_unlock;

    if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
    if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
        goto out_unlock;

    printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -725,14 +725,13 @@ __acquires(kernel_lock)
void unregister_tracer(struct tracer *type)
{
    struct tracer **t;
    int len;

    mutex_lock(&trace_types_lock);
    for (t = &trace_types; *t; t = &(*t)->next) {
        if (*t == type)
            goto found;
    }
    pr_info("Trace %s not registered\n", type->name);
    pr_info("Tracer %s not registered\n", type->name);
    goto out;

 found:
@@ -745,17 +744,7 @@ void unregister_tracer(struct tracer *type)
        current_trace->stop(&global_trace);
        current_trace = &nop_trace;
    }

    if (strlen(type->name) != max_tracer_type_len)
        goto out;

    max_tracer_type_len = 0;
    for (t = &trace_types; *t; t = &(*t)->next) {
        len = strlen((*t)->name);
        if (len > max_tracer_type_len)
            max_tracer_type_len = len;
    }
 out:
 out:
    mutex_unlock(&trace_types_lock);
}
@@ -1960,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
    return 0;
}

static struct seq_operations tracer_seq_ops = {
static const struct seq_operations tracer_seq_ops = {
    .start = s_start,
    .next = s_next,
    .stop = s_stop,
@@ -1995,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file)
    if (current_trace)
        *iter->trace = *current_trace;

    if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
    if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
        goto fail;

    cpumask_clear(iter->started);

    if (current_trace && current_trace->print_max)
        iter->tr = &max_tr;
    else
@@ -2174,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v)
    return 0;
}

static struct seq_operations show_traces_seq_ops = {
static const struct seq_operations show_traces_seq_ops = {
    .start = t_start,
    .next = t_next,
    .stop = t_stop,
@@ -2604,7 +2591,7 @@ static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
               size_t cnt, loff_t *ppos)
{
    char buf[max_tracer_type_len+2];
    char buf[MAX_TRACER_SIZE+2];
    int r;

    mutex_lock(&trace_types_lock);
@@ -2754,15 +2741,15 @@ static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
            size_t cnt, loff_t *ppos)
{
    char buf[max_tracer_type_len+1];
    char buf[MAX_TRACER_SIZE+1];
    int i;
    size_t ret;
    int err;

    ret = cnt;

    if (cnt > max_tracer_type_len)
        cnt = max_tracer_type_len;
    if (cnt > MAX_TRACER_SIZE)
        cnt = MAX_TRACER_SIZE;

    if (copy_from_user(&buf, ubuf, cnt))
        return -EFAULT;
@@ -4400,7 +4387,7 @@ __init static int tracer_alloc_buffers(void)
    if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
        goto out_free_buffer_mask;

    if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
    if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
        goto out_free_tracing_cpumask;

    /* To save memory, keep the ring buffer size to its minimum */
@@ -4411,7 +4398,6 @@ __init static int tracer_alloc_buffers(void)

    cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
    cpumask_copy(tracing_cpumask, cpu_all_mask);
    cpumask_clear(tracing_reader_cpumask);

    /* TODO: make the number of buffers hot pluggable with CPUS */
    global_trace.buffer = ring_buffer_alloc(ring_buf_size,
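Two of the trace.c hunks above (in __tracing_open() and tracer_alloc_buffers()) fold alloc_cpumask_var() plus cpumask_clear() into zalloc_cpumask_var(), which hands back an already-zeroed mask. A minimal sketch of the converted call site; example_alloc_mask() is an illustrative name:

/* Illustrative only: one zalloc_cpumask_var() call replaces alloc + clear. */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int example_alloc_mask(cpumask_var_t *mask)
{
	if (!zalloc_cpumask_var(mask, GFP_KERNEL))	/* mask comes back zeroed */
		return -ENOMEM;
	return 0;	/* no separate cpumask_clear() needed */
}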
@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <trace/boot.h>
#include <linux/kmemtrace.h>
#include <trace/power.h>

#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
@@ -37,7 +36,6 @@ enum trace_type {
    TRACE_HW_BRANCHES,
    TRACE_KMEM_ALLOC,
    TRACE_KMEM_FREE,
    TRACE_POWER,
    TRACE_BLK,

    __TRACE_LAST_TYPE,
@@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void);
        IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
              TRACE_GRAPH_RET);		\
        IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
        IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER);	\
        IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
              TRACE_KMEM_ALLOC);	\
        IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
@@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
    F_printk("from: %llx to: %llx", __entry->from, __entry->to)
);

FTRACE_ENTRY(power, trace_power,

    TRACE_POWER,

    F_STRUCT(
        __field_struct( struct power_trace, state_data )
        __field_desc( s64, state_data, stamp )
        __field_desc( s64, state_data, end )
        __field_desc( int, state_data, type )
        __field_desc( int, state_data, state )
    ),

    F_printk("%llx->%llx type:%u state:%u",
         __entry->stamp, __entry->end,
         __entry->type, __entry->state)
);

FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,

    TRACE_KMEM_ALLOC,
@@ -8,6 +8,57 @@
#include <linux/module.h>
#include "trace.h"

/*
 * We can't use a size but a type in alloc_percpu()
 * So let's create a dummy type that matches the desired size
 */
typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;

char *trace_profile_buf;
EXPORT_SYMBOL_GPL(trace_profile_buf);

char *trace_profile_buf_nmi;
EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);

/* Count the events in use (per event id, not per instance) */
static int total_profile_count;

static int ftrace_profile_enable_event(struct ftrace_event_call *event)
{
    char *buf;
    int ret = -ENOMEM;

    if (atomic_inc_return(&event->profile_count))
        return 0;

    if (!total_profile_count++) {
        buf = (char *)alloc_percpu(profile_buf_t);
        if (!buf)
            goto fail_buf;

        rcu_assign_pointer(trace_profile_buf, buf);

        buf = (char *)alloc_percpu(profile_buf_t);
        if (!buf)
            goto fail_buf_nmi;

        rcu_assign_pointer(trace_profile_buf_nmi, buf);
    }

    ret = event->profile_enable();
    if (!ret)
        return 0;

    kfree(trace_profile_buf_nmi);
fail_buf_nmi:
    kfree(trace_profile_buf);
fail_buf:
    total_profile_count--;
    atomic_dec(&event->profile_count);

    return ret;
}

int ftrace_profile_enable(int event_id)
{
    struct ftrace_event_call *event;
@@ -17,7 +68,7 @@ int ftrace_profile_enable(int event_id)
    list_for_each_entry(event, &ftrace_events, list) {
        if (event->id == event_id && event->profile_enable &&
            try_module_get(event->mod)) {
            ret = event->profile_enable(event);
            ret = ftrace_profile_enable_event(event);
            break;
        }
    }
@@ -26,6 +77,33 @@ int ftrace_profile_enable(int event_id)
    return ret;
}

static void ftrace_profile_disable_event(struct ftrace_event_call *event)
{
    char *buf, *nmi_buf;

    if (!atomic_add_negative(-1, &event->profile_count))
        return;

    event->profile_disable();

    if (!--total_profile_count) {
        buf = trace_profile_buf;
        rcu_assign_pointer(trace_profile_buf, NULL);

        nmi_buf = trace_profile_buf_nmi;
        rcu_assign_pointer(trace_profile_buf_nmi, NULL);

        /*
         * Ensure every events in profiling have finished before
         * releasing the buffers
         */
        synchronize_sched();

        free_percpu(buf);
        free_percpu(nmi_buf);
    }
}

void ftrace_profile_disable(int event_id)
{
    struct ftrace_event_call *event;
@@ -33,7 +111,7 @@ void ftrace_profile_disable(int event_id)
    mutex_lock(&event_mutex);
    list_for_each_entry(event, &ftrace_events, list) {
        if (event->id == event_id) {
            event->profile_disable(event);
            ftrace_profile_disable_event(event);
            module_put(event->mod);
            break;
        }
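The trace_event_profile.c hunk above allocates the profiling buffers per cpu (sizing alloc_percpu() with a dummy type), publishes them with rcu_assign_pointer(), and frees them only after synchronize_sched(). A minimal sketch of that lifecycle under illustrative names (my_buf, MY_BUF_SIZE, my_buf_create/destroy); it is not the patch itself:

/*
 * Illustrative names; the patch uses trace_profile_buf and
 * FTRACE_MAX_PROFILE_SIZE.
 */
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>

#define MY_BUF_SIZE 4096
typedef struct { char buf[MY_BUF_SIZE]; } my_buf_t;	/* alloc_percpu() takes a type, not a size */

static char *my_buf;

static int my_buf_create(void)
{
	char *buf = (char *)alloc_percpu(my_buf_t);

	if (!buf)
		return -ENOMEM;
	rcu_assign_pointer(my_buf, buf);	/* publish to readers */
	return 0;
}

static void my_buf_destroy(void)
{
	char *buf = my_buf;

	rcu_assign_pointer(my_buf, NULL);	/* unpublish */
	synchronize_sched();			/* wait out irq-off/preempt-off readers */
	free_percpu(buf);
}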
@@ -232,10 +232,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
           size_t cnt, loff_t *ppos)
{
    struct trace_parser parser;
    size_t read = 0;
    ssize_t ret;
    ssize_t read, ret;

    if (!cnt || cnt < 0)
    if (!cnt)
        return 0;

    ret = tracing_update_buffers();
@@ -247,7 +246,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,

    read = trace_get_user(&parser, ubuf, cnt, ppos);

    if (trace_parser_loaded((&parser))) {
    if (read >= 0 && trace_parser_loaded((&parser))) {
        int set = 1;

        if (*parser.buffer == '!')
@@ -271,42 +270,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
    struct list_head *list = m->private;
    struct ftrace_event_call *call;
    struct ftrace_event_call *call = v;

    (*pos)++;

    for (;;) {
        if (list == &ftrace_events)
            return NULL;

        call = list_entry(list, struct ftrace_event_call, list);

    list_for_each_entry_continue(call, &ftrace_events, list) {
        /*
         * The ftrace subsystem is for showing formats only.
         * They can not be enabled or disabled via the event files.
         */
        if (call->regfunc)
            break;

        list = list->next;
            return call;
    }

    m->private = list->next;

    return call;
    return NULL;
}

static void *t_start(struct seq_file *m, loff_t *pos)
{
    struct ftrace_event_call *call = NULL;
    struct ftrace_event_call *call;
    loff_t l;

    mutex_lock(&event_mutex);

    m->private = ftrace_events.next;
    call = list_entry(&ftrace_events, struct ftrace_event_call, list);
    for (l = 0; l <= *pos; ) {
        call = t_next(m, NULL, &l);
        call = t_next(m, call, &l);
        if (!call)
            break;
    }
@@ -316,37 +305,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
    struct list_head *list = m->private;
    struct ftrace_event_call *call;
    struct ftrace_event_call *call = v;

    (*pos)++;

 retry:
    if (list == &ftrace_events)
        return NULL;

    call = list_entry(list, struct ftrace_event_call, list);

    if (!call->enabled) {
        list = list->next;
        goto retry;
    list_for_each_entry_continue(call, &ftrace_events, list) {
        if (call->enabled)
            return call;
    }

    m->private = list->next;

    return call;
    return NULL;
}

static void *s_start(struct seq_file *m, loff_t *pos)
{
    struct ftrace_event_call *call = NULL;
    struct ftrace_event_call *call;
    loff_t l;

    mutex_lock(&event_mutex);

    m->private = ftrace_events.next;
    call = list_entry(&ftrace_events, struct ftrace_event_call, list);
    for (l = 0; l <= *pos; ) {
        call = s_next(m, NULL, &l);
        call = s_next(m, call, &l);
        if (!call)
            break;
    }
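The t_next()/t_start() and s_next()/s_start() rewrites above drop the hand-rolled list walking in favor of list_for_each_entry_continue(), with ->start replaying ->next until *pos entries have been consumed. A sketch of that seq_file iterator shape with illustrative my_item/my_list/my_lock names:

/* Illustrative only; mirrors the shape of the rewritten iterators above. */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

struct my_item {
	struct list_head list;
	int enabled;
};

static LIST_HEAD(my_list);
static DEFINE_MUTEX(my_lock);

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct my_item *it = v;

	(*pos)++;
	list_for_each_entry_continue(it, &my_list, list) {
		if (it->enabled)
			return it;
	}
	return NULL;
}

static void *my_start(struct seq_file *m, loff_t *pos)
{
	struct my_item *it;
	loff_t l;

	mutex_lock(&my_lock);
	/* anchor on the list head so _continue() begins at the first entry */
	it = list_entry(&my_list, struct my_item, list);
	for (l = 0; l <= *pos; ) {
		it = my_next(m, it, &l);
		if (!it)
			break;
	}
	return it;
}

static void my_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&my_lock);
}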
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
            seq_print_ip_sym(seq, it->from, symflags) &&
            trace_seq_printf(seq, "\n"))
            return TRACE_TYPE_HANDLED;
        return TRACE_TYPE_PARTIAL_LINE;;
        return TRACE_TYPE_PARTIAL_LINE;
    }
    return TRACE_TYPE_UNHANDLED;
}
@@ -1,218 +0,0 @@
/*
 * ring buffer based C-state tracer
 *
 * Arjan van de Ven <arjan@linux.intel.com>
 * Copyright (C) 2008 Intel Corporation
 *
 * Much is borrowed from trace_boot.c which is
 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
 *
 */

#include <linux/init.h>
#include <linux/debugfs.h>
#include <trace/power.h>
#include <linux/kallsyms.h>
#include <linux/module.h>

#include "trace.h"
#include "trace_output.h"

static struct trace_array *power_trace;
static int __read_mostly trace_power_enabled;

static void probe_power_start(struct power_trace *it, unsigned int type,
                unsigned int level)
{
    if (!trace_power_enabled)
        return;

    memset(it, 0, sizeof(struct power_trace));
    it->state = level;
    it->type = type;
    it->stamp = ktime_get();
}


static void probe_power_end(struct power_trace *it)
{
    struct ftrace_event_call *call = &event_power;
    struct ring_buffer_event *event;
    struct ring_buffer *buffer;
    struct trace_power *entry;
    struct trace_array_cpu *data;
    struct trace_array *tr = power_trace;

    if (!trace_power_enabled)
        return;

    buffer = tr->buffer;

    preempt_disable();
    it->end = ktime_get();
    data = tr->data[smp_processor_id()];

    event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
                      sizeof(*entry), 0, 0);
    if (!event)
        goto out;
    entry = ring_buffer_event_data(event);
    entry->state_data = *it;
    if (!filter_check_discard(call, entry, buffer, event))
        trace_buffer_unlock_commit(buffer, event, 0, 0);
 out:
    preempt_enable();
}

static void probe_power_mark(struct power_trace *it, unsigned int type,
                unsigned int level)
{
    struct ftrace_event_call *call = &event_power;
    struct ring_buffer_event *event;
    struct ring_buffer *buffer;
    struct trace_power *entry;
    struct trace_array_cpu *data;
    struct trace_array *tr = power_trace;

    if (!trace_power_enabled)
        return;

    buffer = tr->buffer;

    memset(it, 0, sizeof(struct power_trace));
    it->state = level;
    it->type = type;
    it->stamp = ktime_get();
    preempt_disable();
    it->end = it->stamp;
    data = tr->data[smp_processor_id()];

    event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
                      sizeof(*entry), 0, 0);
    if (!event)
        goto out;
    entry = ring_buffer_event_data(event);
    entry->state_data = *it;
    if (!filter_check_discard(call, entry, buffer, event))
        trace_buffer_unlock_commit(buffer, event, 0, 0);
 out:
    preempt_enable();
}

static int tracing_power_register(void)
{
    int ret;

    ret = register_trace_power_start(probe_power_start);
    if (ret) {
        pr_info("power trace: Couldn't activate tracepoint"
            " probe to trace_power_start\n");
        return ret;
    }
    ret = register_trace_power_end(probe_power_end);
    if (ret) {
        pr_info("power trace: Couldn't activate tracepoint"
            " probe to trace_power_end\n");
        goto fail_start;
    }
    ret = register_trace_power_mark(probe_power_mark);
    if (ret) {
        pr_info("power trace: Couldn't activate tracepoint"
            " probe to trace_power_mark\n");
        goto fail_end;
    }
    return ret;
fail_end:
    unregister_trace_power_end(probe_power_end);
fail_start:
    unregister_trace_power_start(probe_power_start);
    return ret;
}

static void start_power_trace(struct trace_array *tr)
{
    trace_power_enabled = 1;
}

static void stop_power_trace(struct trace_array *tr)
{
    trace_power_enabled = 0;
}

static void power_trace_reset(struct trace_array *tr)
{
    trace_power_enabled = 0;
    unregister_trace_power_start(probe_power_start);
    unregister_trace_power_end(probe_power_end);
    unregister_trace_power_mark(probe_power_mark);
}


static int power_trace_init(struct trace_array *tr)
{
    power_trace = tr;

    trace_power_enabled = 1;
    tracing_power_register();

    tracing_reset_online_cpus(tr);
    return 0;
}

static enum print_line_t power_print_line(struct trace_iterator *iter)
{
    int ret = 0;
    struct trace_entry *entry = iter->ent;
    struct trace_power *field ;
    struct power_trace *it;
    struct trace_seq *s = &iter->seq;
    struct timespec stamp;
    struct timespec duration;

    trace_assign_type(field, entry);
    it = &field->state_data;
    stamp = ktime_to_timespec(it->stamp);
    duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));

    if (entry->type == TRACE_POWER) {
        if (it->type == POWER_CSTATE)
            ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
                      stamp.tv_sec,
                      stamp.tv_nsec,
                      it->state, iter->cpu,
                      duration.tv_sec,
                      duration.tv_nsec);
        if (it->type == POWER_PSTATE)
            ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
                      stamp.tv_sec,
                      stamp.tv_nsec,
                      it->state, iter->cpu);
        if (!ret)
            return TRACE_TYPE_PARTIAL_LINE;
        return TRACE_TYPE_HANDLED;
    }
    return TRACE_TYPE_UNHANDLED;
}

static void power_print_header(struct seq_file *s)
{
    seq_puts(s, "#   TIMESTAMP      STATE  EVENT\n");
    seq_puts(s, "#       |            |      |\n");
}

static struct tracer power_tracer __read_mostly =
{
    .name = "power",
    .init = power_trace_init,
    .start = start_power_trace,
    .stop = stop_power_trace,
    .reset = power_trace_reset,
    .print_line = power_print_line,
    .print_header = power_print_header,
};

static int init_power_trace(void)
{
    return register_tracer(&power_tracer);
}
device_initcall(init_power_trace);
@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {

int
stack_trace_sysctl(struct ctl_table *table, int write,
           struct file *file, void __user *buffer, size_t *lenp,
           void __user *buffer, size_t *lenp,
           loff_t *ppos)
{
    int ret;

    mutex_lock(&stack_sysctl_mutex);

    ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
    ret = proc_dointvec(table, write, buffer, lenp, ppos);

    if (ret || !write ||
        (last_stack_tracer_enabled == !!stack_tracer_enabled))
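stack_trace_sysctl() above, like ftrace_enable_sysctl() earlier, drops the struct file * argument that proc handlers no longer take. A sketch of the resulting handler shape around proc_dointvec(); my_sysctl_handler, my_flag and my_lock are illustrative names:

/* Illustrative only: the post-change proc handler signature. */
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/sysctl.h>

static int my_flag;
static DEFINE_MUTEX(my_lock);

static int my_sysctl_handler(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	mutex_lock(&my_lock);
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (!ret && write)
		pr_info("my_flag is now %d\n", my_flag);	/* react to the new value */
	mutex_unlock(&my_lock);
	return ret;
}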
@@ -2,7 +2,7 @@
#include <trace/events/syscalls.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;

static void prof_syscall_enter(struct pt_regs *regs, long id)
{
    struct syscall_trace_enter *rec;
    struct syscall_metadata *sys_data;
    struct syscall_trace_enter *rec;
    unsigned long flags;
    char *raw_data;
    int syscall_nr;
    int size;
    int cpu;

    syscall_nr = syscall_get_nr(current, regs);
    if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
    size = ALIGN(size + sizeof(u32), sizeof(u64));
    size -= sizeof(u32);

    do {
        char raw_data[size];
    if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
              "profile buffer not large enough"))
        return;

        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
    /* Protect the per cpu buffer, begin the rcu read side */
    local_irq_save(flags);

        rec = (struct syscall_trace_enter *) raw_data;
        tracing_generic_entry_update(&rec->ent, 0, 0);
        rec->ent.type = sys_data->enter_id;
        rec->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                       (unsigned long *)&rec->args);
        perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
    } while(0);
    cpu = smp_processor_id();

    if (in_nmi())
        raw_data = rcu_dereference(trace_profile_buf_nmi);
    else
        raw_data = rcu_dereference(trace_profile_buf);

    if (!raw_data)
        goto end;

    raw_data = per_cpu_ptr(raw_data, cpu);

    /* zero the dead bytes from align to not leak stack to user */
    *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

    rec = (struct syscall_trace_enter *) raw_data;
    tracing_generic_entry_update(&rec->ent, 0, 0);
    rec->ent.type = sys_data->enter_id;
    rec->nr = syscall_nr;
    syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                   (unsigned long *)&rec->args);
    perf_tp_event(sys_data->enter_id, 0, 1, rec, size);

end:
    local_irq_restore(flags);
}

int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
static void prof_syscall_exit(struct pt_regs *regs, long ret)
{
    struct syscall_metadata *sys_data;
    struct syscall_trace_exit rec;
    struct syscall_trace_exit *rec;
    unsigned long flags;
    int syscall_nr;
    char *raw_data;
    int size;
    int cpu;

    syscall_nr = syscall_get_nr(current, regs);
    if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
    if (!sys_data)
        return;

    tracing_generic_entry_update(&rec.ent, 0, 0);
    rec.ent.type = sys_data->exit_id;
    rec.nr = syscall_nr;
    rec.ret = syscall_get_return_value(current, regs);
    /* We can probably do that at build time */
    size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
    size -= sizeof(u32);

    perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
    /*
     * Impossible, but be paranoid with the future
     * How to put this check outside runtime?
     */
    if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
              "exit event has grown above profile buffer size"))
        return;

    /* Protect the per cpu buffer, begin the rcu read side */
    local_irq_save(flags);
    cpu = smp_processor_id();

    if (in_nmi())
        raw_data = rcu_dereference(trace_profile_buf_nmi);
    else
        raw_data = rcu_dereference(trace_profile_buf);

    if (!raw_data)
        goto end;

    raw_data = per_cpu_ptr(raw_data, cpu);

    /* zero the dead bytes from align to not leak stack to user */
    *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

    rec = (struct syscall_trace_exit *)raw_data;

    tracing_generic_entry_update(&rec->ent, 0, 0);
    rec->ent.type = sys_data->exit_id;
    rec->nr = syscall_nr;
    rec->ret = syscall_get_return_value(current, regs);

    perf_tp_event(sys_data->exit_id, 0, 1, rec, size);

end:
    local_irq_restore(flags);
}

int reg_prof_syscall_exit(char *name)
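prof_syscall_enter()/prof_syscall_exit() above stop building the record on the stack and instead borrow this CPU's slice of the RCU-published profile buffer with interrupts disabled. A sketch of that reader side, reusing the illustrative my_buf/MY_BUF_SIZE names from the allocation sketch earlier; the real code additionally keeps a separate buffer for NMI context and hands the record to perf_tp_event():

/*
 * Builds on the my_buf/MY_BUF_SIZE sketch above; my_record_event() is an
 * illustrative name and the memcpy() stands in for filling a trace record.
 */
#include <linux/irqflags.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/string.h>

static void my_record_event(const void *data, int size)
{
	unsigned long flags;
	char *raw_data;

	if (WARN_ONCE(size > MY_BUF_SIZE, "record larger than profile buffer"))
		return;

	/* irqs off pins the RCU read side against my_buf_destroy() */
	local_irq_save(flags);

	/* the real code switches to a dedicated NMI buffer when in_nmi() */
	raw_data = rcu_dereference(my_buf);
	if (!raw_data)
		goto end;

	/* take this CPU's slice of the per-cpu allocation */
	raw_data = per_cpu_ptr(raw_data, smp_processor_id());
	memcpy(raw_data, data, size);
end:
	local_irq_restore(flags);
}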
@@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];

/*
 * Note about RCU :
 * It is used to to delay the free of multiple probes array until a quiescent
 * It is used to delay the free of multiple probes array until a quiescent
 * state is reached.
 * Tracepoint entries modifications are protected by the tracepoints_mutex.
 */
@@ -4,7 +4,6 @@
 */

#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
 * Special case of dostring for the UTS structure. This has locks
 * to observe. Should this be in kernel/sys.c ????
 */
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
static int proc_do_uts_string(ctl_table *table, int write,
          void __user *buffer, size_t *lenp, loff_t *ppos)
{
    struct ctl_table uts_table;
    int r;
    memcpy(&uts_table, table, sizeof(uts_table));
    uts_table.data = get_uts(table, write);
    r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
    r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
    put_uts(table, write, uts_table.data);
    return r;
}