Merge branch 'tracing/urgent' into tracing/core

Merge reason: Pick up latest fixes and update to latest upstream.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Ingo Molnar
2009-10-01 11:20:33 +02:00
4522 changed files with 350207 additions and 129152 deletions

View File

@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -87,7 +86,6 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
@@ -96,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

View File

@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
case AUDIT_SIGNAL_INFO:
err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
if (err)
return err;
len = 0;
if (audit_sig_sid) {
err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
if (err)
return err;
}
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
if (!sig_data) {
security_release_secctx(ctx, len);
if (audit_sig_sid)
security_release_secctx(ctx, len);
return -ENOMEM;
}
sig_data->uid = audit_sig_uid;
sig_data->pid = audit_sig_pid;
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
0, 0, sig_data, sizeof(*sig_data) + len);
kfree(sig_data);

View File

@@ -45,8 +45,8 @@
struct audit_watch {
atomic_t count; /* reference count */
char *path; /* insertion path */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */

View File

@@ -168,12 +168,12 @@ struct audit_context {
int in_syscall; /* 1 if task is in a syscall */
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
struct timespec ctime; /* time of syscall entry */
int major; /* syscall number */
struct timespec ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
int return_valid; /* return code is valid */
long return_code;/* syscall return code */
u64 prio;
int return_valid; /* return code is valid */
int name_count;
struct audit_names names[AUDIT_NAMES];
char * filterkey; /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
int tree_count;
struct list_head killed_trees;
int tree_count;
int type;
union {

File diff suppressed because it is too large Load Diff

View File

@@ -1,105 +0,0 @@
/*
* kernel/cgroup_debug.c - Example cgroup subsystem that
* exposes debug info
*
* Copyright (C) Google Inc, 2007
*
* Developed by Paul Menage (menage@google.com)
*
*/
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <asm/atomic.h>
/*
 * Allocate the zero-filled subsystem state for a cgroup of the debug
 * subsystem.  Neither @ss nor @cont is consulted.
 *
 * Returns the new state, or ERR_PTR(-ENOMEM) when the allocation fails.
 */
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
						   struct cgroup *cont)
{
	struct cgroup_subsys_state *state;

	state = kzalloc(sizeof(*state), GFP_KERNEL);

	return state ? state : ERR_PTR(-ENOMEM);
}
/* Free the per-cgroup debug state held in cont->subsys[debug_subsys_id]. */
static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
kfree(cont->subsys[debug_subsys_id]);
}
/* read_u64 handler: report the current value of the atomic cont->count. */
static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
{
return atomic_read(&cont->count);
}
/* read_u64 handler: report the value returned by cgroup_task_count(). */
static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
{
	return cgroup_task_count(cont);
}
/*
 * read_u64 handler: report the current->cgroups pointer as a number.
 * The pointer is cast through long before widening to u64; @cont and
 * @cft are unused.
 */
static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
{
return (u64)(long)current->cgroups;
}
/*
 * read_u64 handler: report the refcount of current->cgroups.  The
 * counter is sampled inside an RCU read-side critical section; @cont
 * and @cft are unused.
 */
static u64 current_css_set_refcount_read(struct cgroup *cont,
					 struct cftype *cft)
{
	u64 refs;

	rcu_read_lock();
	refs = atomic_read(&current->cgroups->refcount);
	rcu_read_unlock();

	return refs;
}
/* read_u64 handler: report whether CGRP_RELEASABLE is set in cgrp->flags. */
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}
/*
 * Control files exported by the debug subsystem.  Each entry pairs a
 * file name with the read_u64 handler that produces its value; the
 * table is registered by debug_populate().
 */
static struct cftype files[] = {
{
.name = "cgroup_refcount",
.read_u64 = cgroup_refcount_read,
},
{
.name = "taskcount",
.read_u64 = taskcount_read,
},
{
.name = "current_css_set",
.read_u64 = current_css_set_read,
},
{
.name = "current_css_set_refcount",
.read_u64 = current_css_set_refcount_read,
},
{
.name = "releasable",
.read_u64 = releasable_read,
},
};
/*
 * Register every entry of files[] on @cont through cgroup_add_files()
 * and return that call's result.
 */
static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
}
/*
 * Descriptor of the "debug" cgroup subsystem: hooks up the create,
 * destroy and populate callbacks defined in this file and records the
 * subsystem id.
 */
struct cgroup_subsys debug_subsys = {
.name = "debug",
.create = debug_create,
.destroy = debug_destroy,
.populate = debug_populate,
.subsys_id = debug_subsys_id,
};

View File

@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
struct task_struct *task)
struct task_struct *task, bool threadgroup)
{
struct freezer *freezer;
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
if (is_task_frozen_enough(c)) {
rcu_read_unlock();
return -EBUSY;
}
}
rcu_read_unlock();
}
return 0;
}

View File

@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
static cpumask_var_t cpus_attach;
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct task_struct *tsk)
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct task_struct *tsk, bool threadgroup)
{
int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
return security_task_setscheduler(tsk, 0, NULL);
ret = security_task_setscheduler(tsk, 0, NULL);
if (ret)
return ret;
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
ret = security_task_setscheduler(c, 0, NULL);
if (ret) {
rcu_read_unlock();
return ret;
}
}
rcu_read_unlock();
}
return 0;
}
static void cpuset_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct cgroup *oldcont,
struct task_struct *tsk)
/*
 * Per-task part of attaching to a cpuset: apply the cpus_attach mask to
 * @tsk, switch its nodemask to @to while holding task_lock(), then
 * update its spread flags from @cs.  A failure to set the CPU mask is
 * only reported through WARN_ON_ONCE(); the remaining steps still run.
 */
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
struct cpuset *cs)
{
int err;
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
*/
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
task_lock(tsk);
cpuset_change_task_nodemask(tsk, to);
task_unlock(tsk);
cpuset_update_task_spread_flag(cs, tsk);
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct cgroup *oldcont, struct task_struct *tsk,
bool threadgroup)
{
nodemask_t from, to;
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
int err;
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &to);
}
err = set_cpus_allowed_ptr(tsk, cpus_attach);
if (err)
return;
task_lock(tsk);
cpuset_change_task_nodemask(tsk, &to);
task_unlock(tsk);
cpuset_update_task_spread_flag(cs, tsk);
/* do per-task migration stuff possibly for each in the threadgroup */
cpuset_attach_task(tsk, &to, cs);
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
cpuset_attach_task(c, &to, cs);
}
rcu_read_unlock();
}
/* change mm; only needs to be done once even if threadgroup */
from = oldcs->mems_allowed;
to = cs->mems_allowed;
mm = get_task_mm(tsk);

View File

@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
#ifdef CONFIG_DEBUG_CREDENTIALS
/*
 * Sanity-check a set of credentials.
 *
 * Returns true when @cred looks corrupt, false otherwise.  A cred is
 * reported as invalid when:
 *  - its magic number is not CRED_MAGIC;
 *  - its usage count is lower than its subscribers count;
 *  - (CONFIG_SECURITY_SELINUX only, and only while SELinux is enabled)
 *    its security pointer lies below PAGE_SIZE, or the upper three
 *    bytes of the first u32 it points to are all POISON_FREE.
 */
bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
return true;
#ifdef CONFIG_SECURITY_SELINUX
if (selinux_is_enabled()) {
/* a pointer into the first page cannot be a valid security blob */
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
/* only the top three bytes are compared against the poison pattern */
if ((*(u32 *)cred->security & 0xffffff00) ==
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
return true;
}
#endif
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
/*
* dump invalid credentials
*/

View File

@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>

View File

@@ -47,7 +47,7 @@
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
#ifdef CONFIG_PERF_COUNTERS
WARN_ON_ONCE(tsk->perf_counter_ctxp);
#ifdef CONFIG_PERF_EVENTS
WARN_ON_ONCE(tsk->perf_event_ctxp);
#endif
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
if (task_session(curr) != pid)
if (task_session(curr) != pid) {
change_pid(curr, PIDTYPE_SID, pid);
proc_sid_connector(curr);
}
if (task_pgrp(curr) != pid)
change_pid(curr, PIDTYPE_PGID, pid);
@@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code)
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
}
acct_collect(code, group_dead);
if (group_dead)
@@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
if (tsk->binfmt)
module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
@@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code)
* Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*/
perf_counter_exit_task(tsk);
perf_event_exit_task(tsk);
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
@@ -1093,28 +1095,28 @@ struct wait_opts {
int __user *wo_stat;
struct rusage __user *wo_rusage;
wait_queue_t child_wait;
int notask_error;
};
static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
struct pid *pid = NULL;
if (type == PIDTYPE_PID)
pid = task->pids[type].pid;
else if (type < PIDTYPE_MAX)
pid = task->group_leader->pids[type].pid;
return pid;
if (type != PIDTYPE_PID)
task = task->group_leader;
return task->pids[type].pid;
}
/*
 * Does @p match the pid selection in @wo?  A wo_type of PIDTYPE_MAX
 * matches every task; otherwise the task's pid of that type must be
 * wo->wo_pid.  Returns non-zero on a match.
 */
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
	if (wo->wo_type == PIDTYPE_MAX)
		return 1;

	return task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
static int eligible_child(struct wait_opts *wo, struct task_struct *p)
{
int err;
if (wo->wo_type < PIDTYPE_MAX) {
if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
return 0;
}
if (!eligible_pid(wo, p))
return 0;
/* Wait for all children (clone and not) if __WALL is set;
* otherwise, wait for clone children *only* if __WCLONE is
* set; otherwise, wait for non-clone children *only*. (Note:
@@ -1124,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
&& !(wo->wo_flags & __WALL))
return 0;
err = security_task_wait(p);
if (err)
return err;
return 1;
}
@@ -1140,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
put_task_struct(p);
infop = wo->wo_info;
if (!retval)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval)
retval = put_user(0, &infop->si_errno);
if (!retval)
retval = put_user((short)why, &infop->si_code);
if (!retval)
retval = put_user(pid, &infop->si_pid);
if (!retval)
retval = put_user(uid, &infop->si_uid);
if (!retval)
retval = put_user(status, &infop->si_status);
if (infop) {
if (!retval)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval)
retval = put_user(0, &infop->si_errno);
if (!retval)
retval = put_user((short)why, &infop->si_code);
if (!retval)
retval = put_user(pid, &infop->si_pid);
if (!retval)
retval = put_user(uid, &infop->si_uid);
if (!retval)
retval = put_user(status, &infop->si_status);
}
if (!retval)
retval = pid;
return retval;
@@ -1208,6 +1208,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (likely(!traced) && likely(!task_detached(p))) {
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
/*
* The resource counters for the group leader are in its
@@ -1256,6 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->coublock +=
task_io_get_oublock(p) +
sig->oublock + sig->coublock;
maxrss = max(sig->maxrss, sig->cmaxrss);
if (psig->cmaxrss < maxrss)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
spin_unlock_irq(&p->real_parent->sighand->siglock);
@@ -1477,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* then ->notask_error is 0 if @p is an eligible child,
* or another error from security_task_wait(), or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
int ptrace, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
int ret = eligible_child(wo, p);
if (!ret)
return ret;
ret = security_task_wait(p);
if (unlikely(ret < 0)) {
/*
* If we have not yet seen any eligible child,
@@ -1545,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
* Do not consider detached threads.
*/
if (!task_detached(p)) {
int ret = wait_consider_task(wo, tsk, 0, p);
int ret = wait_consider_task(wo, 0, p);
if (ret)
return ret;
}
@@ -1559,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
struct task_struct *p;
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, tsk, 1, p);
int ret = wait_consider_task(wo, 1, p);
if (ret)
return ret;
}
@@ -1567,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
/*
 * Wake function installed on the wait_chldexit queue entry of a waiter.
 * @key is the task the wakeup is about.  The waiter is only woken (via
 * default_wake_function()) when that task passes eligible_pid() and,
 * if __WNOTHREAD was requested, its ->parent is the task stored in
 * wait->private.  Returns 0 when the wakeup is filtered out.
 */
static int child_wait_callback(wait_queue_t *wait, unsigned mode,
			       int sync, void *key)
{
	struct wait_opts *opts = container_of(wait, struct wait_opts,
					      child_wait);
	struct task_struct *child = key;

	if (eligible_pid(opts, child) &&
	    (!(opts->wo_flags & __WNOTHREAD) ||
	     wait->private == child->parent))
		return default_wake_function(wait, mode, sync, key);

	return 0;
}
/*
 * Wake TASK_INTERRUPTIBLE waiters on @parent's signal->wait_chldexit
 * queue, passing @p as the wakeup key so that wake functions on the
 * queue can inspect which task the wakeup is for.
 */
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
TASK_INTERRUPTIBLE, 1, p);
}
static long do_wait(struct wait_opts *wo)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int retval;
trace_sched_process_wait(wo->wo_pid);
add_wait_queue(&current->signal->wait_chldexit,&wait);
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
wo->child_wait.private = current;
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
/*
* If there is nothing that can match our criteria just get out.
@@ -1616,32 +1644,7 @@ notask:
}
end:
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit,&wait);
if (wo->wo_info) {
struct siginfo __user *infop = wo->wo_info;
if (retval > 0)
retval = 0;
else {
/*
* For a WNOHANG return, clear out all the fields
* we would set so the user can easily tell the
* difference.
*/
if (!retval)
retval = put_user(0, &infop->si_signo);
if (!retval)
retval = put_user(0, &infop->si_errno);
if (!retval)
retval = put_user(0, &infop->si_code);
if (!retval)
retval = put_user(0, &infop->si_pid);
if (!retval)
retval = put_user(0, &infop->si_uid);
if (!retval)
retval = put_user(0, &infop->si_status);
}
}
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
@@ -1686,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
wo.wo_stat = NULL;
wo.wo_rusage = ru;
ret = do_wait(&wo);
if (ret > 0) {
ret = 0;
} else if (infop) {
/*
* For a WNOHANG return, clear out all the fields
* we would set so the user can easily tell the
* difference.
*/
if (!ret)
ret = put_user(0, &infop->si_signo);
if (!ret)
ret = put_user(0, &infop->si_errno);
if (!ret)
ret = put_user(0, &infop->si_code);
if (!ret)
ret = put_user(0, &infop->si_pid);
if (!ret)
ret = put_user(0, &infop->si_uid);
if (!ret)
ret = put_user(0, &infop->si_status);
}
put_pid(pid);
/* avoid REGPARM breakage on x86: */

View File

@@ -49,6 +49,7 @@
#include <linux/ftrace.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
@@ -61,7 +62,8 @@
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
/*
 * Adjust the NR_KERNEL_STACK counter of the zone that contains the
 * page backing @ti by @account (callers pass 1 on allocation and -1
 * on free).
 */
static void account_kernel_stack(struct thread_info *ti, int account)
{
	struct page *stack_page = virt_to_page(ti);

	mod_zone_page_state(page_zone(stack_page), NR_KERNEL_STACK, account);
}
void free_task(struct task_struct *tsk)
{
prop_local_destroy_single(&tsk->dirties);
account_kernel_stack(tsk->stack, -1);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
@@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
account_kernel_stack(ti, 1);
return tsk;
out:
@@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
@@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
#include <linux/init_task.h>
/*
 * Initialise the AIO bookkeeping of a new mm: the ioctx_lock spinlock
 * and the ioctx_list head.  The body is empty when CONFIG_AIO is not
 * set.
 */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
spin_lock_init(&mm->ioctx_lock);
INIT_HLIST_HEAD(&mm->ioctx_list);
#endif
}
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
mm->flags = (current->mm) ?
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
mm->nr_ptes = 0;
set_mm_counter(mm, file_rss, 0);
set_mm_counter(mm, anon_rss, 0);
spin_lock_init(&mm->page_table_lock);
spin_lock_init(&mm->ioctx_lock);
INIT_HLIST_HEAD(&mm->ioctx_list);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
if (likely(!mm_alloc_pgd(mm))) {
@@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
ksm_exit(mm);
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
@@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
}
}
@@ -618,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
if (mm->binfmt && !try_module_get(mm->binfmt->module))
goto free_pt;
return mm;
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
mmput(mm);
fail_nomem:
@@ -788,10 +820,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
thread_group_cputime_init(sig);
/* Expiration times and increments. */
sig->it_virt_expires = cputime_zero;
sig->it_virt_incr = cputime_zero;
sig->it_prof_expires = cputime_zero;
sig->it_prof_incr = cputime_zero;
sig->it[CPUCLOCK_PROF].expires = cputime_zero;
sig->it[CPUCLOCK_PROF].incr = cputime_zero;
sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
/* Cached expiration times. */
sig->cputime_expires.prof_exp = cputime_zero;
@@ -849,6 +881,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
sig->maxrss = sig->cmaxrss = 0;
task_io_accounting_init(&sig->ioac);
sig->sum_sched_runtime = 0;
taskstats_tgid_init(sig);
@@ -863,6 +896,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sig->oom_adj = current->signal->oom_adj;
return 0;
}
@@ -958,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
@@ -999,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
if (p->binfmt && !try_module_get(p->binfmt->module))
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
@@ -1075,10 +1117,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->bts = NULL;
p->stack_start = stack_start;
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
retval = perf_counter_init_task(p);
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
@@ -1253,7 +1297,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
perf_counter_fork(p);
perf_event_fork(p);
return p;
bad_fork_free_pid:
@@ -1280,16 +1324,13 @@ bad_fork_cleanup_semundo:
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_policy:
perf_counter_free_task(p);
perf_event_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);

View File

@@ -89,36 +89,36 @@ struct futex_pi_state {
union futex_key key;
};
/*
* We use this hashed waitqueue instead of a normal wait_queue_t, so
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
*
* We use this hashed waitqueue, instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakeup is always to make the first condition true, then
* wake up q->waiter, then make the second condition true.
* the second.
*
* PI futexes are typically woken before they are removed from the hash list via
* the rt_mutex code. See unqueue_me_pi().
*/
struct futex_q {
struct plist_node list;
/* Waiter reference */
struct task_struct *task;
/* Which hash list lock to use: */
spinlock_t *lock_ptr;
/* Key which the futex is hashed on: */
union futex_key key;
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
/* rt_waiter storage for requeue_pi: */
struct rt_mutex_waiter *rt_waiter;
/* The expected requeue pi target futex key: */
union futex_key *requeue_pi_key;
/* Bitset for the optional bitmasked wakeup */
u32 bitset;
};
@@ -198,11 +198,12 @@ static void drop_futex_key_refs(union futex_key *key)
}
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -288,8 +289,8 @@ void put_futex_key(int fshared, union futex_key *key)
drop_futex_key_refs(key);
}
/*
* fault_in_user_writeable - fault in user address and verify RW access
/**
* fault_in_user_writeable() - Fault in user address and verify RW access
* @uaddr: pointer to faulting user space address
*
* Slow path to fixup the fault we just took in the atomic write
@@ -309,8 +310,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
/**
* futex_top_waiter() - Return the highest priority waiter on a futex
* @hb: the hash bucket the futex_q's reside in
* @key: the futex key (to distinguish it from other futex futex_q's)
* @hb: the hash bucket the futex_q's reside in
* @key: the futex key (to distinguish it from other futex futex_q's)
*
* Must be called with the hb lock held.
*/
@@ -588,7 +589,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
}
/**
* futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
* @uaddr: the pi futex user address
* @hb: the pi futex hash bucket
* @key: the futex key associated with uaddr and hb
@@ -1011,9 +1012,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
/**
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
* q: the futex_q
* key: the key of the requeue target futex
* hb: the hash_bucket of the requeue target futex
* @q: the futex_q
* @key: the key of the requeue target futex
* @hb: the hash_bucket of the requeue target futex
*
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
* target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1350,6 +1351,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
return hb;
}
/*
 * Release the hash bucket lock @hb->lock and then call
 * drop_futex_key_refs() on @q->key.
 */
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
drop_futex_key_refs(&q->key);
}
/**
* queue_me() - Enqueue the futex_q on the futex_hash_bucket
* @q: The futex_q to enqueue
* @hb: The destination hash bucket
*
* The hb->lock must be held by the caller, and is released here. A call to
* queue_me() is typically paired with exactly one call to unqueue_me(). The
* exceptions involve the PI related operations, which may use unqueue_me_pi()
* or nothing if the unqueue is done as part of the wake process and the unqueue
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
* an example).
*/
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -1373,19 +1393,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
spin_unlock(&hb->lock);
}
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
drop_futex_key_refs(&q->key);
}
/*
* queue_me and unqueue_me must be called as a pair, each
* exactly once. They are called with the hashed spinlock held.
/**
* unqueue_me() - Remove the futex_q from its futex_hash_bucket
* @q: The futex_q to unqueue
*
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
* Returns:
* 1 - if the futex_q was still queued (and we unqueued it)
* 0 - if the futex_q was already removed by the waking thread
*/
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
spinlock_t *lock_ptr;
@@ -1638,17 +1656,14 @@ out:
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout)
{
queue_me(q, hb);
/*
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
* faults, and we cannot just set TASK_INTERRUPTIBLE state when
* queueing ourselves into the futex hash. This code thus has to
* rely on the futex_wake() code removing us from hash when it
* wakes us up.
* The task state is guaranteed to be set before another task can
* wake it. set_current_state() is implemented using set_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
set_current_state(TASK_INTERRUPTIBLE);
queue_me(q, hb);
/* Arm the timer */
if (timeout) {
@@ -1658,8 +1673,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
}
/*
* !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
* If we have been removed from the hash list, then another task
* has tried to wake us, and we can skip the call to schedule().
*/
if (likely(!plist_node_empty(&q->list))) {
/*
@@ -2114,12 +2129,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initialyl wait on (non-pi)
* @uaddr: the futex we initially wait on (non-pi)
* @fshared: whether the futexes are shared (1) or not (0). They must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
* @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
@@ -2246,7 +2261,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
res = fixup_owner(uaddr2, fshared, &q, !ret);
/*
* If fixup_owner() returned an error, propagate that. If it
* acquired the lock, clear our -ETIMEDOUT or -EINTR.
* acquired the lock, clear -ETIMEDOUT or -EINTR.
*/
if (res)
ret = (res < 0) ? res : 0;
@@ -2302,9 +2317,9 @@ out:
*/
/**
* sys_set_robust_list - set the robust-futex list head of a task
* @head: pointer to the list-head
* @len: length of the list-head, as userspace expects
* sys_set_robust_list() - Set the robust-futex list head of a task
* @head: pointer to the list-head
* @len: length of the list-head, as userspace expects
*/
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
size_t, len)
@@ -2323,10 +2338,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
}
/**
* sys_get_robust_list - get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
* @head_ptr: pointer to a list-head pointer, the kernel fills it in
* @len_ptr: pointer to a length field, the kernel fills in the header size
* sys_get_robust_list() - Get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
* @head_ptr: pointer to a list-head pointer, the kernel fills it in
* @len_ptr: pointer to a length field, the kernel fills in the header size
*/
SYSCALL_DEFINE3(get_robust_list, int, pid,
struct robust_list_head __user * __user *, head_ptr,

View File

@@ -34,7 +34,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
depends on S390 || X86 || (PPC && EXPERIMENTAL)
depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.

View File

@@ -48,36 +48,7 @@
#include <asm/uaccess.h>
/**
* ktime_get - get the monotonic time in ktime_t format
*
* returns the time in ktime_t format
*/
ktime_t ktime_get(void)
{
struct timespec now;
ktime_get_ts(&now);
return timespec_to_ktime(now);
}
EXPORT_SYMBOL_GPL(ktime_get);
/**
* ktime_get_real - get the real (wall-) time in ktime_t format
*
* returns the time in ktime_t format
*/
ktime_t ktime_get_real(void)
{
struct timespec now;
getnstimeofday(&now);
return timespec_to_ktime(now);
}
EXPORT_SYMBOL_GPL(ktime_get_real);
#include <trace/events/timer.h>
/*
* The timer bases:
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
}
};
/**
* ktime_get_ts - get the monotonic clock in timespec format
* @ts: pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
* in normalized timespec format in the variable pointed to by @ts.
*/
void ktime_get_ts(struct timespec *ts)
{
struct timespec tomono;
unsigned long seq;
do {
seq = read_seqbegin(&xtime_lock);
getnstimeofday(ts);
tomono = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
ts->tv_nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
/*
* Get the coarse grained time at the softirq based on xtime and
* wall_to_monotonic.
@@ -498,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif
/*
 * Report the initialization of @timer: first to the debug_hrtimer_init()
 * hook, then to the hrtimer_init tracepoint together with the clock id and
 * mode the timer is being set up with.
 */
static inline void
debug_init(struct hrtimer *timer, clockid_t clockid,
enum hrtimer_mode mode)
{
debug_hrtimer_init(timer);
trace_hrtimer_init(timer, clockid, mode);
}
/*
 * Report that @timer is being activated: calls the debug_hrtimer_activate()
 * hook and then fires the hrtimer_start tracepoint.
 */
static inline void debug_activate(struct hrtimer *timer)
{
debug_hrtimer_activate(timer);
trace_hrtimer_start(timer);
}
/*
 * Report that @timer is being deactivated: calls the
 * debug_hrtimer_deactivate() hook and then fires the hrtimer_cancel
 * tracepoint.
 */
static inline void debug_deactivate(struct hrtimer *timer)
{
debug_hrtimer_deactivate(timer);
trace_hrtimer_cancel(timer);
}
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -543,13 +509,14 @@ static inline int hrtimer_hres_active(void)
* next event
* Called with interrupts disabled and base->lock held
*/
static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
int i;
struct hrtimer_clock_base *base = cpu_base->clock_base;
ktime_t expires;
ktime_t expires, expires_next;
cpu_base->expires_next.tv64 = KTIME_MAX;
expires_next.tv64 = KTIME_MAX;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
struct hrtimer *timer;
@@ -565,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
*/
if (expires.tv64 < 0)
expires.tv64 = 0;
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
}
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
cpu_base->expires_next.tv64 = expires_next.tv64;
if (cpu_base->expires_next.tv64 != KTIME_MAX)
tick_program_event(cpu_base->expires_next, 1);
}
@@ -651,7 +623,7 @@ static void retrigger_next_event(void *arg)
base->clock_base[CLOCK_REALTIME].offset =
timespec_to_ktime(realtime_offset);
hrtimer_force_reprogram(base);
hrtimer_force_reprogram(base, 0);
spin_unlock(&base->lock);
}
@@ -764,7 +736,8 @@ static int hrtimer_switch_to_hres(void)
static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base,
int wakeup)
@@ -854,7 +827,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
struct hrtimer *entry;
int leftmost = 1;
debug_hrtimer_activate(timer);
debug_activate(timer);
/*
* Find the right place in the rbtree:
@@ -907,19 +880,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
unsigned long newstate, int reprogram)
{
if (timer->state & HRTIMER_STATE_ENQUEUED) {
/*
* Remove the timer from the rbtree and replace the
* first entry pointer if necessary.
*/
if (base->first == &timer->node) {
base->first = rb_next(&timer->node);
/* Reprogram the clock event device. if enabled */
if (reprogram && hrtimer_hres_active())
hrtimer_force_reprogram(base->cpu_base);
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
goto out;
/*
* Remove the timer from the rbtree and replace the first
* entry pointer if necessary.
*/
if (base->first == &timer->node) {
base->first = rb_next(&timer->node);
#ifdef CONFIG_HIGH_RES_TIMERS
/* Reprogram the clock event device. if enabled */
if (reprogram && hrtimer_hres_active()) {
ktime_t expires;
expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
if (base->cpu_base->expires_next.tv64 == expires.tv64)
hrtimer_force_reprogram(base->cpu_base, 1);
}
rb_erase(&timer->node, &base->active);
#endif
}
rb_erase(&timer->node, &base->active);
out:
timer->state = newstate;
}
@@ -940,7 +923,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
* reprogramming happens in the interrupt handler. This is a
* rare case and less expensive than a smp call.
*/
debug_hrtimer_deactivate(timer);
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -1155,7 +1138,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
clock_id = CLOCK_MONOTONIC;
timer->base = &cpu_base->clock_base[clock_id];
INIT_LIST_HEAD(&timer->cb_entry);
hrtimer_init_timer_hres(timer);
#ifdef CONFIG_TIMER_STATS
@@ -1174,7 +1156,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
debug_hrtimer_init(timer);
debug_init(timer, clock_id, mode);
__hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -1198,7 +1180,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
}
EXPORT_SYMBOL_GPL(hrtimer_get_res);
static void __run_hrtimer(struct hrtimer *timer)
static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
{
struct hrtimer_clock_base *base = timer->base;
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
@@ -1207,7 +1189,7 @@ static void __run_hrtimer(struct hrtimer *timer)
WARN_ON(!irqs_disabled());
debug_hrtimer_deactivate(timer);
debug_deactivate(timer);
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1218,7 +1200,9 @@ static void __run_hrtimer(struct hrtimer *timer)
* the timer base.
*/
spin_unlock(&cpu_base->lock);
trace_hrtimer_expire_entry(timer, now);
restart = fn(timer);
trace_hrtimer_expire_exit(timer);
spin_lock(&cpu_base->lock);
/*
@@ -1329,7 +1313,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
break;
}
__run_hrtimer(timer);
__run_hrtimer(timer, &basenow);
}
base++;
}
@@ -1451,7 +1435,7 @@ void hrtimer_run_queues(void)
hrtimer_get_expires_tv64(timer))
break;
__run_hrtimer(timer);
__run_hrtimer(timer, &base->softirq_time);
}
spin_unlock(&cpu_base->lock);
}
@@ -1628,7 +1612,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
while ((node = rb_first(&old_base->active))) {
timer = rb_entry(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_hrtimer_deactivate(timer);
debug_deactivate(timer);
/*
* Mark it as STATE_MIGRATE not INACTIVE otherwise the

View File

@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
* Process updating of timeout sysctl
*/
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto out;

View File

@@ -12,6 +12,7 @@
#include <linux/time.h>
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
#include <asm/uaccess.h>
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
return ktime_to_timeval(rem);
}
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
struct itimerval *const value)
{
cputime_t cval, cinterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
spin_lock_irq(&tsk->sighand->siglock);
cval = it->expires;
cinterval = it->incr;
if (!cputime_eq(cval, cputime_zero)) {
struct task_cputime cputime;
cputime_t t;
thread_group_cputimer(tsk, &cputime);
if (clock_id == CPUCLOCK_PROF)
t = cputime_add(cputime.utime, cputime.stime);
else
/* CPUCLOCK_VIRT */
t = cputime.utime;
if (cputime_le(cval, t))
/* about to fire */
cval = cputime_one_jiffy;
else
cval = cputime_sub(cval, t);
}
spin_unlock_irq(&tsk->sighand->siglock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
}
int do_getitimer(int which, struct itimerval *value)
{
struct task_struct *tsk = current;
cputime_t cinterval, cval;
switch (which) {
case ITIMER_REAL:
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_virt_expires;
cinterval = tsk->signal->it_virt_incr;
if (!cputime_eq(cval, cputime_zero)) {
struct task_cputime cputime;
cputime_t utime;
thread_group_cputimer(tsk, &cputime);
utime = cputime.utime;
if (cputime_le(cval, utime)) { /* about to fire */
cval = jiffies_to_cputime(1);
} else {
cval = cputime_sub(cval, utime);
}
}
spin_unlock_irq(&tsk->sighand->siglock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
break;
case ITIMER_PROF:
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_prof_expires;
cinterval = tsk->signal->it_prof_incr;
if (!cputime_eq(cval, cputime_zero)) {
struct task_cputime times;
cputime_t ptime;
thread_group_cputimer(tsk, &times);
ptime = cputime_add(times.utime, times.stime);
if (cputime_le(cval, ptime)) { /* about to fire */
cval = jiffies_to_cputime(1);
} else {
cval = cputime_sub(cval, ptime);
}
}
spin_unlock_irq(&tsk->sighand->siglock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
break;
default:
return(-EINVAL);
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
struct signal_struct *sig =
container_of(timer, struct signal_struct, real_timer);
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
}
/*
 * Return by how many nanoseconds the cputime value @ct exceeds @real_ns,
 * or 0 when @ct converted to nanoseconds is not larger than @real_ns.
 */
static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
{
	struct timespec as_ts;
	s64 ct_ns;

	cputime_to_timespec(ct, &as_ts);
	ct_ns = timespec_to_ns(&as_ts);

	if (ct_ns <= real_ns)
		return 0;

	return ct_ns - real_ns;
}
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
const struct itimerval *const value,
struct itimerval *const ovalue)
{
cputime_t cval, nval, cinterval, ninterval;
s64 ns_ninterval, ns_nval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
nval = timeval_to_cputime(&value->it_value);
ns_nval = timeval_to_ns(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
ns_ninterval = timeval_to_ns(&value->it_interval);
it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
it->error = cputime_sub_ns(nval, ns_nval);
spin_lock_irq(&tsk->sighand->siglock);
cval = it->expires;
cinterval = it->incr;
if (!cputime_eq(cval, cputime_zero) ||
!cputime_eq(nval, cputime_zero)) {
if (cputime_gt(nval, cputime_zero))
nval = cputime_add(nval, cputime_one_jiffy);
set_process_cpu_timer(tsk, clock_id, &nval, &cval);
}
it->expires = nval;
it->incr = ninterval;
trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
}
}
/*
* Returns true if the timeval is in canonical form
*/
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
struct task_struct *tsk = current;
struct hrtimer *timer;
ktime_t expires;
cputime_t cval, cinterval, nval, ninterval;
/*
* Validate the timevals in value.
@@ -171,51 +221,14 @@ again:
} else
tsk->signal->it_real_incr.tv64 = 0;
trace_itimer_state(ITIMER_REAL, value, 0);
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
nval = timeval_to_cputime(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_virt_expires;
cinterval = tsk->signal->it_virt_incr;
if (!cputime_eq(cval, cputime_zero) ||
!cputime_eq(nval, cputime_zero)) {
if (cputime_gt(nval, cputime_zero))
nval = cputime_add(nval,
jiffies_to_cputime(1));
set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
&nval, &cval);
}
tsk->signal->it_virt_expires = nval;
tsk->signal->it_virt_incr = ninterval;
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
}
set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
break;
case ITIMER_PROF:
nval = timeval_to_cputime(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_prof_expires;
cinterval = tsk->signal->it_prof_incr;
if (!cputime_eq(cval, cputime_zero) ||
!cputime_eq(nval, cputime_zero)) {
if (cputime_gt(nval, cputime_zero))
nval = cputime_add(nval,
jiffies_to_cputime(1));
set_process_cpu_timer(tsk, CPUCLOCK_PROF,
&nval, &cval);
}
tsk->signal->it_prof_expires = nval;
tsk->signal->it_prof_incr = ninterval;
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
}
set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
break;
default:
return -EINVAL;

View File

@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
static inline int is_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
return in_gate_area_no_task(addr);
}

View File

@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
* writer, you don't need extra locking to use these functions.
*/
unsigned int __kfifo_put(struct kfifo *fifo,
unsigned char *buffer, unsigned int len)
const unsigned char *buffer, unsigned int len)
{
unsigned int l;

View File

@@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
return 0;
}
static struct seq_operations kprobes_seq_ops = {
static const struct seq_operations kprobes_seq_ops = {
.start = kprobe_seq_start,
.next = kprobe_seq_next,
.stop = kprobe_seq_stop,

View File

@@ -578,6 +578,9 @@ static int static_obj(void *obj)
if ((addr >= start) && (addr < end))
return 1;
if (arch_is_kernel_data(addr))
return 1;
#ifdef CONFIG_SMP
/*
* percpu var?

View File

@@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations lockstat_ops = {
static const struct seq_operations lockstat_ops = {
.start = ls_start,
.next = ls_next,
.stop = ls_stop,

View File

@@ -1,930 +0,0 @@
/*
* Copyright (C) 2007 Mathieu Desnoyers
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/marker.h>
#include <linux/err.h>
#include <linux/slab.h>
extern struct marker __start___markers[];
extern struct marker __stop___markers[];
/* Set to 1 to enable marker debug output */
static const int marker_debug;
/*
* markers_mutex nests inside module_mutex. Markers mutex protects the builtin
* and module markers and the hash table.
*/
static DEFINE_MUTEX(markers_mutex);
/*
* Marker hash table, containing the active markers.
* Protected by markers_mutex.
*/
#define MARKER_HASH_BITS 6
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
static struct hlist_head marker_table[MARKER_TABLE_SIZE];
/*
* Note about RCU :
* It is used to make sure every handler has finished using its private data
* between two consecutive operations (add or remove) on a given marker. It is
* also used to delay the free of multiple probes array until a quiescent state
* is reached.
* marker entries modifications are protected by the markers_mutex.
*/
/*
 * One named marker in marker_table, together with the probe(s) currently
 * connected to it.
 */
struct marker_entry {
/* Link in the marker_table hash bucket. */
struct hlist_node hlist;
/*
 * Format string; NULL, a pointer into name[] (set by add_marker()), or a
 * kstrdup()ed copy (set by marker_set_format(), see format_allocated).
 */
char *format;
/* Probe wrapper */
void (*call)(const struct marker *mdata, void *call_private, ...);
/* The probe in use while ptype == 0; __mark_empty_function if none. */
struct marker_probe_closure single;
/* Probe array in use while ptype == 1; ends with a NULL func element. */
struct marker_probe_closure *multi;
int refcount; /* Number of times armed. 0 if disarmed. */
/* RCU callback head; free_old_closure() maps it back to the entry. */
struct rcu_head rcu;
/* Pointer that free_old_closure() passes to kfree(). */
void *oldptr;
/* Cleared by free_old_closure(); remove_marker() waits while it is set. */
int rcu_pending;
/* 0: single probe (use .single), 1: multiple probes (use .multi). */
unsigned char ptype:1;
/* Set when format was kstrdup()ed and must be kfree()d separately. */
unsigned char format_allocated:1;
char name[0]; /* Contains name'\0'format'\0' */
};
/**
* __mark_empty_function - Empty probe callback
* @probe_private: probe private data
* @call_private: call site private data
* @fmt: format string
* @args: pointer to the variable argument list
*
* Empty callback provided as a probe to the markers. By providing this to a
* disabled marker, we make sure the execution flow is always valid even
* though the function pointer change and the marker enabling are two distinct
* operations that modify the execution flow of preemptible code.
*/
notrace void __mark_empty_function(void *probe_private, void *call_private,
const char *fmt, va_list *args)
{
/* Intentionally empty: placeholder probe, all arguments are ignored. */
}
EXPORT_SYMBOL_GPL(__mark_empty_function);
/*
* marker_probe_cb Callback that prepares the variable argument list for probes.
* @mdata: pointer of type struct marker
* @call_private: caller site private data
* @...: Variable argument list.
*
* Since we do not use "typical" pointer based RCU in the 1 argument case, we
* need to put a full smp_rmb() in this branch. This is why we do not use
* rcu_dereference() for the pointer read.
*/
notrace void marker_probe_cb(const struct marker *mdata,
void *call_private, ...)
{
va_list args;
char ptype;
/*
* rcu_read_lock_sched does two things: it disables preemption to make
* sure the teardown of the callbacks can be done correctly when they
* are in modules, and it ensures RCU read coherency.
*/
rcu_read_lock_sched_notrace();
ptype = mdata->ptype;
if (likely(!ptype)) {
/* Single-probe case: call mdata->single directly. */
marker_probe_func *func;
/* Must read the ptype before ptr. They are not data dependent,
* so we put an explicit smp_rmb() here. */
smp_rmb();
func = mdata->single.func;
/* Must read the ptr before private data. They are not data
* dependent, so we put an explicit smp_rmb() here. */
smp_rmb();
va_start(args, call_private);
func(mdata->single.probe_private, call_private, mdata->format,
&args);
va_end(args);
} else {
/* Multi-probe case: walk the array until the NULL func element. */
struct marker_probe_closure *multi;
int i;
/*
* Read mdata->ptype before mdata->multi.
*/
smp_rmb();
multi = mdata->multi;
/*
* multi points to an array, therefore accessing the array
* depends on reading multi. However, even in this case,
* we must ensure that the pointer is read _before_ the array
* data. Same as rcu_dereference, but we need a full smp_rmb()
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
for (i = 0; multi[i].func; i++) {
/* The va_list is restarted for every probe. */
va_start(args, call_private);
multi[i].func(multi[i].probe_private, call_private,
mdata->format, &args);
va_end(args);
}
}
rcu_read_unlock_sched_notrace();
}
EXPORT_SYMBOL_GPL(marker_probe_cb);
/*
* marker_probe_cb_noarg Callback that does not prepare the variable argument list.
* @mdata: pointer of type struct marker
* @call_private: caller site private data
* @...: Variable argument list.
*
* Should be connected to markers "MARK_NOARGS".
*/
static notrace void marker_probe_cb_noarg(const struct marker *mdata,
void *call_private, ...)
{
/*
* Never va_start()ed: only the address of this object is handed to the
* probes.
*/
va_list args; /* not initialized */
char ptype;
/* Same read-side section as in marker_probe_cb(). */
rcu_read_lock_sched_notrace();
ptype = mdata->ptype;
if (likely(!ptype)) {
/* Single-probe case: call mdata->single directly. */
marker_probe_func *func;
/* Must read the ptype before ptr. They are not data dependent,
* so we put an explicit smp_rmb() here. */
smp_rmb();
func = mdata->single.func;
/* Must read the ptr before private data. They are not data
* dependent, so we put an explicit smp_rmb() here. */
smp_rmb();
func(mdata->single.probe_private, call_private, mdata->format,
&args);
} else {
/* Multi-probe case: walk the array until the NULL func element. */
struct marker_probe_closure *multi;
int i;
/*
* Read mdata->ptype before mdata->multi.
*/
smp_rmb();
multi = mdata->multi;
/*
* multi points to an array, therefore accessing the array
* depends on reading multi. However, even in this case,
* we must ensure that the pointer is read _before_ the array
* data. Same as rcu_dereference, but we need a full smp_rmb()
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
for (i = 0; multi[i].func; i++)
multi[i].func(multi[i].probe_private, call_private,
mdata->format, &args);
}
rcu_read_unlock_sched_notrace();
}
/*
 * RCU callback: frees the pointer saved in the entry's oldptr field and
 * then clears the entry's rcu_pending flag.
 * @head: the rcu member embedded in a struct marker_entry.
 */
static void free_old_closure(struct rcu_head *head)
{
/* Recover the marker_entry that embeds @head. */
struct marker_entry *entry = container_of(head,
struct marker_entry, rcu);
kfree(entry->oldptr);
/* Make sure we free the data before setting the pending flag to 0 */
smp_wmb();
entry->rcu_pending = 0;
}
static void debug_print_probes(struct marker_entry *entry)
{
int i;
if (!marker_debug)
return;
if (!entry->ptype) {
printk(KERN_DEBUG "Single probe : %p %p\n",
entry->single.func,
entry->single.probe_private);
} else {
for (i = 0; entry->multi[i].func; i++)
printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
entry->multi[i].func,
entry->multi[i].probe_private);
}
}
/*
 * Attach (@probe, @probe_private) to @entry.
 *
 * Returns ERR_PTR(-EBUSY) if the same pair is already attached,
 * ERR_PTR(-ENOMEM) if the new probe array cannot be allocated, NULL when no
 * previous multi array was replaced (0 -> 1 and 1 -> 2 probes), and
 * otherwise the previous multi array, which is no longer referenced by
 * @entry.
 */
static struct marker_probe_closure *
marker_entry_add_probe(struct marker_entry *entry,
		marker_probe_func *probe, void *probe_private)
{
	struct marker_probe_closure *prev, *grown;
	int count = 0;

	WARN_ON(!probe);

	debug_print_probes(entry);
	prev = entry->multi;

	if (entry->ptype) {
		/* (N -> N+1), (N != 0, 1) probes */
		while (prev[count].func) {
			if (prev[count].func == probe &&
			    prev[count].probe_private == probe_private)
				return ERR_PTR(-EBUSY);
			count++;
		}
	} else {
		if (entry->single.func == probe &&
		    entry->single.probe_private == probe_private)
			return ERR_PTR(-EBUSY);

		if (entry->single.func == __mark_empty_function) {
			/* 0 -> 1 probes */
			entry->single.func = probe;
			entry->single.probe_private = probe_private;
			entry->refcount = 1;
			entry->ptype = 0;
			debug_print_probes(entry);
			return NULL;
		}

		/* 1 -> 2 probes */
		count = 1;
		prev = NULL;
	}

	/* + 2 : one for new probe, one for NULL func */
	grown = kzalloc((count + 2) * sizeof(*grown), GFP_KERNEL);
	if (!grown)
		return ERR_PTR(-ENOMEM);

	/* Carry over the existing probe(s), then append the new one. */
	if (prev)
		memcpy(grown, prev, count * sizeof(*grown));
	else
		grown[0] = entry->single;
	grown[count].func = probe;
	grown[count].probe_private = probe_private;

	entry->refcount = count + 1;
	entry->multi = grown;
	entry->ptype = 1;
	debug_print_probes(entry);

	return prev;
}
/*
 * True when @c is selected for removal: its private data equals
 * @probe_private and, unless @probe is NULL (wildcard), its func is @probe.
 */
static inline int
marker_closure_matches(const struct marker_probe_closure *c,
		marker_probe_func *probe, void *probe_private)
{
	return (!probe || c->func == probe) &&
		c->probe_private == probe_private;
}

/*
 * Detach the probe(s) matching (@probe, @probe_private) from @entry.
 *
 * Returns NULL when @entry was in single-probe mode, ERR_PTR(-ENOMEM) when
 * a shrunken probe array cannot be allocated, and otherwise the multi
 * array that was in use on entry.
 */
static struct marker_probe_closure *
marker_entry_remove_probe(struct marker_entry *entry,
		marker_probe_func *probe, void *probe_private)
{
	struct marker_probe_closure *prev, *shrunk;
	int total = 0, doomed = 0, i;

	prev = entry->multi;

	debug_print_probes(entry);

	if (!entry->ptype) {
		/* 0 -> N is an error */
		WARN_ON(entry->single.func == __mark_empty_function);
		/* 1 -> 0 probes */
		WARN_ON(probe && entry->single.func != probe);
		WARN_ON(entry->single.probe_private != probe_private);
		entry->single.func = __mark_empty_function;
		entry->refcount = 0;
		entry->ptype = 0;
		debug_print_probes(entry);
		return NULL;
	}

	/* (N -> M), (N > 1, M >= 0) probes */
	for (total = 0; prev[total].func; total++) {
		if (marker_closure_matches(&prev[total], probe, probe_private))
			doomed++;
	}

	switch (total - doomed) {
	case 0:
		/* N -> 0, (N > 1) */
		entry->single.func = __mark_empty_function;
		entry->refcount = 0;
		entry->ptype = 0;
		break;
	case 1:
		/* N -> 1, (N > 1) */
		for (i = 0; prev[i].func; i++) {
			if (!marker_closure_matches(&prev[i], probe,
						    probe_private))
				entry->single = prev[i];
		}
		entry->refcount = 1;
		entry->ptype = 0;
		break;
	default: {
		int kept = 0;

		/* N -> M, (N > 1, M > 1) */
		/* + 1 for NULL */
		shrunk = kzalloc((total - doomed + 1) * sizeof(*shrunk),
				 GFP_KERNEL);
		if (!shrunk)
			return ERR_PTR(-ENOMEM);
		for (i = 0; prev[i].func; i++) {
			if (!marker_closure_matches(&prev[i], probe,
						    probe_private))
				shrunk[kept++] = prev[i];
		}
		entry->refcount = total - doomed;
		entry->ptype = 1;
		entry->multi = shrunk;
		break;
	}
	}

	debug_print_probes(entry);
	return prev;
}
/*
* Get marker if the marker is present in the marker hash table.
* Must be called with markers_mutex held.
* Returns NULL if not present.
*/
static struct marker_entry *get_marker(const char *name)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
u32 hash = jhash(name, strlen(name), 0);
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name))
return e;
}
return NULL;
}
/*
* Add the marker to the marker hash table. Must be called with markers_mutex
* held.
*/
static struct marker_entry *add_marker(const char *name, const char *format)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
size_t name_len = strlen(name) + 1;
size_t format_len = 0;
u32 hash = jhash(name, name_len-1, 0);
if (format)
format_len = strlen(format) + 1;
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
printk(KERN_NOTICE
"Marker %s busy\n", name);
return ERR_PTR(-EBUSY); /* Already there */
}
}
/*
* Using kmalloc here to allocate a variable length element. Could
* cause some memory fragmentation if overused.
*/
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
return ERR_PTR(-ENOMEM);
memcpy(&e->name[0], name, name_len);
if (format) {
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
if (strcmp(e->format, MARK_NOARGS) == 0)
e->call = marker_probe_cb_noarg;
else
e->call = marker_probe_cb;
trace_mark(core_marker_format, "name %s format %s",
e->name, e->format);
} else {
e->format = NULL;
e->call = marker_probe_cb;
}
e->single.func = __mark_empty_function;
e->single.probe_private = NULL;
e->multi = NULL;
e->ptype = 0;
e->format_allocated = 0;
e->refcount = 0;
e->rcu_pending = 0;
hlist_add_head(&e->hlist, head);
return e;
}
/*
* Remove the marker from the marker hash table. Must be called with
* markers_mutex held.
*/
static int remove_marker(const char *name)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
int found = 0;
size_t len = strlen(name) + 1;
u32 hash = jhash(name, len-1, 0);
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
found = 1;
break;
}
}
if (!found)
return -ENOENT;
if (e->single.func != __mark_empty_function)
return -EBUSY;
hlist_del(&e->hlist);
if (e->format_allocated)
kfree(e->format);
/* Make sure the call_rcu has been executed */
if (e->rcu_pending)
rcu_barrier_sched();
kfree(e);
return 0;
}
/*
* Set the mark_entry format to the format found in the element.
*/
static int marker_set_format(struct marker_entry *entry, const char *format)
{
entry->format = kstrdup(format, GFP_KERNEL);
if (!entry->format)
return -ENOMEM;
entry->format_allocated = 1;
trace_mark(core_marker_format, "name %s format %s",
entry->name, entry->format);
return 0;
}
/*
* Sets the probe callback corresponding to one marker.
*/
/*
 * Copy the probe state of @entry into the marker site @elem and, for
 * markers tied to a tracepoint (elem->tp_name set), register or unregister
 * elem->tp_cb when @active differs from elem->state.
 *
 * Returns -EPERM on a format mismatch, the marker_set_format() error if the
 * entry had no format yet and copying it failed, and otherwise 0 or the
 * result of the tracepoint (un)registration.
 */
static int set_marker(struct marker_entry *entry, struct marker *elem,
int active)
{
int ret = 0;
WARN_ON(strcmp(entry->name, elem->name) != 0);
if (entry->format) {
/* The entry already has a format: the site must use the same one. */
if (strcmp(entry->format, elem->format) != 0) {
printk(KERN_NOTICE
"Format mismatch for probe %s "
"(%s), marker (%s)\n",
entry->name,
entry->format,
elem->format);
return -EPERM;
}
} else {
/* First site seen for this entry: adopt the site's format. */
ret = marker_set_format(entry, elem->format);
if (ret)
return ret;
}
/*
* probe_cb setup (statically known) is done here. It is
* asynchronous with the rest of execution, therefore we only
* pass from a "safe" callback (with argument) to an "unsafe"
* callback (does not set arguments).
*/
elem->call = entry->call;
/*
* Sanity check :
* We only update the single probe private data when the ptr is
* set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
*/
WARN_ON(elem->single.func != __mark_empty_function
&& elem->single.probe_private != entry->single.probe_private
&& !elem->ptype);
elem->single.probe_private = entry->single.probe_private;
/*
* Make sure the private data is valid when we update the
* single probe ptr.
*/
smp_wmb();
elem->single.func = entry->single.func;
/*
* We also make sure that the new probe callbacks array is consistent
* before setting a pointer to it.
*/
rcu_assign_pointer(elem->multi, entry->multi);
/*
* Update the function or multi probe array pointer before setting the
* ptype.
*/
smp_wmb();
elem->ptype = entry->ptype;
/* Only touch the tracepoint when the activation state changes. */
if (elem->tp_name && (active ^ elem->state)) {
WARN_ON(!elem->tp_cb);
/*
* It is ok to directly call the probe registration because type
* checking has been done in the __trace_mark_tp() macro.
*/
if (active) {
/*
* try_module_get should always succeed because we hold
* lock_module() to get the tp_cb address.
*/
ret = try_module_get(__module_text_address(
(unsigned long)elem->tp_cb));
BUG_ON(!ret);
ret = tracepoint_probe_register_noupdate(
elem->tp_name,
elem->tp_cb);
} else {
ret = tracepoint_probe_unregister_noupdate(
elem->tp_name,
elem->tp_cb);
/*
* tracepoint_probe_update_all() must be called
* before the module containing tp_cb is unloaded.
*/
module_put(__module_text_address(
(unsigned long)elem->tp_cb));
}
}
elem->state = active;
return ret;
}
/*
* Disable a marker and its probe callback.
* Note: only waiting an RCU period after setting elem->call to the empty
* function ensures that the original callback is not used anymore. This is
* ensured by rcu_read_lock_sched around the call site.
*/
static void disable_marker(struct marker *elem)
{
int ret;
/* leave "call" as is. It is known statically. */
if (elem->tp_name && elem->state) {
WARN_ON(!elem->tp_cb);
/*
* It is ok to directly call the probe registration because type
* checking has been done in the __trace_mark_tp() macro.
*/
ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
elem->tp_cb);
WARN_ON(ret);
/*
* tracepoint_probe_update_all() must be called
* before the module containing tp_cb is unloaded.
*/
module_put(__module_text_address((unsigned long)elem->tp_cb));
}
elem->state = 0;
elem->single.func = __mark_empty_function;
/* Update the function before setting the ptype */
smp_wmb();
elem->ptype = 0; /* single probe */
/*
* Leave the private data and id there, because removal is racy and
* should be done only after an RCU period. These are never used until
* the next initialization anyway.
*/
}
/**
 * marker_update_probe_range - Update a probe range
 * @begin: first marker of the range
 * @end: one past the last marker of the range
 *
 * Walks every marker in [@begin, @end) with markers_mutex held. A marker
 * that has a matching entry in the marker hash table is (re)armed from that
 * entry, active only if the entry has a non-zero refcount; a marker without
 * an entry is disabled.
 */
void marker_update_probe_range(struct marker *begin,
	struct marker *end)
{
	struct marker *cur = begin;

	mutex_lock(&markers_mutex);
	while (cur < end) {
		struct marker_entry *found = get_marker(cur->name);

		if (!found) {
			disable_marker(cur);
		} else {
			/* The result is deliberately ignored: keep going. */
			set_marker(found, cur, !!found->refcount);
		}
		cur++;
	}
	mutex_unlock(&markers_mutex);
}
/*
 * Update probes, removing the faulty probes.
 *
 * Internal callback only changed before the first probe is connected to it.
 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
 * transitions. All other transitions will leave the old private data valid.
 * This makes the non-atomicity of the callback/private data updates valid.
 *
 * "special case" updates :
 * 0 -> 1 callback
 * 1 -> 0 callback
 * 1 -> 2 callbacks
 * 2 -> 1 callbacks
 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
 * Side effect : marker_set_format may delete the marker entry (creating a
 * replacement).
 *
 * Must be called without markers_mutex held: marker_update_probe_range()
 * takes that mutex itself.
 */
static void marker_update_probes(void)
{
/* Core kernel markers */
marker_update_probe_range(__start___markers, __stop___markers);
/* Markers in modules. */
module_update_markers();
/*
 * The marker updates above use the *_noupdate tracepoint calls;
 * presumably this is the step that makes those changes take effect.
 */
tracepoint_probe_update_all();
}
/**
 * marker_probe_register - Connect a probe to a marker
 * @name: marker name
 * @format: format string
 * @probe: probe handler
 * @probe_private: probe private data
 *
 * private data must be a valid allocated memory address, or NULL.
 * Returns 0 if ok, error value on error. In particular -EPERM is returned
 * when @format differs from the format already recorded for the marker.
 * The probe address must at least be aligned on the architecture pointer size.
 */
int marker_probe_register(const char *name, const char *format,
marker_probe_func *probe, void *probe_private)
{
struct marker_entry *entry;
int ret = 0;
struct marker_probe_closure *old;
mutex_lock(&markers_mutex);
/* Find the entry for this name, creating it on first registration. */
entry = get_marker(name);
if (!entry) {
entry = add_marker(name, format);
if (IS_ERR(entry))
ret = PTR_ERR(entry);
} else if (format) {
/* An existing entry must agree with the requested format. */
if (!entry->format)
ret = marker_set_format(entry, format);
else if (strcmp(entry->format, format))
ret = -EPERM;
}
if (ret)
goto end;
/*
 * If we detect that a call_rcu is pending for this marker,
 * make sure it's executed now.
 */
if (entry->rcu_pending)
rcu_barrier_sched();
old = marker_entry_add_probe(entry, probe, probe_private);
if (IS_ERR(old)) {
ret = PTR_ERR(old);
goto end;
}
/*
 * Drop the mutex around the update: marker_update_probe_range()
 * acquires markers_mutex on its own.
 */
mutex_unlock(&markers_mutex);
marker_update_probes();
mutex_lock(&markers_mutex);
/* The entry must be looked up again after the mutex was released. */
entry = get_marker(name);
/*
 * NOTE(review): on this path "old" is not handed to free_old_closure;
 * looks like it may leak if the entry vanished meanwhile - confirm.
 */
if (!entry)
goto end;
if (entry->rcu_pending)
rcu_barrier_sched();
/* Queue the previous closure data for release via free_old_closure. */
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
call_rcu_sched(&entry->rcu, free_old_closure);
end:
mutex_unlock(&markers_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(marker_probe_register);
/**
 * marker_probe_unregister - Disconnect a probe from a marker
 * @name: marker name
 * @probe: probe function pointer
 * @probe_private: probe private data
 *
 * Returns 0 on success, or -ENOENT when no marker entry named @name is found
 * (either before or after the probes have been updated).
 * We do not need to call a synchronize_sched to make sure the probes have
 * finished running before doing a module unload, because the module unload
 * itself uses stop_machine(), which ensures that every preempt disabled
 * section has finished.
 */
int marker_probe_unregister(const char *name,
marker_probe_func *probe, void *probe_private)
{
struct marker_entry *entry;
struct marker_probe_closure *old;
int ret = -ENOENT;
mutex_lock(&markers_mutex);
entry = get_marker(name);
if (!entry)
goto end;
/* Let a pending RCU callback for this entry run first. */
if (entry->rcu_pending)
rcu_barrier_sched();
/*
 * NOTE(review): unlike the add path in marker_probe_register(), the
 * result is not checked with IS_ERR() - confirm it cannot be an error.
 */
old = marker_entry_remove_probe(entry, probe, probe_private);
/*
 * Drop the mutex around the update: marker_update_probe_range()
 * acquires markers_mutex on its own.
 */
mutex_unlock(&markers_mutex);
marker_update_probes();
mutex_lock(&markers_mutex);
/* The entry must be looked up again after the mutex was released. */
entry = get_marker(name);
if (!entry)
goto end;
if (entry->rcu_pending)
rcu_barrier_sched();
/* Queue the previous closure data for release via free_old_closure. */
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
call_rcu_sched(&entry->rcu, free_old_closure);
remove_marker(name); /* Ignore busy error message */
ret = 0;
end:
mutex_unlock(&markers_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(marker_probe_unregister);
/*
 * Find the first marker entry that has @probe registered with
 * @probe_private as its private data.
 *
 * Every bucket of marker_table is scanned. For a single-probe entry the
 * (func, probe_private) pair stored in entry->single is compared; for a
 * multi-probe entry the closure array, which ends at the first element whose
 * func is NULL, is searched.
 *
 * Returns the matching entry, or NULL if there is none.
 */
static struct marker_entry *
get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
{
	struct marker_entry *entry;
	unsigned int i;
	struct hlist_head *head;
	struct hlist_node *node;

	for (i = 0; i < MARKER_TABLE_SIZE; i++) {
		head = &marker_table[i];
		hlist_for_each_entry(entry, node, head, hlist) {
			if (!entry->ptype) {
				if (entry->single.func == probe
				    && entry->single.probe_private
				       == probe_private)
					return entry;
			} else {
				struct marker_probe_closure *closure;
				/*
				 * Dedicated index: the closure walk must not
				 * touch "i", which is the bucket index of the
				 * enclosing loop.
				 */
				unsigned int j;

				closure = entry->multi;
				for (j = 0; closure[j].func; j++) {
					if (closure[j].func == probe &&
					    closure[j].probe_private
					    == probe_private)
						return entry;
				}
			}
		}
	}
	return NULL;
}
/**
 * marker_probe_unregister_private_data - Disconnect a probe from a marker
 * @probe: probe function
 * @probe_private: probe private data
 *
 * Unregister a probe by providing the registered private data.
 * Only removes the first marker found in hash table.
 * Return 0 on success or error value (-ENOENT when no entry matches
 * @probe/@probe_private on the first lookup).
 * We do not need to call a synchronize_sched to make sure the probes have
 * finished running before doing a module unload, because the module unload
 * itself uses stop_machine(), which ensures that every preempt disabled
 * section has finished.
 */
int marker_probe_unregister_private_data(marker_probe_func *probe,
void *probe_private)
{
struct marker_entry *entry;
int ret = 0;
struct marker_probe_closure *old;
mutex_lock(&markers_mutex);
entry = get_marker_from_private_data(probe, probe_private);
if (!entry) {
ret = -ENOENT;
goto end;
}
/* Let a pending RCU callback for this entry run first. */
if (entry->rcu_pending)
rcu_barrier_sched();
/* Note: the probe function passed down here is NULL, not @probe. */
old = marker_entry_remove_probe(entry, NULL, probe_private);
/*
 * Drop the mutex around the update: marker_update_probe_range()
 * acquires markers_mutex on its own.
 */
mutex_unlock(&markers_mutex);
marker_update_probes();
mutex_lock(&markers_mutex);
/*
 * NOTE(review): this second lookup uses the same (probe, private data)
 * pair after the probe has been removed from the entry above, so it
 * presumably no longer matches; in that case "old" is not queued for
 * freeing and remove_marker() is skipped, while 0 is still returned.
 * Confirm against marker_entry_remove_probe().
 */
entry = get_marker_from_private_data(probe, probe_private);
if (!entry)
goto end;
if (entry->rcu_pending)
rcu_barrier_sched();
/* Queue the previous closure data for release via free_old_closure. */
entry->oldptr = old;
entry->rcu_pending = 1;
/* write rcu_pending before calling the RCU callback */
smp_wmb();
call_rcu_sched(&entry->rcu, free_old_closure);
remove_marker(entry->name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
/**
 * marker_get_private_data - Get a marker's probe private data
 * @name: marker name
 * @probe: probe to match
 * @num: get the nth matching probe's private data
 *
 * Looks up the entry called @name and returns the private data pointer of
 * the @num-th (counting from 0) probe equal to @probe, or ERR_PTR(-ENOENT)
 * when the entry or such a probe does not exist.
 * The private data pointer should _only_ be dereferenced if the caller is the
 * owner of the data, or its content could vanish. This is mostly used to
 * confirm that a caller is the owner of a registered probe.
 */
void *marker_get_private_data(const char *name, marker_probe_func *probe,
		int num)
{
	struct hlist_head *bucket;
	struct hlist_node *pos;
	struct marker_entry *entry;
	struct marker_probe_closure *cl;
	u32 hash = jhash(name, strlen(name), 0);
	int seen = 0;

	bucket = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
	hlist_for_each_entry(entry, pos, bucket, hlist) {
		if (strcmp(name, entry->name))
			continue;

		/* Only the first entry with this name is ever examined. */
		if (!entry->ptype) {
			/* Single probe: only index 0 can possibly match. */
			if (num == 0 && entry->single.func == probe)
				return entry->single.probe_private;
			break;
		}

		/* Multi probe: the array ends at the first NULL func. */
		for (cl = entry->multi; cl->func; cl++) {
			if (cl->func == probe && seen++ == num)
				return cl->probe_private;
		}
		break;
	}
	return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(marker_get_private_data);
#ifdef CONFIG_MODULES
/*
 * Module state notifier: when a module is coming or going, bring the state
 * of the markers it contains in line with the marker hash table. Any other
 * state change is ignored. Always returns 0.
 */
int marker_module_notify(struct notifier_block *self,
			 unsigned long val, void *data)
{
	struct module *mod = data;

	if (val == MODULE_STATE_COMING || val == MODULE_STATE_GOING)
		marker_update_probe_range(mod->markers,
					  mod->markers + mod->num_markers);
	return 0;
}
/* Notifier block that routes module state changes to marker_module_notify(). */
struct notifier_block marker_module_nb = {
.notifier_call = marker_module_notify,
.priority = 0,
};
/*
 * Initcall: register marker_module_nb with the module notifier chain.
 * Returns the result of register_module_notifier().
 */
static int init_markers(void)
{
return register_module_notifier(&marker_module_nb);
}
__initcall(init_markers);
#endif /* CONFIG_MODULES */

View File

@@ -47,6 +47,7 @@
#include <linux/rculist.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
#include <linux/tracepoint.h>
@@ -1535,6 +1536,10 @@ static void free_module(struct module *mod)
/* Finally, free the core (containing the module structure) */
module_free(mod, mod->module_core);
#ifdef CONFIG_MPU
update_protections(current->mm);
#endif
}
void *__symbol_get(const char *symbol)
@@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
}
}
static void free_modinfo(struct module *mod)
{
struct module_attribute *attr;
int i;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (attr->free)
attr->free(mod);
}
}
#ifdef CONFIG_KALLSYMS
/* lookup symbol in given range of kernel_symbols */
@@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym,
return '?';
}
/*
 * Decide whether symbol @src is a "core" symbol.
 *
 * A symbol qualifies when it is defined (not SHN_UNDEF), its section index
 * is below @shnum, it has a name, and its section is SHF_ALLOC, not flagged
 * with INIT_OFFSET_MASK in sh_entsize and - unless CONFIG_KALLSYMS_ALL is
 * set - executable.
 */
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
			   unsigned int shnum)
{
	const Elf_Shdr *sec;

	if (src->st_shndx == SHN_UNDEF)
		return false;
	if (src->st_shndx >= shnum)
		return false;
	if (!src->st_name)
		return false;

	sec = &sechdrs[src->st_shndx];
	if (!(sec->sh_flags & SHF_ALLOC))
		return false;
#ifndef CONFIG_KALLSYMS_ALL
	if (!(sec->sh_flags & SHF_EXECINSTR))
		return false;
#endif
	return !(sec->sh_entsize & INIT_OFFSET_MASK);
}
/*
 * Lay out the symbol and string tables of a module being loaded.
 *
 * The full symbol and string sections are placed in the init part of the
 * module, and room for a reduced copy holding only the core symbols (see
 * is_core_symbol()) and the bytes of their names is appended to the core part.
 *
 * @strmap receives one set bit per string table byte that belongs to the
 * name of a core symbol (plus bit 0). *@pstroffs is set to the core offset
 * reserved for the reduced string table.
 *
 * Returns the core offset reserved for the reduced symbol table.
 */
static unsigned long layout_symtab(struct module *mod,
Elf_Shdr *sechdrs,
unsigned int symindex,
unsigned int strindex,
const Elf_Ehdr *hdr,
const char *secstrings,
unsigned long *pstroffs,
unsigned long *strmap)
{
unsigned long symoffs;
Elf_Shdr *symsect = sechdrs + symindex;
Elf_Shdr *strsect = sechdrs + strindex;
const Elf_Sym *src;
const char *strtab;
unsigned int i, nsrc, ndst;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
symindex) | INIT_OFFSET_MASK;
DEBUGP("\t%s\n", secstrings + symsect->sh_name);
src = (void *)hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
strtab = (void *)hdr + strsect->sh_offset;
/*
 * Count the core symbols (ndst starts at 1) and mark the bytes of their
 * names in strmap.
 * NOTE(review): src starts at symbol 0 while i starts at 1, so the
 * symbols examined are 0 .. nsrc-2 and the last one is never looked
 * at. Confirm this is intended and that any change stays in step with
 * the matching copy loop in add_kallsyms().
 */
for (ndst = i = 1; i < nsrc; ++i, ++src)
if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
unsigned int j = src->st_name;
/*
 * Mark name bytes up to and including the terminating NUL;
 * stop early at a byte that was already marked.
 */
while(!__test_and_set_bit(j, strmap) && strtab[j])
++j;
++ndst;
}
/* Append room for core symbols at end of core part. */
symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
strindex) | INIT_OFFSET_MASK;
DEBUGP("\t%s\n", secstrings + strsect->sh_name);
/* Append room for core symbols' strings at end of core part. */
*pstroffs = mod->core_size;
/* Byte 0 of the string table is always kept. */
__set_bit(0, strmap);
mod->core_size += bitmap_weight(strmap, strsect->sh_size);
return symoffs;
}
static void add_kallsyms(struct module *mod,
Elf_Shdr *sechdrs,
unsigned int shnum,
unsigned int symindex,
unsigned int strindex,
const char *secstrings)
unsigned long symoffs,
unsigned long stroffs,
const char *secstrings,
unsigned long *strmap)
{
unsigned int i;
unsigned int i, ndst;
const Elf_Sym *src;
Elf_Sym *dst;
char *s;
mod->symtab = (void *)sechdrs[symindex].sh_addr;
mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1873,13 +1969,44 @@ static void add_kallsyms(struct module *mod,
for (i = 0; i < mod->num_symtab; i++)
mod->symtab[i].st_info
= elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
mod->core_symtab = dst = mod->module_core + symoffs;
src = mod->symtab;
*dst = *src;
for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
if (!is_core_symbol(src, sechdrs, shnum))
continue;
dst[ndst] = *src;
dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
++ndst;
}
mod->core_num_syms = ndst;
mod->core_strtab = s = mod->module_core + stroffs;
for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
if (test_bit(i, strmap))
*++s = mod->strtab[i];
}
#else
static inline unsigned long layout_symtab(struct module *mod,
Elf_Shdr *sechdrs,
unsigned int symindex,
unsigned int strindex,
const Elf_Hdr *hdr,
const char *secstrings,
unsigned long *pstroffs,
unsigned long *strmap)
{
}
static inline void add_kallsyms(struct module *mod,
Elf_Shdr *sechdrs,
unsigned int shnum,
unsigned int symindex,
unsigned int strindex,
const char *secstrings)
unsigned long symoffs,
unsigned long stroffs,
const char *secstrings,
const unsigned long *strmap)
{
}
#endif /* CONFIG_KALLSYMS */
@@ -1954,6 +2081,9 @@ static noinline struct module *load_module(void __user *umod,
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
#ifdef CONFIG_KALLSYMS
unsigned long symoffs, stroffs, *strmap;
#endif
mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2035,11 +2165,6 @@ static noinline struct module *load_module(void __user *umod,
/* Don't keep modinfo and version sections. */
sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
#ifdef CONFIG_KALLSYMS
/* Keep symbol and string tables for decoding later. */
sechdrs[symindex].sh_flags |= SHF_ALLOC;
sechdrs[strindex].sh_flags |= SHF_ALLOC;
#endif
/* Check module struct version now, before we try to use module. */
if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2075,6 +2200,13 @@ static noinline struct module *load_module(void __user *umod,
goto free_hdr;
}
strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
* sizeof(long), GFP_KERNEL);
if (!strmap) {
err = -ENOMEM;
goto free_mod;
}
if (find_module(mod->name)) {
err = -EEXIST;
goto free_mod;
@@ -2104,6 +2236,8 @@ static noinline struct module *load_module(void __user *umod,
this is done generically; there doesn't appear to be any
special cases for the architectures. */
layout_sections(mod, hdr, sechdrs, secstrings);
symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
secstrings, &stroffs, strmap);
/* Do the allocs. */
ptr = module_alloc_update_bounds(mod->core_size);
@@ -2237,10 +2371,6 @@ static noinline struct module *load_module(void __user *umod,
sizeof(*mod->ctors), &mod->num_ctors);
#endif
#ifdef CONFIG_MARKERS
mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
sizeof(*mod->markers), &mod->num_markers);
#endif
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
"__tracepoints",
@@ -2312,7 +2442,10 @@ static noinline struct module *load_module(void __user *umod,
percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
sechdrs[pcpuindex].sh_size);
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
symoffs, stroffs, secstrings, strmap);
kfree(strmap);
strmap = NULL;
if (!mod->taints) {
struct _ddebug *debug;
@@ -2384,13 +2517,14 @@ static noinline struct module *load_module(void __user *umod,
synchronize_sched();
module_arch_cleanup(mod);
cleanup:
free_modinfo(mod);
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
free_unload:
module_unload_free(mod);
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
free_init:
percpu_modfree(mod->refptr);
free_init:
#endif
module_free(mod, mod->module_init);
free_core:
@@ -2401,6 +2535,7 @@ static noinline struct module *load_module(void __user *umod,
percpu_modfree(percpu);
free_mod:
kfree(args);
kfree(strmap);
free_hdr:
vfree(hdr);
return ERR_PTR(err);
@@ -2490,6 +2625,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
/* Drop initial reference. */
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
mod->num_symtab = mod->core_num_syms;
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
@@ -2951,27 +3091,12 @@ void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
struct marker *marker,
struct tracepoint *tp)
{
}
EXPORT_SYMBOL(module_layout);
#endif
#ifdef CONFIG_MARKERS
void module_update_markers(void)
{
struct module *mod;
mutex_lock(&module_mutex);
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
marker_update_probe_range(mod->markers,
mod->markers + mod->num_markers);
mutex_unlock(&module_mutex);
}
#endif
#ifdef CONFIG_TRACEPOINTS
void module_update_tracepoints(void)
{

View File

@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* (hence either you are in the same cgroup as task, or in an
* ancestor cgroup thereof)
*/
static int ns_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup, struct task_struct *task)
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
struct task_struct *task, bool threadgroup)
{
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
if (!cgroup_is_descendant(new_cgroup, c)) {
rcu_read_unlock();
return -EPERM;
}
}
rcu_read_unlock();
}
return 0;
}

View File

@@ -177,7 +177,7 @@ static const struct tnt tnts[] = {
* 'W' - Taint on warning.
* 'C' - modules from drivers/staging are loaded.
*
* The string is overwritten by the next call to print_taint().
* The string is overwritten by the next call to print_tainted().
*/
const char *print_tainted(void)
{

View File

@@ -23,6 +23,7 @@
#include <linux/device.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#if 0
#define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
}
for (i = 0; args[i]; i++) {
if (args[i] == ' ' && !in_quote)
if (isspace(args[i]) && !in_quote)
break;
if (equals == 0) {
if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
next = args + i;
/* Chew up trailing spaces. */
while (*next == ' ')
while (isspace(*next))
next++;
return next;
}
@@ -138,7 +139,7 @@ int parse_args(const char *name,
DEBUGP("Parsing ARGS: %s\n", args);
/* Chew leading spaces */
while (*args == ' ')
while (isspace(*args))
args++;
while (*args) {

File diff suppressed because it is too large Load Diff

5000
kernel/perf_event.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -40,7 +40,7 @@
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
static struct hlist_head *pid_hash;
static int pidhash_shift;
static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
void __init pidhash_init(void)
{
int i, pidhash_size;
unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
pidhash_shift = max(4, fls(megabytes * 4));
pidhash_shift = min(12, pidhash_shift);
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL,
&pidhash_shift, NULL, 4096);
pidhash_size = 1 << pidhash_shift;
printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
pidhash_size, pidhash_shift,
pidhash_size * sizeof(struct hlist_head));
pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
if (!pid_hash)
panic("Could not alloc pidhash!\n");
for (i = 0; i < pidhash_size; i++)
INIT_HLIST_HEAD(&pid_hash[i]);
}

View File

@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
{
if (!(flags & CLONE_NEWPID))
return get_pid_ns(old_ns);
if (flags & CLONE_THREAD)
if (flags & (CLONE_THREAD|CLONE_PARENT))
return ERR_PTR(-EINVAL);
return create_pid_namespace(old_ns);
}

View File

@@ -8,17 +8,18 @@
#include <linux/math64.h>
#include <asm/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
/*
* Called after updating RLIMIT_CPU to set timer expiration if necessary.
*/
void update_rlimit_cpu(unsigned long rlim_new)
{
cputime_t cputime;
cputime_t cputime = secs_to_cputime(rlim_new);
struct signal_struct *const sig = current->signal;
cputime = secs_to_cputime(rlim_new);
if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
cputime_gt(current->signal->it_prof_expires, cputime)) {
if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
spin_lock_irq(&current->sighand->siglock);
set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
now);
}
/*
 * Non-zero when @expires is zero or lies after @new_exp.
 */
static inline int expires_gt(cputime_t expires, cputime_t new_exp)
{
	if (cputime_eq(expires, cputime_zero))
		return 1;
	return cputime_gt(expires, new_exp) ? 1 : 0;
}
/*
 * Non-zero when @expires is non-zero and not later than @new_exp.
 */
static inline int expires_le(cputime_t expires, cputime_t new_exp)
{
	if (cputime_eq(expires, cputime_zero))
		return 0;
	return cputime_le(expires, new_exp) ? 1 : 0;
}
/*
* Insert the timer on the appropriate list before any timers that
* expire later. This must be called with the tasklist_lock held
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
*/
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
union cpu_time_count *exp = &nt->expires;
switch (CPUCLOCK_WHICH(timer->it_clock)) {
default:
BUG();
case CPUCLOCK_PROF:
if (cputime_eq(p->cputime_expires.prof_exp,
cputime_zero) ||
cputime_gt(p->cputime_expires.prof_exp,
nt->expires.cpu))
p->cputime_expires.prof_exp =
nt->expires.cpu;
if (expires_gt(p->cputime_expires.prof_exp,
exp->cpu))
p->cputime_expires.prof_exp = exp->cpu;
break;
case CPUCLOCK_VIRT:
if (cputime_eq(p->cputime_expires.virt_exp,
cputime_zero) ||
cputime_gt(p->cputime_expires.virt_exp,
nt->expires.cpu))
p->cputime_expires.virt_exp =
nt->expires.cpu;
if (expires_gt(p->cputime_expires.virt_exp,
exp->cpu))
p->cputime_expires.virt_exp = exp->cpu;
break;
case CPUCLOCK_SCHED:
if (p->cputime_expires.sched_exp == 0 ||
p->cputime_expires.sched_exp >
nt->expires.sched)
p->cputime_expires.sched_exp > exp->sched)
p->cputime_expires.sched_exp =
nt->expires.sched;
exp->sched;
break;
}
} else {
struct signal_struct *const sig = p->signal;
union cpu_time_count *exp = &timer->it.cpu.expires;
/*
* For a process timer, set the cached expiration time.
*/
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
default:
BUG();
case CPUCLOCK_VIRT:
if (!cputime_eq(p->signal->it_virt_expires,
cputime_zero) &&
cputime_lt(p->signal->it_virt_expires,
timer->it.cpu.expires.cpu))
if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
exp->cpu))
break;
p->signal->cputime_expires.virt_exp =
timer->it.cpu.expires.cpu;
sig->cputime_expires.virt_exp = exp->cpu;
break;
case CPUCLOCK_PROF:
if (!cputime_eq(p->signal->it_prof_expires,
cputime_zero) &&
cputime_lt(p->signal->it_prof_expires,
timer->it.cpu.expires.cpu))
if (expires_le(sig->it[CPUCLOCK_PROF].expires,
exp->cpu))
break;
i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
i = sig->rlim[RLIMIT_CPU].rlim_cur;
if (i != RLIM_INFINITY &&
i <= cputime_to_secs(timer->it.cpu.expires.cpu))
i <= cputime_to_secs(exp->cpu))
break;
p->signal->cputime_expires.prof_exp =
timer->it.cpu.expires.cpu;
sig->cputime_expires.prof_exp = exp->cpu;
break;
case CPUCLOCK_SCHED:
p->signal->cputime_expires.sched_exp =
timer->it.cpu.expires.sched;
sig->cputime_expires.sched_exp = exp->sched;
break;
}
}
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
spin_unlock_irqrestore(&cputimer->lock, flags);
}
static u32 onecputick;
/*
 * Check one process-wide CPU interval timer.
 *
 * Nothing happens when @it has a zero expiry. If @cur_time has reached the
 * expiry, the timer is either reloaded from its increment (taking back one
 * jiffy whenever the accumulated error reaches onecputick) or cleared, the
 * expiry is traced and @signo is sent to the thread group of @tsk. Finally
 * *@expires is lowered to the timer's new expiry if that one is non-zero and
 * *@expires is zero or later.
 */
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
			     cputime_t *expires, cputime_t cur_time, int signo)
{
	if (cputime_eq(it->expires, cputime_zero))
		return;

	if (cputime_ge(cur_time, it->expires)) {
		int which = (signo == SIGPROF) ? ITIMER_PROF : ITIMER_VIRTUAL;

		if (cputime_eq(it->incr, cputime_zero)) {
			/* One-shot timer: stays cleared after firing. */
			it->expires = cputime_zero;
		} else {
			it->expires = cputime_add(it->expires, it->incr);
			it->error += it->incr_error;
			if (it->error >= onecputick) {
				it->expires = cputime_sub(it->expires,
							  cputime_one_jiffy);
				it->error -= onecputick;
			}
		}

		trace_itimer_expire(which, tsk->signal->leader_pid, cur_time);
		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
	}

	if (cputime_eq(it->expires, cputime_zero))
		return;
	if (cputime_eq(*expires, cputime_zero) ||
	    cputime_lt(it->expires, *expires))
		*expires = it->expires;
}
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
* Don't sample the current process CPU clocks if there are no timers.
*/
if (list_empty(&timers[CPUCLOCK_PROF]) &&
cputime_eq(sig->it_prof_expires, cputime_zero) &&
cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
list_empty(&timers[CPUCLOCK_VIRT]) &&
cputime_eq(sig->it_virt_expires, cputime_zero) &&
cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
list_empty(&timers[CPUCLOCK_SCHED])) {
stop_process_timers(tsk);
return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
/*
* Check for the special case process timers.
*/
if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
if (cputime_ge(ptime, sig->it_prof_expires)) {
/* ITIMER_PROF fires and reloads. */
sig->it_prof_expires = sig->it_prof_incr;
if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
sig->it_prof_expires = cputime_add(
sig->it_prof_expires, ptime);
}
__group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
}
if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
(cputime_eq(prof_expires, cputime_zero) ||
cputime_lt(sig->it_prof_expires, prof_expires))) {
prof_expires = sig->it_prof_expires;
}
}
if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
if (cputime_ge(utime, sig->it_virt_expires)) {
/* ITIMER_VIRTUAL fires and reloads. */
sig->it_virt_expires = sig->it_virt_incr;
if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
sig->it_virt_expires = cputime_add(
sig->it_virt_expires, utime);
}
__group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
}
if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
(cputime_eq(virt_expires, cputime_zero) ||
cputime_lt(sig->it_virt_expires, virt_expires))) {
virt_expires = sig->it_virt_expires;
}
}
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
if (!cputime_eq(*oldval, cputime_zero)) {
if (cputime_le(*oldval, now.cpu)) {
/* Just about to fire. */
*oldval = jiffies_to_cputime(1);
*oldval = cputime_one_jiffy;
} else {
*oldval = cputime_sub(*oldval, now.cpu);
}
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
.nsleep = thread_cpu_nsleep,
.nsleep_restart = thread_cpu_nsleep_restart,
};
struct timespec ts;
register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
cputime_to_timespec(cputime_one_jiffy, &ts);
onecputick = ts.tv_nsec;
WARN_ON(ts.tv_sec != 0);
return 0;
}
__initcall(init_posix_cpu_timers);

View File

@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
return 0;
}
/*
 * clock_get handler: store the value of current_kernel_time() in *tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
{
*tp = current_kernel_time();
return 0;
}
/*
 * clock_get handler: store the value of get_monotonic_coarse() in *tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec *tp)
{
*tp = get_monotonic_coarse();
return 0;
}
/*
 * clock_getres handler: report KTIME_LOW_RES, converted to a timespec, as
 * the resolution. @which_clock is not used. Always returns 0.
 */
int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
{
*tp = ktime_to_timespec(KTIME_LOW_RES);
return 0;
}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};
struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_realtime_coarse,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};
struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_monotonic_coarse,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,

View File

@@ -14,56 +14,13 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
static int disable_vt_switch;
/*
 * Normally during a suspend, we allocate a new console and switch to it.
 * When we resume, we switch back to the original console. This switch
 * can be slow, so on systems where the framebuffer can handle restoration
 * of video registers anyway, there's little point in doing the console
 * switch. This function allows you to disable it by passing it '0'.
 *
 * @do_switch: zero disables the console switch, non-zero enables it.
 * The flag is updated with the console semaphore held.
 */
void pm_set_vt_switch(int do_switch)
{
acquire_console_sem();
disable_vt_switch = !do_switch;
release_console_sem();
}
EXPORT_SYMBOL(pm_set_vt_switch);
int pm_prepare_console(void)
{
acquire_console_sem();
if (disable_vt_switch) {
release_console_sem();
return 0;
}
orig_fgconsole = fg_console;
if (vc_allocate(SUSPEND_CONSOLE)) {
/* we can't have a free VC for now. Too bad,
* we don't want to mess the screen for now. */
release_console_sem();
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
}
if (set_console(SUSPEND_CONSOLE)) {
/*
* We're unable to switch to the SUSPEND_CONSOLE.
* Let the calling function know so it can decide
* what to do.
*/
release_console_sem();
return 1;
}
release_console_sem();
if (vt_waitactive(SUSPEND_CONSOLE)) {
pr_debug("Suspend: Can't switch VCs.");
return 1;
}
orig_kmsg = kmsg_redirect;
kmsg_redirect = SUSPEND_CONSOLE;
return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
void pm_restore_console(void)
{
acquire_console_sem();
if (disable_vt_switch) {
release_console_sem();
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
kmsg_redirect = orig_kmsg;
}
set_console(orig_fgconsole);
release_console_sem();
if (vt_waitactive(orig_fgconsole)) {
pr_debug("Resume: Can't switch VCs.");
return;
}
kmsg_redirect = orig_kmsg;
}
#endif

View File

@@ -9,6 +9,7 @@
#undef DEBUG
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/syscalls.h>

View File

@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
BUG_ON(!region);
} else
/* This allocation cannot fail */
region = alloc_bootmem_low(sizeof(struct nosave_region));
region = alloc_bootmem(sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);

View File

@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/genhd.h>

View File

@@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup);
#ifdef CONFIG_BOOT_PRINTK_DELAY
static unsigned int boot_delay; /* msecs delay after each printk during bootup */
static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */
static unsigned long long loops_per_msec; /* based on boot_delay */
static int __init boot_delay_setup(char *str)
{
unsigned long lpj;
unsigned long long loops_per_msec;
lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str)
if (boot_delay > 10 * 1000)
boot_delay = 0;
printk_delay_msec = loops_per_msec;
printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, printk_delay_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, loops_per_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
return 1;
}
__setup("boot_delay=", boot_delay_setup);
@@ -236,7 +234,7 @@ static void boot_delay_msec(void)
if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
return;
k = (unsigned long long)printk_delay_msec * boot_delay;
k = (unsigned long long)loops_per_msec * boot_delay;
timeout = jiffies + msecs_to_jiffies(boot_delay);
while (k) {
@@ -655,6 +653,20 @@ static int recursion_bug;
static int new_text_line = 1;
static char printk_buf[1024];
/* Milliseconds to busy-wait at the start of every vprintk(); 0 means no delay. */
int printk_delay_msec __read_mostly;

/*
 * Stall the caller for printk_delay_msec milliseconds, one millisecond at a
 * time, touching the NMI watchdog after each step. Does nothing when
 * printk_delay_msec is zero.
 */
static inline void printk_delay(void)
{
	int remaining;

	/* Common case: no delay configured. */
	if (!unlikely(printk_delay_msec))
		return;

	for (remaining = printk_delay_msec; remaining != 0; remaining--) {
		mdelay(1);
		touch_nmi_watchdog();
	}
}
asmlinkage int vprintk(const char *fmt, va_list args)
{
int printed_len = 0;
@@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
char *p;
boot_delay_msec();
printk_delay();
preempt_disable();
/* This stops the holder of console_sem just where we want him */

View File

@@ -442,48 +442,51 @@ void profile_tick(int type)
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <asm/uaccess.h>
static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
int len = cpumask_scnprintf(page, count, data);
if (count - len < 2)
return -EINVAL;
len += sprintf(page + len, "\n");
return len;
seq_cpumask(m, prof_cpu_mask);
seq_putc(m, '\n');
return 0;
}
static int prof_cpu_mask_write_proc(struct file *file,
const char __user *buffer, unsigned long count, void *data)
/*
* open() handler for the prof_cpu_mask proc file: passes
* prof_cpu_mask_proc_show() to single_open() with no private data (NULL)
* and returns whatever single_open() returns.
*/
static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, prof_cpu_mask_proc_show, NULL);
}
static ssize_t prof_cpu_mask_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
struct cpumask *mask = data;
unsigned long full_count = count, err;
cpumask_var_t new_value;
int err;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
if (!err) {
cpumask_copy(mask, new_value);
err = full_count;
cpumask_copy(prof_cpu_mask, new_value);
err = count;
}
free_cpumask_var(new_value);
return err;
}
/*
* File operations for the "prof_cpu_mask" proc entry. Read, seek and
* release go through the seq_file helpers that pair with the single_open()
* call in prof_cpu_mask_proc_open(); writes are handled by
* prof_cpu_mask_proc_write().
*/
static const struct file_operations prof_cpu_mask_proc_fops = {
.open = prof_cpu_mask_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
.write = prof_cpu_mask_proc_write,
};
void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
struct proc_dir_entry *entry;
/* create /proc/irq/prof_cpu_mask */
entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
if (!entry)
return;
entry->data = prof_cpu_mask;
entry->read_proc = prof_cpu_mask_read_proc;
entry->write_proc = prof_cpu_mask_write_proc;
proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
}
/*

View File

@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
* or self-reaping. Do notification now if it would have happened earlier.
* If it should reap itself, return true.
*
* If it's our own child, there is no notification to do.
* But if our normal children self-reap, then this child
* was prevented by ptrace and we must reap it now.
* If it's our own child, there is no notification to do. But if our normal
* children self-reap, then this child was prevented by ptrace and we must
* reap it now; in that case we must also wake up sub-threads sleeping in
* do_wait().
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
do_notify_parent(p, p->exit_signal);
else if (ignoring_children(tracer->sighand))
else if (ignoring_children(tracer->sighand)) {
__wake_up_parent(p, tracer);
p->exit_signal = -1;
}
}
if (task_detached(p)) {
/* Mark it as in the process of being reaped. */

View File

@@ -19,7 +19,7 @@
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
@@ -27,7 +27,7 @@
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* http://lse.sourceforge.net/locking/rcupdate.html
* http://lse.sourceforge.net/locking/rcupdate.html
*
*/
#include <linux/types.h>
@@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head)
complete(&rcu->completion);
}
#ifdef CONFIG_TREE_PREEMPT_RCU
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -87,7 +89,7 @@ void synchronize_rcu(void)
{
struct rcu_synchronize rcu;
if (rcu_blocking_is_gp())
if (!rcu_scheduler_active)
return;
init_completion(&rcu.completion);
@@ -98,6 +100,46 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
/**
* synchronize_sched - wait until an rcu-sched grace period has elapsed.
*
* Control will return to the caller some time after a full rcu-sched
* grace period has elapsed, in other words after all currently executing
* rcu-sched read-side critical sections have completed. These read-side
* critical sections are delimited by rcu_read_lock_sched() and
* rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
* local_irq_disable(), and so on may be used in place of
* rcu_read_lock_sched().
*
* This means that all preempt_disable code sequences, including NMI and
* hardware-interrupt handlers, in progress on entry will have completed
* before this primitive returns. However, this does not guarantee that
* softirq handlers will have completed, since in some kernels, these
* handlers can run in process context, and can block.
*
* This primitive provides the guarantees made by the (now removed)
* synchronize_kernel() API. In contrast, synchronize_rcu() only
* guarantees that rcu_read_lock() sections will have completed.
* In "classic RCU", these two guarantees happen to be one and
* the same, but can differ in realtime RCU implementations.
*/
void synchronize_sched(void)
{
/* Lives on this stack frame, so we must not return before the callback ran. */
struct rcu_synchronize rcu;
/*
* When rcu_blocking_is_gp() returns true, no callback is queued and we
* return immediately.
* NOTE(review): presumably it reports that blocking is itself a grace
* period -- confirm against its definition, which is not visible here.
*/
if (rcu_blocking_is_gp())
return;
init_completion(&rcu.completion);
/* Queue wakeme_after_rcu(), which completes rcu.completion when invoked. */
call_rcu_sched(&rcu.head, wakeme_after_rcu);
/* Block until that callback has run. */
wait_for_completion(&rcu.completion);
}
EXPORT_SYMBOL_GPL(synchronize_sched);
/**
* synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
*

View File

@@ -18,7 +18,7 @@
* Copyright (C) IBM Corporation, 2005, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
* Josh Triplett <josh@freedesktop.org>
* Josh Triplett <josh@freedesktop.org>
*
* See also: Documentation/RCU/torture.txt
*/
@@ -50,7 +50,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
"Josh Triplett <josh@freedesktop.org>");
"Josh Triplett <josh@freedesktop.org>");
static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
};
static LIST_HEAD(rcu_torture_freelist);
static struct rcu_torture *rcu_torture_current = NULL;
static long rcu_torture_current_version = 0;
static struct rcu_torture *rcu_torture_current;
static long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_timers = 0;
static long n_rcu_torture_timers;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
static int stutter_pause_test = 0;
static int stutter_pause_test;
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
@@ -267,7 +267,8 @@ struct rcu_torture_ops {
int irq_capable;
char *name;
};
static struct rcu_torture_ops *cur_ops = NULL;
static struct rcu_torture_ops *cur_ops;
/*
* Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
static void rcu_read_delay(struct rcu_random_state *rrsp)
{
long delay;
const long longdelay = 200;
const unsigned long shortdelay_us = 200;
const unsigned long longdelay_ms = 50;
/* We want there to be long-running readers, but not all the time. */
/* We want a short delay sometimes to make a reader delay the grace
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
if (!delay)
udelay(longdelay);
if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
}
static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = {
.sync = synchronize_rcu,
.cb_barrier = rcu_barrier,
.stats = NULL,
.irq_capable = 1,
.name = "rcu"
.irq_capable = 1,
.name = "rcu"
};
static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -638,7 +642,8 @@ rcu_torture_writer(void *arg)
do {
schedule_timeout_uninterruptible(1);
if ((rp = rcu_torture_alloc()) == NULL)
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
udelay(rcu_random(&rand) & 0x3ff);
@@ -1110,7 +1115,7 @@ rcu_torture_init(void)
printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
torture_type);
mutex_unlock(&fullstop_mutex);
return (-EINVAL);
return -EINVAL;
}
if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1161,7 +1166,7 @@ rcu_torture_init(void)
goto unwind;
}
fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
VERBOSE_PRINTK_ERRSTRING("out of memory");
firsterr = -ENOMEM;
@@ -1170,7 +1175,7 @@ rcu_torture_init(void)
for (i = 0; i < nfakewriters; i++) {
VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
"rcu_torture_fakewriter");
"rcu_torture_fakewriter");
if (IS_ERR(fakewriter_tasks[i])) {
firsterr = PTR_ERR(fakewriter_tasks[i]);
VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");

View File

@@ -25,7 +25,7 @@
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
* Documentation/RCU
*/
#include <linux/types.h>
#include <linux/kernel.h>
@@ -107,27 +107,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
*/
void rcu_sched_qs(int cpu)
{
unsigned long flags;
struct rcu_data *rdp;
local_irq_save(flags);
rdp = &per_cpu(rcu_sched_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
rcu_preempt_qs(cpu);
local_irq_restore(flags);
barrier();
rdp->passed_quiesc = 1;
rcu_preempt_note_context_switch(cpu);
}
void rcu_bh_qs(int cpu)
{
unsigned long flags;
struct rcu_data *rdp;
local_irq_save(flags);
rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
local_irq_restore(flags);
barrier();
rdp->passed_quiesc = 1;
}
#ifdef CONFIG_NO_HZ
@@ -605,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
{
struct rcu_data *rdp = rsp->rda[smp_processor_id()];
struct rcu_node *rnp = rcu_get_root(rsp);
struct rcu_node *rnp_cur;
struct rcu_node *rnp_end;
if (!cpu_needs_another_gp(rsp, rdp)) {
spin_unlock_irqrestore(&rnp->lock, flags);
@@ -615,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
record_gp_stall_check_time(rsp);
@@ -631,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
/* Special-case the common single-level case. */
if (NUM_RCU_NODES == 1) {
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
spin_unlock_irqrestore(&rnp->lock, flags);
return;
@@ -644,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
spin_lock(&rsp->onofflock); /* irqs already disabled. */
/*
* Set the quiescent-state-needed bits in all the non-leaf RCU
* nodes for all currently online CPUs. This operation relies
* on the layout of the hierarchy within the rsp->node[] array.
* Note that other CPUs will access only the leaves of the
* hierarchy, which still indicate that no grace period is in
* progress. In addition, we have excluded CPU-hotplug operations.
*
* We therefore do not need to hold any locks. Any required
* memory barriers will be supplied by the locks guarding the
* leaf rcu_nodes in the hierarchy.
*/
rnp_end = rsp->level[NUM_RCU_LVLS - 1];
for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
rnp_cur->qsmask = rnp_cur->qsmaskinit;
/*
* Now set up the leaf nodes. Here we must be careful. First,
* we need to hold the lock in order to exclude other CPUs, which
* might be contending for the leaf nodes' locks. Second, as
* soon as we initialize a given leaf node, its CPUs might run
* up the rest of the hierarchy. We must therefore acquire locks
* for each node that we touch during this stage. (But we still
* are excluding CPU-hotplug operations.)
* Set the quiescent-state-needed bits in all the rcu_node
* structures for all currently online CPUs in breadth-first
* order, starting from the root rcu_node structure. This
* operation relies on the layout of the hierarchy within the
* rsp->node[] array. Note that other CPUs will access only
* the leaves of the hierarchy, which still indicate that no
* grace period is in progress, at least until the corresponding
* leaf node has been initialized. In addition, we have excluded
* CPU-hotplug operations.
*
* Note that the grace period cannot complete until we finish
* the initialization process, as there will be at least one
* qsmask bit set in the root node until that time, namely the
* one corresponding to this CPU.
* one corresponding to this CPU, due to the fact that we have
* irqs disabled.
*/
rnp_end = &rsp->node[NUM_RCU_NODES];
rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
for (; rnp_cur < rnp_end; rnp_cur++) {
spin_lock(&rnp_cur->lock); /* irqs already disabled. */
rnp_cur->qsmask = rnp_cur->qsmaskinit;
spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
spin_lock(&rnp->lock); /* irqs already disabled. */
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
spin_unlock(&rnp->lock); /* irqs already disabled. */
}
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
@@ -722,6 +705,7 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
__releases(rnp->lock)
{
WARN_ON_ONCE(rsp->completed == rsp->gpnum);
rsp->completed = rsp->gpnum;
rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -739,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long flags)
__releases(rnp->lock)
{
struct rcu_node *rnp_c;
/* Walk up the rcu_node hierarchy. */
for (;;) {
if (!(rnp->qsmask & mask)) {
@@ -762,8 +748,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
break;
}
spin_unlock_irqrestore(&rnp->lock, flags);
rnp_c = rnp;
rnp = rnp->parent;
spin_lock_irqsave(&rnp->lock, flags);
WARN_ON_ONCE(rnp_c->qsmask);
}
/*
@@ -776,10 +764,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Record a quiescent state for the specified CPU, which must either be
* the current CPU or an offline CPU. The lastcomp argument is used to
* make sure we are still in the grace period of interest. We don't want
* to end the current grace period based on quiescent states detected in
* an earlier grace period!
* the current CPU. The lastcomp argument is used to make sure we are
* still in the grace period of interest. We don't want to end the current
* grace period based on quiescent states detected in an earlier grace
* period!
*/
static void
cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -814,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
rdp = rsp->rda[smp_processor_id()];
rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -872,7 +859,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_lock_irqsave(&rsp->onofflock, flags);
/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
rnp = rdp->mynode;
rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
do {
spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -881,7 +868,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_unlock(&rnp->lock); /* irqs remain disabled. */
break;
}
rcu_preempt_offline_tasks(rsp, rnp);
rcu_preempt_offline_tasks(rsp, rnp, rdp);
mask = rnp->grpmask;
spin_unlock(&rnp->lock); /* irqs remain disabled. */
rnp = rnp->parent;
@@ -890,9 +877,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
/* Being offline is a quiescent state, so go record it. */
cpu_quiet(cpu, rsp, rdp, lastcomp);
/*
* Move callbacks from the outgoing CPU to the running CPU.
* Note that the outgoing CPU is now quiescent, so it is now
@@ -1457,20 +1441,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
rnp = rnp->parent;
} while (rnp != NULL && !(rnp->qsmaskinit & mask));
spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
/*
* A new grace period might start here. If so, we will be part of
* it, and its gpnum will be greater than ours, so we will
* participate. It is also possible for the gpnum to have been
* incremented before this function was called, and the bitmasks
* to not be filled out until now, in which case we will also
* participate due to our gpnum being behind.
*/
/* Since it is coming online, the CPU is in a quiescent state. */
cpu_quiet(cpu, rsp, rdp, lastcomp);
local_irq_restore(flags);
spin_unlock_irqrestore(&rsp->onofflock, flags);
}
static void __cpuinit rcu_online_cpu(int cpu)

View File

@@ -142,7 +142,7 @@ struct rcu_data {
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
long qlen; /* # of queued callbacks */
long qlen; /* # of queued callbacks */
long blimit; /* Upper limit on a processed batch */
#ifdef CONFIG_NO_HZ

View File

@@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
*/
static void rcu_preempt_qs_record(int cpu)
static void rcu_preempt_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
rdp->passed_quiesc = 1;
rdp->passed_quiesc_completed = rdp->completed;
barrier();
rdp->passed_quiesc = 1;
}
/*
* We have entered the scheduler or are between softirqs in ksoftirqd.
* If we are in an RCU read-side critical section, we need to reflect
* that in the state of the rcu_node structure corresponding to this CPU.
* Caller must disable hardirqs.
* We have entered the scheduler, and the current task might soon be
* context-switched away from. If this task is in an RCU read-side
* critical section, we will no longer be able to rely on the CPU to
* record that fact, so we enqueue the task on the appropriate entry
* of the blocked_tasks[] array. The task will dequeue itself when
* it exits the outermost enclosing RCU read-side critical section.
* Therefore, the current grace period cannot be permitted to complete
* until the blocked_tasks[] entry indexed by the low-order bit of
* rnp->gpnum empties.
*
* Caller must disable preemption.
*/
static void rcu_preempt_qs(int cpu)
static void rcu_preempt_note_context_switch(int cpu)
{
struct task_struct *t = current;
unsigned long flags;
int phase;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu)
/* Possibly blocking in an RCU read-side critical section. */
rdp = rcu_preempt_state.rda[cpu];
rnp = rdp->mynode;
spin_lock(&rnp->lock);
spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
t->rcu_blocked_node = rnp;
@@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu)
* state for the current grace period), then as long
* as that task remains queued, the current grace period
* cannot end.
*
* But first, note that the current CPU must still be
* on line!
*/
phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
smp_mb(); /* Ensure later ctxt swtch seen after above. */
spin_unlock(&rnp->lock);
spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
rcu_preempt_qs_record(cpu);
t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
RCU_READ_UNLOCK_GOT_QS);
rcu_preempt_qs(cpu);
local_irq_save(flags);
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
local_irq_restore(flags);
}
/*
@@ -157,7 +171,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
special = t->rcu_read_unlock_special;
if (special & RCU_READ_UNLOCK_NEED_QS) {
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
rcu_preempt_qs(smp_processor_id());
}
/* Hardware IRQ handlers cannot block. */
@@ -177,10 +191,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
for (;;) {
rnp = t->rcu_blocked_node;
spin_lock(&rnp->lock);
spin_lock(&rnp->lock); /* irqs already disabled. */
if (rnp == t->rcu_blocked_node)
break;
spin_unlock(&rnp->lock);
spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
list_del_init(&t->rcu_node_entry);
@@ -194,9 +208,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
if (!empty && rnp->qsmask == 0 &&
list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
t->rcu_read_unlock_special &=
~(RCU_READ_UNLOCK_NEED_QS |
RCU_READ_UNLOCK_GOT_QS);
struct rcu_node *rnp_p;
if (rnp->parent == NULL) {
/* Only one rcu_node in the tree. */
cpu_quiet_msk_finish(&rcu_preempt_state, flags);
@@ -205,9 +218,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
/* Report up the rest of the hierarchy. */
mask = rnp->grpmask;
spin_unlock_irqrestore(&rnp->lock, flags);
rnp = rnp->parent;
spin_lock_irqsave(&rnp->lock, flags);
cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
rnp_p = rnp->parent;
spin_lock_irqsave(&rnp_p->lock, flags);
WARN_ON_ONCE(rnp->qsmask);
cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
return;
}
spin_unlock(&rnp->lock);
@@ -258,6 +272,19 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
* invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
* must be held by the caller.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
/* The blocked-reader list selected by the low bit of ->gpnum must be empty. */
WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
/* No quiescent-state bits may remain set in this node. */
WARN_ON_ONCE(rnp->qsmask);
}
/*
* Check for preempted RCU readers for the specified rcu_node structure.
* If the caller needs a reliable answer, it must hold the rcu_node's
@@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
* The caller must hold rnp->lock with irqs disabled.
*/
static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp)
struct rcu_node *rnp,
struct rcu_data *rdp)
{
int i;
struct list_head *lp;
@@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
WARN_ONCE(1, "Last CPU thought to be offlined?");
return; /* Shouldn't happen: at least one CPU online. */
}
WARN_ON_ONCE(rnp != rdp->mynode &&
(!list_empty(&rnp->blocked_tasks[0]) ||
!list_empty(&rnp->blocked_tasks[1])));
/*
* Move tasks up to root rcu_node. Rely on the fact that the
@@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu)
struct task_struct *t = current;
if (t->rcu_read_lock_nesting == 0) {
t->rcu_read_unlock_special &=
~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
rcu_preempt_qs_record(cpu);
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
rcu_preempt_qs(cpu);
return;
}
if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
rcu_preempt_qs_record(cpu);
t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
} else if (!(t->rcu_read_unlock_special &
RCU_READ_UNLOCK_NEED_QS)) {
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
}
if (per_cpu(rcu_preempt_data, cpu).qs_pending)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
/*
@@ -434,7 +457,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* Because preemptable RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
static void rcu_preempt_qs(int cpu)
static void rcu_preempt_note_context_switch(int cpu)
{
}
@@ -450,6 +473,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
* Because there is no preemptable RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
/* Warn (once) if any ->qsmask bit is still set in this node. */
WARN_ON_ONCE(rnp->qsmask);
}
/*
* Because preemptable RCU does not exist, there are never any preempted
* RCU readers.
@@ -466,7 +499,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
* tasks that were blocked within RCU read-side critical sections.
*/
static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp)
struct rcu_node *rnp,
struct rcu_data *rdp)
{
}

View File

@@ -20,7 +20,7 @@
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
* Documentation/RCU
*
*/
#include <linux/types.h>

View File

@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/*
* vm_ops for relay file mappings.
*/
static struct vm_operations_struct relay_file_mmap_ops = {
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
.close = relay_file_mmap_close,
};

View File

@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
counter->limit = RESOURCE_MAX;
counter->soft_limit = RESOURCE_MAX;
counter->parent = parent;
}
@@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
}
int res_counter_charge(struct res_counter *counter, unsigned long val,
struct res_counter **limit_fail_at)
struct res_counter **limit_fail_at,
struct res_counter **soft_limit_fail_at)
{
int ret;
unsigned long flags;
struct res_counter *c, *u;
*limit_fail_at = NULL;
if (soft_limit_fail_at)
*soft_limit_fail_at = NULL;
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
ret = res_counter_charge_locked(c, val);
/*
* With soft limits, we return the highest ancestor
* that exceeds its soft limit
*/
if (soft_limit_fail_at &&
!res_counter_soft_limit_check_locked(c))
*soft_limit_fail_at = c;
spin_unlock(&c->lock);
if (ret < 0) {
*limit_fail_at = c;
@@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
counter->usage -= val;
}
void res_counter_uncharge(struct res_counter *counter, unsigned long val)
void res_counter_uncharge(struct res_counter *counter, unsigned long val,
bool *was_soft_limit_excess)
{
unsigned long flags;
struct res_counter *c;
@@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
if (was_soft_limit_excess)
*was_soft_limit_excess =
!res_counter_soft_limit_check_locked(c);
res_counter_uncharge_locked(c, val);
spin_unlock(&c->lock);
}
@@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member)
return &counter->limit;
case RES_FAILCNT:
return &counter->failcnt;
case RES_SOFT_LIMIT:
return &counter->soft_limit;
};
BUG();

View File

@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
* Finds the lowest memory resource that exists within [res->start, res->end);
* the caller must specify res->start, res->end, res->flags.
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
*/
static int find_next_system_ram(struct resource *res)
static int find_next_system_ram(struct resource *res, char *name)
{
resource_size_t start, end;
struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
/* system ram is just marked as IORESOURCE_MEM */
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
continue;
if (p->start > end) {
p = NULL;
break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
res->end = p->end;
return 0;
}
int
walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
int (*func)(unsigned long, unsigned long, void *))
/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
* Now, this function is only for "System RAM".
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
{
struct resource res;
unsigned long pfn, len;
u64 orig_end;
int ret = -1;
res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
while ((res.start < res.end) &&
(find_next_system_ram(&res, "System RAM") >= 0)) {
pfn = (unsigned long)(res.start >> PAGE_SHIFT);
len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
ret = (*func)(pfn, len, arg);

View File

@@ -39,7 +39,7 @@
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -119,8 +119,6 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#else
#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
return 1;
}
#endif
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
@@ -514,14 +505,6 @@ struct root_domain {
#ifdef CONFIG_SMP
struct cpupri cpupri;
#endif
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/*
* Preferred wake up cpu nominated by sched_mc balance that will be
* used when most cpus are idle in the system indicating overall very
* low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
*/
unsigned int sched_mc_preferred_wakeup_cpu;
#endif
};
/*
@@ -646,9 +629,10 @@ struct rq {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, sync);
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}
static inline int cpu_of(struct rq *rq)
@@ -697,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
int runqueue_is_locked(void)
int runqueue_is_locked(int cpu)
{
int cpu = get_cpu();
struct rq *rq = cpu_rq(cpu);
int ret;
ret = spin_is_locked(&rq->lock);
put_cpu();
return ret;
return spin_is_locked(&cpu_rq(cpu)->lock);
}
/*
@@ -1509,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data)
#endif
#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->load.weight;
}
/*
 * Low guess at the load of a CPU that tasks may be migrated away from,
 * weighted according to the scheduling class and "nice" value.
 *
 * Migration sources are deliberately under-estimated so that balancing
 * stays conservative: the smaller of the cpu_load[type-1] sample and the
 * current weighted load is reported.
 */
static unsigned long source_load(int cpu, int type)
{
	unsigned long now_load = weighted_cpuload(cpu);
	struct rq *rq = cpu_rq(cpu);

	/* Only consult the cpu_load[] sample when biasing is requested and on. */
	if (type != 0 && sched_feat(LB_BIAS))
		return min(rq->cpu_load[type-1], now_load);

	return now_load;
}
/*
 * High guess at the load of a CPU that tasks may be migrated to,
 * weighted according to the scheduling class and "nice" value.
 *
 * The larger of the cpu_load[type-1] sample and the current weighted
 * load is reported.
 */
static unsigned long target_load(int cpu, int type)
{
	unsigned long now_load = weighted_cpuload(cpu);
	struct rq *rq = cpu_rq(cpu);

	/* Only consult the cpu_load[] sample when biasing is requested and on. */
	if (type != 0 && sched_feat(LB_BIAS))
		return max(rq->cpu_load[type-1], now_load);

	return now_load;
}
/*
 * Group list head (sd->groups) of the sched_domain attached to @cpu's
 * runqueue, or NULL when no domain is attached.
 *
 * NOTE(review): rq->sd is read through rcu_dereference(); callers are
 * presumably inside an RCU read-side section -- confirm at call sites.
 */
static struct sched_group *group_of(int cpu)
{
	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);

	return sd ? sd->groups : NULL;
}
/*
 * cpu_power of the group returned by group_of(@cpu); falls back to
 * SCHED_LOAD_SCALE when the CPU has no group.
 */
static unsigned long power_of(int cpu)
{
	struct sched_group *sg = group_of(cpu);

	if (sg)
		return sg->cpu_power;

	return SCHED_LOAD_SCALE;
}
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1695,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
#ifdef CONFIG_PREEMPT
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
* way at the expense of forcing extra atomic operations in all
@@ -1959,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->load.weight;
}
/*
* Is this task likely cache-hot:
*/
@@ -2023,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
#endif
perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1, 1, NULL, 0);
}
p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2239,185 +2269,6 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
/*
 * Low guess at the load of a CPU that tasks may be migrated away from,
 * weighted according to the scheduling class and "nice" value.
 *
 * Migration sources are deliberately under-estimated so that balancing
 * stays conservative: the smaller of the cpu_load[type-1] sample and the
 * current weighted load is reported.
 */
static unsigned long source_load(int cpu, int type)
{
	unsigned long now_load = weighted_cpuload(cpu);
	struct rq *rq = cpu_rq(cpu);

	/* Only consult the cpu_load[] sample when biasing is requested and on. */
	if (type != 0 && sched_feat(LB_BIAS))
		return min(rq->cpu_load[type-1], now_load);

	return now_load;
}
/*
 * High guess at the load of a CPU that tasks may be migrated to,
 * weighted according to the scheduling class and "nice" value.
 *
 * The larger of the cpu_load[type-1] sample and the current weighted
 * load is reported.
 */
static unsigned long target_load(int cpu, int type)
{
	unsigned long now_load = weighted_cpuload(cpu);
	struct rq *rq = cpu_rq(cpu);

	/* Only consult the cpu_load[] sample when biasing is requested and on. */
	if (type != 0 && sched_feat(LB_BIAS))
		return max(rq->cpu_load[type-1], now_load);

	return now_load;
}
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*
* @sd:       domain whose groups are scanned (circular list from sd->groups)
* @p:        task being placed; groups sharing no CPU with p->cpus_allowed
*            are skipped
* @this_cpu: CPU used to identify the local group
*
* A group's load is the sum of its CPUs' loads (source_load() for the local
* group, target_load() for every other group, both at sd->forkexec_idx),
* scaled by SCHED_LOAD_SCALE / group->cpu_power.
*
* Returns the least loaded non-local group, or NULL when no such group was
* seen or when 100 * local load < imbalance * that group's load, where
* imbalance is 100 plus half of (sd->imbalance_pct - 100).
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
/* Only half of the domain's imbalance_pct margin is applied here. */
int imbalance = 100 + (sd->imbalance_pct-100)/2;
do {
unsigned long load, avg_load;
int local_group;
int i;
/*
* Skip over this group if it has no CPUs allowed. The 'continue' jumps
* to the while() condition, so 'group' still advances to the next entry.
*/
if (!cpumask_intersects(sched_group_cpus(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
avg_load = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
avg_load += load;
}
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
/* 'this' is recorded here but never read again in this function. */
this = group;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
} while (group = group->next, group != sd->groups);
/* No remote candidate, or the local group is not loaded enough to bother. */
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
/*
 * find_idlest_cpu - pick the least loaded CPU of @group that @p may run on.
 *
 * Only CPUs present in both the group's mask and p->cpus_allowed are
 * considered. Load is compared via weighted_cpuload(); when @this_cpu
 * ties with the lowest load seen so far, it replaces the earlier pick.
 *
 * Returns the chosen CPU, or -1 if no CPU was eligible.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	unsigned long lowest = ULONG_MAX;
	int best = -1;
	int cpu;

	/* Traverse only the allowed CPUs */
	for_each_cpu_and(cpu, sched_group_cpus(group), &p->cpus_allowed) {
		unsigned long load = weighted_cpuload(cpu);

		/* Heavier than the current best: not interesting. */
		if (load > lowest)
			continue;
		/* Equal load only wins for the CPU we were asked to prefer. */
		if (load == lowest && cpu != this_cpu)
			continue;

		lowest = load;
		best = cpu;
	}

	return best;
}
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
* SD_BALANCE_EXEC.
*
* Balance, ie. select the least loaded group.
*
* @cpu:  CPU the current task is running on; starting point of the search
* @flag: sched_domain flag bit a domain must carry to be balanced
*
* Returns the target CPU number, or the same CPU if no balancing is needed.
*
* preempt must be disabled.
*/
static int sched_balance_self(int cpu, int flag)
{
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;
/*
* Pick the starting domain: the last domain visited that has 'flag' set,
* not looking past the first power-savings domain.
*/
for_each_domain(cpu, tmp) {
/*
* If power savings logic is enabled for a domain, stop there.
*/
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
break;
if (tmp->flags & flag)
sd = tmp;
}
if (sd)
update_shares(sd);
/*
* Walk down from the starting domain. Each pass either descends to
* sd->child (nothing better found at this level) or moves to a new CPU
* and restarts from that CPU's domains.
*/
while (sd) {
struct sched_group *group;
int new_cpu, weight;
if (!(sd->flags & flag)) {
sd = sd->child;
continue;
}
group = find_idlest_group(sd, t, cpu);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
/* Span size of the domain just balanced; only smaller spans qualify next. */
weight = cpumask_weight(sched_domain_span(sd));
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= cpumask_weight(sched_domain_span(tmp)))
break;
if (tmp->flags & flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
return cpu;
}
#endif /* CONFIG_SMP */
/**
@@ -2455,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p,
*
* returns failure only if the task is already active.
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
static int try_to_wake_up(struct task_struct *p, unsigned int state,
int wake_flags)
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
struct rq *rq;
if (!sched_feat(SYNC_WAKEUPS))
sync = 0;
wake_flags &= ~WF_SYNC;
#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
struct sched_domain *sd;
this_cpu = raw_smp_processor_id();
cpu = task_cpu(p);
for_each_domain(this_cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
update_shares(sd);
break;
}
}
}
#endif
this_cpu = get_cpu();
smp_wmb();
rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
old_state = p->state;
if (!(old_state & state))
if (!(p->state & state))
goto out;
if (p->se.on_rq)
@@ -2493,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
cpu = task_cpu(p);
orig_cpu = cpu;
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP
if (unlikely(task_running(rq, p)))
goto out_activate;
cpu = p->sched_class->select_task_rq(p, sync);
if (cpu != orig_cpu) {
set_task_cpu(p, cpu);
task_rq_unlock(rq, &flags);
/* might preempt at this point */
rq = task_rq_lock(p, &flags);
old_state = p->state;
if (!(old_state & state))
goto out;
if (p->se.on_rq)
goto out_running;
/*
* In order to handle concurrent wakeups and release the rq->lock
* we put the task in TASK_WAKING state.
*
* First fix up the nr_uninterruptible count:
*/
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
p->state = TASK_WAKING;
task_rq_unlock(rq, &flags);
this_cpu = smp_processor_id();
cpu = task_cpu(p);
}
cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (cpu != orig_cpu)
set_task_cpu(p, cpu);
rq = task_rq_lock(p, &flags);
WARN_ON(p->state != TASK_WAKING);
cpu = task_cpu(p);
#ifdef CONFIG_SCHEDSTATS
schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
out_activate:
#endif /* CONFIG_SMP */
schedstat_inc(p, se.nr_wakeups);
if (sync)
if (wake_flags & WF_SYNC)
schedstat_inc(p, se.nr_wakeups_sync);
if (orig_cpu != cpu)
schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,7 +2400,7 @@ out_activate:
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -2571,6 +2409,7 @@ out_running:
#endif
out:
task_rq_unlock(rq, &flags);
put_cpu();
return success;
}
@@ -2613,6 +2452,7 @@ static void __sched_fork(struct task_struct *p)
p->se.avg_overlap = 0;
p->se.start_runtime = 0;
p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
p->se.avg_running = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
@@ -2674,11 +2514,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
__sched_fork(p);
#ifdef CONFIG_SMP
cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
set_task_cpu(p, cpu);
/*
* Make sure we do not leak PI boosting priority to the child.
*/
@@ -2709,6 +2544,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
#ifdef CONFIG_SMP
cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
#endif
set_task_cpu(p, cpu);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2754,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
inc_nr_running(rq);
}
trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, 0);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
@@ -2878,7 +2718,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
finish_arch_switch(prev);
perf_counter_task_sched_in(current, cpu_of(rq));
perf_event_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
fire_sched_in_preempt_notifiers(current);
@@ -3064,6 +2904,19 @@ unsigned long nr_iowait(void)
return sum;
}
unsigned long nr_iowait_cpu(void)
{
struct rq *this = this_rq();
return atomic_read(&this->nr_iowait);
}
unsigned long this_cpu_load(void)
{
struct rq *this = this_rq();
return this->cpu_load[0];
}
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
@@ -3263,7 +3116,7 @@ out:
void sched_exec(void)
{
int new_cpu, this_cpu = get_cpu();
new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
put_cpu();
if (new_cpu != this_cpu)
sched_migrate_task(current, new_cpu);
@@ -3683,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
*imbalance = sds->min_load_per_task;
sds->busiest = sds->group_min;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
group_first_cpu(sds->group_leader);
}
return 1;
}
@@ -3711,7 +3559,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
/*
* Generic frequency scale factor for cpu_power: always SCHED_LOAD_SCALE.
* update_cpu_power() calls this directly when sched_feat(ARCH_POWER) is
* off, and the weak arch_scale_freq_power() falls back to it.
* @sd and @cpu are not used by this implementation.
*/
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_LOAD_SCALE;
}
/*
* Frequency scale factor hook used by update_cpu_power() when
* sched_feat(ARCH_POWER) is set. Declared __weak so another definition
* can replace it (presumably architecture code -- confirm); this fallback
* simply returns default_scale_freq_power().
*/
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
return default_scale_freq_power(sd, cpu);
}
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = cpumask_weight(sched_domain_span(sd));
unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3580,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
/*
* SMT scale factor hook used by update_cpu_power() when
* sched_feat(ARCH_POWER) is set. Declared __weak so another definition
* can replace it (presumably architecture code -- confirm); this fallback
* simply returns default_scale_smt_power().
*/
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
return default_scale_smt_power(sd, cpu);
}
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3609,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
/* here we could scale based on cpufreq */
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
else
power *= default_scale_freq_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
power *= arch_scale_smt_power(sd, cpu);
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
else
power *= default_scale_smt_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
}
@@ -4161,26 +4034,6 @@ ret:
return NULL;
}
/*
 * Group list head (sd->groups) of the sched_domain attached to @cpu's
 * runqueue, or NULL when no domain is attached.
 *
 * NOTE(review): rq->sd is read through rcu_dereference(); callers are
 * presumably inside an RCU read-side section -- confirm at call sites.
 */
static struct sched_group *group_of(int cpu)
{
	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);

	return sd ? sd->groups : NULL;
}
/*
 * cpu_power of the group returned by group_of(@cpu); falls back to
 * SCHED_LOAD_SCALE when the CPU has no group.
 */
static unsigned long power_of(int cpu)
{
	struct sched_group *sg = group_of(cpu);

	if (sg)
		return sg->cpu_power;

	return SCHED_LOAD_SCALE;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
@@ -5239,17 +5092,16 @@ void account_idle_time(cputime_t cputime)
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
cputime_t one_jiffy = jiffies_to_cputime(1);
cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
if (user_tick)
account_user_time(p, one_jiffy, one_jiffy_scaled);
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
one_jiffy_scaled);
else
account_idle_time(one_jiffy);
account_idle_time(cputime_one_jiffy);
}
/*
@@ -5353,7 +5205,7 @@ void scheduler_tick(void)
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
perf_counter_task_tick(curr, cpu);
perf_event_task_tick(curr, cpu);
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
@@ -5465,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev)
#endif
}
static void put_prev_task(struct rq *rq, struct task_struct *prev)
static void put_prev_task(struct rq *rq, struct task_struct *p)
{
if (prev->state == TASK_RUNNING) {
u64 runtime = prev->se.sum_exec_runtime;
u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
runtime -= prev->se.prev_sum_exec_runtime;
runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
update_avg(&p->se.avg_running, runtime);
if (p->state == TASK_RUNNING) {
/*
* In order to avoid avg_overlap growing stale when we are
* indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
* correlates to the amount of cache footprint a task can
* build up.
*/
update_avg(&prev->se.avg_overlap, runtime);
runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
update_avg(&p->se.avg_overlap, runtime);
} else {
update_avg(&p->se.avg_running, 0);
}
prev->sched_class->put_prev_task(rq, prev);
p->sched_class->put_prev_task(rq, p);
}
/*
@@ -5567,7 +5421,7 @@ need_resched_nonpreemptible:
if (likely(prev != next)) {
sched_info_switch(prev, next);
perf_counter_task_sched_out(prev, next, cpu);
perf_event_task_sched_out(prev, next, cpu);
rq->nr_switches++;
rq->curr = next;
@@ -5716,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
#endif /* CONFIG_PREEMPT */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, sync);
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -5733,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function);
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int sync, void *key)
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, sync, key) &&
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
@@ -5801,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int sync = 1;
int wake_flags = WF_SYNC;
if (unlikely(!q))
return;
if (unlikely(!nr_exclusive))
sync = 0;
wake_flags = 0;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, sync, key);
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6977,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
if (retval)
goto out_unlock;
/*
* Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
* tasks that are on an otherwise idle runqueue:
*/
time_slice = 0;
if (p->policy == SCHED_RR) {
time_slice = DEF_TIMESLICE;
} else if (p->policy != SCHED_FIFO) {
struct sched_entity *se = &p->se;
unsigned long flags;
struct rq *rq;
time_slice = p->sched_class->get_rr_interval(p);
rq = task_rq_lock(p, &flags);
if (rq->cfs.load.weight)
time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
task_rq_unlock(rq, &flags);
}
read_unlock(&tasklist_lock);
jiffies_to_timespec(time_slice, &t);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -7844,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
/*
* Register at high priority so that task migration (migrate_all_tasks)
* happens before everything else. This has to be lower priority than
* the notifier in the perf_counter subsystem, though.
* the notifier in the perf_event subsystem, though.
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
@@ -8000,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd)
}
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_IDLE |
SD_WAKE_AFFINE |
SD_WAKE_BALANCE))
if (sd->flags & (SD_WAKE_AFFINE))
return 0;
return 1;
@@ -8019,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
return 0;
/* Does parent contain flags not in child? */
/* WAKE_BALANCE is a subset of WAKE_AFFINE */
if (cflags & SD_WAKE_AFFINE)
pflags &= ~SD_WAKE_BALANCE;
/* Flags needing groups don't count if only 1 group in parent */
if (parent->groups == parent->groups->next) {
pflags &= ~(SD_LOAD_BALANCE |
@@ -8708,10 +8541,10 @@ static void set_domain_attribute(struct sched_domain *sd,
request = attr->relax_domain_level;
if (request < sd->level) {
/* turn off idle balance on this domain */
sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
} else {
/* turn on idle balance on this domain */
sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}
@@ -9329,6 +9162,7 @@ void __init sched_init_smp(void)
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
#if defined(CONFIG_NUMA)
sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9360,7 +9194,6 @@ void __init sched_init_smp(void)
sched_init_granularity();
free_cpumask_var(non_isolated_cpus);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
init_sched_rt_class();
}
#else
@@ -9707,7 +9540,7 @@ void __init sched_init(void)
alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
perf_counter_init();
perf_event_init();
scheduler_running = 1;
}
@@ -10479,7 +10312,7 @@ static int sched_rt_global_constraints(void)
#endif /* CONFIG_RT_GROUP_SCHED */
int sched_rt_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
@@ -10490,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_constraints();
@@ -10544,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
}
static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk)
cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10555,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
if (tsk->sched_class != &fair_sched_class)
return -EINVAL;
#endif
return 0;
}
/*
 * Check whether @tsk may be attached to @cgrp and, when @threadgroup is
 * set, whether every entry on tsk->thread_group may be attached as well.
 *
 * Each task is vetted by cpu_cgroup_can_attach_task(). The thread list is
 * walked under rcu_read_lock().
 *
 * Returns 0 if all checked tasks are acceptable, otherwise the first
 * non-zero result from cpu_cgroup_can_attach_task().
 */
static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		      struct task_struct *tsk, bool threadgroup)
{
	struct task_struct *c;
	int retval;

	retval = cpu_cgroup_can_attach_task(cgrp, tsk);
	/* Done if the leader check failed or only one task was requested. */
	if (retval || !threadgroup)
		return retval;

	rcu_read_lock();
	list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
		retval = cpu_cgroup_can_attach_task(cgrp, c);
		if (retval)
			break;
	}
	rcu_read_unlock();

	/* Zero unless a thread was rejected above. */
	return retval;
}
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cont, struct task_struct *tsk)
struct cgroup *old_cont, struct task_struct *tsk,
bool threadgroup)
{
sched_move_task(tsk);
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
sched_move_task(c);
}
rcu_read_unlock();
}
}
#ifdef CONFIG_FAIR_GROUP_SCHED

View File

@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
__read_mostly int sched_clock_stable;
struct sched_clock_data {
/*
* Raw spinlock - this is a special case: this might be called
* from within instrumentation code so we dont want to do any
* instrumentation ourselves.
*/
raw_spinlock_t lock;
u64 tick_raw;
u64 tick_gtod;
u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
for_each_possible_cpu(cpu) {
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
scd->tick_raw = 0;
scd->tick_gtod = ktime_now;
scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
static u64 sched_clock_local(struct sched_clock_data *scd)
{
s64 delta = now - scd->tick_raw;
u64 clock, min_clock, max_clock;
u64 now, clock, old_clock, min_clock, max_clock;
s64 delta;
again:
now = sched_clock();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
old_clock = scd->clock;
/*
* scd->clock = clamp(scd->tick_gtod + delta,
* max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
*/
clock = scd->tick_gtod + delta;
min_clock = wrap_max(scd->tick_gtod, scd->clock);
max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
min_clock = wrap_max(scd->tick_gtod, old_clock);
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
scd->clock = clock;
if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
goto again;
return scd->clock;
return clock;
}
static void lock_double_clock(struct sched_clock_data *data1,
struct sched_clock_data *data2)
static u64 sched_clock_remote(struct sched_clock_data *scd)
{
if (data1 < data2) {
__raw_spin_lock(&data1->lock);
__raw_spin_lock(&data2->lock);
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
/*
* Use the opportunity that we have both locks
* taken to couple the two clocks: we take the
* larger time as the latest time for both
* runqueues. (this creates monotonic movement)
*/
if (likely((s64)(remote_clock - this_clock) < 0)) {
ptr = &scd->clock;
old_val = remote_clock;
val = this_clock;
} else {
__raw_spin_lock(&data2->lock);
__raw_spin_lock(&data1->lock);
/*
* Should be rare, but possible:
*/
ptr = &my_scd->clock;
old_val = this_clock;
val = remote_clock;
}
if (cmpxchg64(ptr, old_val, val) != old_val)
goto again;
return val;
}
u64 sched_clock_cpu(int cpu)
{
u64 now, clock, this_clock, remote_clock;
struct sched_clock_data *scd;
u64 clock;
WARN_ON_ONCE(!irqs_disabled());
if (sched_clock_stable)
return sched_clock();
scd = cpu_sdc(cpu);
/*
* Normally this is not called in NMI context - but if it is,
* trying to do any locking here is totally lethal.
*/
if (unlikely(in_nmi()))
return scd->clock;
if (unlikely(!sched_clock_running))
return 0ull;
WARN_ON_ONCE(!irqs_disabled());
now = sched_clock();
scd = cpu_sdc(cpu);
if (cpu != raw_smp_processor_id()) {
struct sched_clock_data *my_scd = this_scd();
lock_double_clock(scd, my_scd);
this_clock = __update_sched_clock(my_scd, now);
remote_clock = scd->clock;
/*
* Use the opportunity that we have both locks
* taken to couple the two clocks: we take the
* larger time as the latest time for both
* runqueues. (this creates monotonic movement)
*/
if (likely((s64)(remote_clock - this_clock) < 0)) {
clock = this_clock;
scd->clock = clock;
} else {
/*
* Should be rare, but possible:
*/
clock = remote_clock;
my_scd->clock = remote_clock;
}
__raw_spin_unlock(&my_scd->lock);
} else {
__raw_spin_lock(&scd->lock);
clock = __update_sched_clock(scd, now);
}
__raw_spin_unlock(&scd->lock);
if (cpu != smp_processor_id())
clock = sched_clock_remote(scd);
else
clock = sched_clock_local(scd);
return clock;
}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
now_gtod = ktime_to_ns(ktime_get());
now = sched_clock();
__raw_spin_lock(&scd->lock);
scd->tick_raw = now;
scd->tick_gtod = now_gtod;
__update_sched_clock(scd, now);
__raw_spin_unlock(&scd->lock);
sched_clock_local(scd);
}
/*

View File

@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.sum_exec_runtime);
PN(se.avg_overlap);
PN(se.avg_wakeup);
PN(se.avg_running);
nr_switches = p->nvcsw + p->nivcsw;

View File

@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
@@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
if (!initial) {
/* sleeps upto a single latency don't count. */
if (sched_feat(NEW_FAIR_SLEEPERS)) {
unsigned long thresh = sysctl_sched_latency;
/* sleeps up to a single latency don't count. */
if (!initial && sched_feat(FAIR_SLEEPERS)) {
unsigned long thresh = sysctl_sched_latency;
/*
* Convert the sleeper threshold into virtual time.
* SCHED_IDLE is a special sub-class. We care about
* fairness only relative to other SCHED_IDLE tasks,
* all of which have the same weight.
*/
if (sched_feat(NORMALIZED_SLEEPER) &&
(!entity_is_task(se) ||
task_of(se)->policy != SCHED_IDLE))
thresh = calc_delta_fair(thresh, se);
/*
* Convert the sleeper threshold into virtual time.
* SCHED_IDLE is a special sub-class. We care about
* fairness only relative to other SCHED_IDLE tasks,
* all of which have the same weight.
*/
if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
task_of(se)->policy != SCHED_IDLE))
thresh = calc_delta_fair(thresh, se);
vruntime -= thresh;
}
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS))
thresh >>= 1;
vruntime -= thresh;
}
/* ensure we never gain time by being placed backwards. */
@@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
if (!se || cfs_rq->last == se)
cfs_rq->last = NULL;
if (cfs_rq->next == se)
if (!se || cfs_rq->next == se)
cfs_rq->next = NULL;
}
@@ -1062,83 +1067,6 @@ static void yield_task_fair(struct rq *rq)
se->vruntime = rightmost->vruntime + 1;
}
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration,
* hence we need to mask them out (rq->rd->online)
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
/*
* @cpu: candidate cpu; returned unchanged when no better cpu is found.
* @p:   the task being woken.
*/
static int wake_idle(int cpu, struct task_struct *p)
{
struct sched_domain *sd;
int i;
unsigned int chosen_wakeup_cpu;
int this_cpu;
struct rq *task_rq = task_rq(p);
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
* are idle and this is not a kernel thread and this task's affinity
* allows it to be moved to preferred cpu, then just move!
*/
this_cpu = smp_processor_id();
chosen_wakeup_cpu =
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
idle_cpu(cpu) && idle_cpu(this_cpu) &&
p->mm && !(p->flags & PF_KTHREAD) &&
cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
return chosen_wakeup_cpu;
/*
* If it is idle, then it is the best cpu to run this task.
*
* This cpu is also the best, if it has more than one task already.
* Siblings must also be busy (in most cases) as they didn't already
* pick up the extra load from this cpu and hence we need not check
* sibling runqueue info. This will avoid the checks and cache miss
* penalties associated with that.
*/
if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
return cpu;
/*
* Walk the domains of @cpu. A domain is searched when it has
* SD_WAKE_IDLE set, or SD_WAKE_IDLE_FAR set and task_hot() reports
* the task as not hot there; the walk stops at the first domain
* that satisfies neither condition.
*/
for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq->clock, sd))) {
/* Only cpus in both the domain span and the task's allowed mask. */
for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) {
if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
/* Count an idle wakeup only if the task really changes cpu. */
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
}
return i;
}
}
} else {
break;
}
}
/* No suitable idle cpu found: fall back to the caller's choice. */
return cpu;
}
#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
}
#endif
#ifdef CONFIG_SMP
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
#endif
static int
wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
struct task_struct *curr = this_rq->curr;
struct task_group *tg;
unsigned long tl = this_load;
struct task_struct *curr = current;
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
unsigned int imbalance;
struct task_group *tg;
unsigned long weight;
int balanced;
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
p->se.avg_overlap > sysctl_sched_migration_cost))
sync = 0;
if (sync) {
if (sched_feat(SYNC_LESS) &&
(curr->se.avg_overlap > sysctl_sched_migration_cost ||
p->se.avg_overlap > sysctl_sched_migration_cost))
sync = 0;
} else {
if (sched_feat(SYNC_MORE) &&
(curr->se.avg_overlap < sysctl_sched_migration_cost &&
p->se.avg_overlap < sysctl_sched_migration_cost))
sync = 1;
}
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
tg = task_group(current);
weight = current->se.load.weight;
tl += effective_load(tg, this_cpu, -weight, -weight);
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
weight = p->se.load.weight;
imbalance = 100 + (sd->imbalance_pct - 100) / 2;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped tl to 0, we'll always have
* an imbalance, but there's really nothing you can do about that, so
* that's good too.
* due to the sync cause above having dropped this_load to 0, we'll
* always have an imbalance, but there's really nothing you can do
* about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
balanced = !tl ||
100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
balanced = !this_load ||
100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
@@ -1285,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
tl_per_task)) {
if (balanced ||
(this_load <= load &&
this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
schedstat_inc(this_sd, ttwu_move_affine);
schedstat_inc(sd, ttwu_move_affine);
schedstat_inc(p, se.nr_wakeups_affine);
return 1;
@@ -1300,65 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
return 0;
}
static int select_task_rq_fair(struct task_struct *p, int sync)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int load_idx)
{
struct sched_domain *sd, *this_sd = NULL;
int prev_cpu, this_cpu, new_cpu;
unsigned long load, this_load;
struct rq *this_rq;
unsigned int imbalance;
int idx;
struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
prev_cpu = task_cpu(p);
this_cpu = smp_processor_id();
this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu;
do {
unsigned long load, avg_load;
int local_group;
int i;
/*
* 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in:
*/
for_each_domain(this_cpu, sd) {
if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
this_sd = sd;
break;
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
avg_load = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
avg_load += load;
}
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
this = group;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
} while (group = group->next, group != sd->groups);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
/*
* find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
idlest = i;
}
}
if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
goto out;
return idlest;
}
/*
* Check for affine wakeup and passive balancing possibilities.
*/
if (!this_sd)
goto out;
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
* SD_BALANCE_EXEC.
*
* Balance, ie. select the least loaded group.
*
* Returns the target CPU number, or the same CPU if no balancing is needed.
*
* preempt must be disabled.
*/
static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
idx = this_sd->wake_idx;
if (sd_flag & SD_BALANCE_WAKE) {
if (sched_feat(AFFINE_WAKEUPS) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
rcu_read_lock();
for_each_domain(cpu, tmp) {
/*
* If power savings logic is enabled for a domain, see if we
* are not overloaded, if so, don't balance wider.
*/
if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
unsigned long power = 0;
unsigned long nr_running = 0;
unsigned long capacity;
int i;
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
for_each_cpu(i, sched_domain_span(tmp)) {
power += power_of(i);
nr_running += cpu_rq(i)->cfs.nr_running;
}
if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
load, this_load, imbalance))
return this_cpu;
capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
/*
* Start passive balancing when half the imbalance_pct
* limit is reached.
*/
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(p, se.nr_wakeups_passive);
return this_cpu;
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
nr_running /= 2;
if (nr_running < capacity)
want_sd = 0;
}
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
want_affine = 0;
}
if (!want_sd && !want_affine)
break;
if (!(tmp->flags & sd_flag))
continue;
if (want_sd)
sd = tmp;
}
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
*/
tmp = sd;
if (affine_sd && (!tmp ||
cpumask_weight(sched_domain_span(affine_sd)) >
cpumask_weight(sched_domain_span(sd))))
tmp = affine_sd;
if (tmp)
update_shares(tmp);
}
if (affine_sd && wake_affine(affine_sd, p, sync)) {
new_cpu = cpu;
goto out;
}
while (sd) {
int load_idx = sd->forkexec_idx;
struct sched_group *group;
int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
group = find_idlest_group(sd, p, cpu, load_idx);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_cpu(group, p, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = cpumask_weight(sched_domain_span(sd));
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= cpumask_weight(sched_domain_span(tmp)))
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
out:
return wake_idle(new_cpu, p);
rcu_read_unlock();
return new_cpu;
}
#endif /* CONFIG_SMP */
@@ -1471,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int sync = wake_flags & WF_SYNC;
update_curr(cfs_rq);
@@ -1501,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
*/
if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
set_last_buddy(se);
set_next_buddy(pse);
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
return;
}
if (!sched_feat(WAKEUP_PREEMPT))
return;
if (sched_feat(WAKEUP_OVERLAP) && (sync ||
(se->avg_overlap < sysctl_sched_migration_cost &&
pse->avg_overlap < sysctl_sched_migration_cost))) {
if ((sched_feat(WAKEUP_SYNC) && sync) ||
(sched_feat(WAKEUP_OVERLAP) &&
(se->avg_overlap < sysctl_sched_migration_cost &&
pse->avg_overlap < sysctl_sched_migration_cost))) {
resched_task(curr);
return;
}
if (sched_feat(WAKEUP_RUNNING)) {
if (pse->avg_running < se->avg_running) {
set_next_buddy(pse);
resched_task(curr);
return;
}
}
if (!sched_feat(WAKEUP_PREEMPT))
return;
find_matching_se(&se, &pse);
BUG_ON(!pse);
@@ -1555,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
/*
* If se was a buddy, clear it so that it will have to earn
* the favour again.
*
* If se was not a buddy, clear the buddies because neither
* was eligible to run, let them earn it again.
*
* IOW. unconditionally clear buddies.
*/
__clear_buddies(cfs_rq, se);
__clear_buddies(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -1832,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
}
#endif
/*
 * Report the round-robin interval of a fair-class task.
 *
 * The task's runqueue is locked while its slice is computed with
 * sched_slice() and converted through NS_TO_JIFFIES().
 */
unsigned int get_rr_interval_fair(struct task_struct *task)
{
	unsigned int interval = 0;
	unsigned long irq_flags;
	struct rq *rq = task_rq_lock(task, &irq_flags);

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue, so only compute a slice when the cfs runqueue
	 * carries some load.
	 */
	if (rq->cfs.load.weight)
		interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, &task->se));

	task_rq_unlock(rq, &irq_flags);

	return interval;
}
/*
* All the scheduling class methods:
*/
@@ -1860,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
.prio_changed = prio_changed_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
.moved_group = moved_group_fair,
#endif

View File

@@ -1,17 +1,123 @@
SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
/*
* Disregards a certain amount of sleep time (sched_latency_ns) and
* considers the task to be running during that period. This gives it
* a service deficit on wakeup, allowing it to run sooner.
*/
SCHED_FEAT(FAIR_SLEEPERS, 1)
/*
* Only give sleepers 50% of their service deficit. This allows
* them to run sooner, but does not allow tons of sleepers to
* rip the spread apart.
*/
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
/*
* By not normalizing the sleep time, heavy tasks get an effective
* longer period, and lighter task an effective shorter period they
* are considered running.
*/
SCHED_FEAT(NORMALIZED_SLEEPER, 0)
SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
/*
* Place new tasks ahead so that they do not starve already running
* tasks
*/
SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
SCHED_FEAT(CACHE_HOT_BUDDY, 1)
/*
* Should wakeups try to preempt running tasks.
*/
SCHED_FEAT(WAKEUP_PREEMPT, 1)
/*
* Compute wakeup_gran based on task behaviour, clipped to
* [0, sched_wakeup_gran_ns]
*/
SCHED_FEAT(ADAPTIVE_GRAN, 1)
/*
* When converting the wakeup granularity to virtual time, do it such
* that heavier tasks preempting a lighter task have an edge.
*/
SCHED_FEAT(ASYM_GRAN, 1)
/*
* Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
*/
SCHED_FEAT(WAKEUP_SYNC, 0)
/*
* Wakeup preempt based on task behaviour. Tasks that do not overlap
* don't get preempted.
*/
SCHED_FEAT(WAKEUP_OVERLAP, 0)
/*
* Wakeup preemption towards tasks that run short
*/
SCHED_FEAT(WAKEUP_RUNNING, 0)
/*
* Use the SYNC wakeup hint; pipes and the like use this to indicate
* the remote end is likely to consume the data we just wrote, and
* therefore has cache benefit from being placed on the same cpu, see
* also AFFINE_WAKEUPS.
*/
SCHED_FEAT(SYNC_WAKEUPS, 1)
/*
* Based on load and program behaviour, see if it makes sense to place
* a newly woken task on the same cpu as the task that woke it --
* improve cache locality. Typically used with SYNC wakeups as
* generated by pipes and the like, see also SYNC_WAKEUPS.
*/
SCHED_FEAT(AFFINE_WAKEUPS, 1)
/*
* Weaken SYNC hint based on overlap
*/
SCHED_FEAT(SYNC_LESS, 1)
/*
* Add SYNC hint based on overlap
*/
SCHED_FEAT(SYNC_MORE, 0)
/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since it's likely going to consume data we
* touched, increases cache locality.
*/
SCHED_FEAT(NEXT_BUDDY, 0)
/*
* Prefer to schedule the task that ran last (when we did
* wake-preempt) as that likely will touch the same data, increases
* cache locality.
*/
SCHED_FEAT(LAST_BUDDY, 1)
/*
* Consider buddies to be cache hot, decreases the likelihood of a
* cache buddy being migrated away, increases cache locality.
*/
SCHED_FEAT(CACHE_HOT_BUDDY, 1)
/*
* Use arch dependent cpu power functions
*/
SCHED_FEAT(ARCH_POWER, 0)
SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(LB_SHARES_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
SCHED_FEAT(LAST_BUDDY, 1)
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
* another cpu -- assumes that when the owner is running, it will soon
* release the lock. Decreases scheduling overhead.
*/
SCHED_FEAT(OWNER_SPIN, 1)

View File

@@ -6,7 +6,7 @@
*/
#ifdef CONFIG_SMP
static int select_task_rq_idle(struct task_struct *p, int sync)
static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_task(rq->idle);
}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
check_preempt_curr(rq, p, 0);
}
/*
* The idle class reports a round-robin interval of 0 for any task;
* @task is not examined.
*/
unsigned int get_rr_interval_idle(struct task_struct *task)
{
return 0;
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,

View File

@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
static int select_task_rq_rt(struct task_struct *p, int sync)
static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
struct rq *rq = task_rq(p);
if (sd_flag != SD_BALANCE_WAKE)
return smp_processor_id();
/*
* If the current task is an RT task, then
* try to see if we can wake this RT task up on another
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
resched_task(rq->curr);
@@ -1731,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
dequeue_pushable_task(rq, p);
}
/*
 * Report the round-robin interval of an RT-class task.
 *
 * Only SCHED_RR tasks get DEF_TIMESLICE; every other policy handled
 * here (e.g. SCHED_FIFO) has a time slice of 0.
 */
unsigned int get_rr_interval_rt(struct task_struct *task)
{
	return task->policy == SCHED_RR ? DEF_TIMESLICE : 0;
}
static const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
@@ -1759,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
};

View File

@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
if (why) {
/*
* The first thread which returns from finish_stop()
* The first thread which returns from do_signal_stop()
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
return send_signal(sig, info, t, 0);
}
/*
 * Send @sig to @p while holding the task's sighand lock.
 *
 * @group is passed straight through to send_signal().
 *
 * Returns -ESRCH when lock_task_sighand() fails, otherwise whatever
 * send_signal() returns.
 */
int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
			bool group)
{
	unsigned long irq_flags;
	int err;

	/* Without the sighand lock there is nothing we can deliver to. */
	if (!lock_task_sighand(p, &irq_flags))
		return -ESRCH;

	err = send_signal(sig, info, p, group);
	unlock_task_sighand(p, &irq_flags);

	return err;
}
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
}
}
int __fatal_signal_pending(struct task_struct *tsk)
{
return sigismember(&tsk->pending.signal, SIGKILL);
}
EXPORT_SYMBOL(__fatal_signal_pending);
struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
{
struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
unsigned long flags;
int ret;
int ret = check_kill_permission(sig, info, p);
ret = check_kill_permission(sig, info, p);
if (!ret && sig) {
ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
ret = __group_send_sig_info(sig, info, p);
unlock_task_sighand(p, &flags);
}
}
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, true);
return ret;
}
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
* These are for backward compatibility with the rest of the kernel source.
*/
/*
* The caller must ensure the task can't exit.
*/
int
send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
int ret;
unsigned long flags;
/*
* Make sure legacy kernel users don't send in bad values
* (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
if (!valid_signal(sig))
return -EINVAL;
spin_lock_irqsave(&p->sighand->siglock, flags);
ret = specific_send_sig_info(sig, info, p);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
return ret;
return do_send_sig_info(sig, info, p, false);
}
#define __si_special(priv) \
@@ -1382,15 +1373,6 @@ ret:
return ret;
}
/*
* Wake up any threads in the parent blocked in wait* syscalls.
*/
static inline void __wake_up_parent(struct task_struct *p,
struct task_struct *parent)
{
wake_up_interruptible_sync(&parent->signal->wait_chldexit);
}
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
spin_unlock_irq(&current->sighand->siglock);
}
static void
finish_stop(int stop_count)
{
/*
* If there are no other threads in the group, or if there is
* a group stop in progress and we are the last to stop,
* report to the parent. When ptraced, every thread reports itself.
*/
if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current, CLD_STOPPED);
read_unlock(&tasklist_lock);
}
do {
schedule();
} while (try_to_freeze());
/*
* Now we don't run again until continued.
*/
current->exit_code = 0;
}
/*
* This performs the stopping for SIGSTOP and other stop signals.
* We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
static int do_signal_stop(int signr)
{
struct signal_struct *sig = current->signal;
int stop_count;
int notify;
if (sig->group_stop_count > 0) {
/*
* There is a group stop in progress. We don't need to
* start another one.
*/
stop_count = --sig->group_stop_count;
} else {
if (!sig->group_stop_count) {
struct task_struct *t;
if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
*/
sig->group_exit_code = signr;
stop_count = 0;
sig->group_stop_count = 1;
for (t = next_thread(current); t != current; t = next_thread(t))
/*
* Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
*/
if (!(t->flags & PF_EXITING) &&
!task_is_stopped_or_traced(t)) {
stop_count++;
sig->group_stop_count++;
signal_wake_up(t, 0);
}
sig->group_stop_count = stop_count;
}
/*
* If there are no other threads in the group, or if there is
* a group stop in progress and we are the last to stop, report
* to the parent. When ptraced, every thread reports itself.
*/
notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
notify = tracehook_notify_jctl(notify, CLD_STOPPED);
/*
* tracehook_notify_jctl() can drop and reacquire siglock, so
* we keep ->group_stop_count != 0 before the call. If SIGCONT
* or SIGKILL comes in between, ->group_stop_count == 0.
*/
if (sig->group_stop_count) {
if (!--sig->group_stop_count)
sig->flags = SIGNAL_STOP_STOPPED;
current->exit_code = sig->group_exit_code;
__set_current_state(TASK_STOPPED);
}
spin_unlock_irq(&current->sighand->siglock);
if (notify) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current, notify);
read_unlock(&tasklist_lock);
}
if (stop_count == 0)
sig->flags = SIGNAL_STOP_STOPPED;
current->exit_code = sig->group_exit_code;
__set_current_state(TASK_STOPPED);
/* Now we don't run again until woken by SIGCONT or SIGKILL */
do {
schedule();
} while (try_to_freeze());
tracehook_finish_jctl();
current->exit_code = 0;
spin_unlock_irq(&current->sighand->siglock);
finish_stop(stop_count);
return 1;
}
@@ -1815,14 +1793,15 @@ relock:
int why = (signal->flags & SIGNAL_STOP_CONTINUED)
? CLD_CONTINUED : CLD_STOPPED;
signal->flags &= ~SIGNAL_CLD_MASK;
why = tracehook_notify_jctl(why, CLD_CONTINUED);
spin_unlock_irq(&sighand->siglock);
if (unlikely(!tracehook_notify_jctl(1, why)))
goto relock;
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current->group_leader, why);
read_unlock(&tasklist_lock);
if (why) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current->group_leader, why);
read_unlock(&tasklist_lock);
}
goto relock;
}
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
if (unlikely(tsk->signal->group_stop_count) &&
!--tsk->signal->group_stop_count) {
tsk->signal->flags = SIGNAL_STOP_STOPPED;
group_stop = 1;
group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
}
out:
spin_unlock_irq(&tsk->sighand->siglock);
if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(tsk, CLD_STOPPED);
do_notify_parent_cldstop(tsk, group_stop);
read_unlock(&tasklist_lock);
}
}
@@ -2290,7 +2269,6 @@ static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
{
struct task_struct *p;
unsigned long flags;
int error = -ESRCH;
rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
/*
* The null signal is a permissions and process existence
* probe. No signal is actually delivered.
*
* If lock_task_sighand() fails we pretend the task dies
* after receiving the signal. The window is tiny, and the
* signal is private anyway.
*/
if (!error && sig && lock_task_sighand(p, &flags)) {
error = specific_send_sig_info(sig, info, p);
unlock_task_sighand(p, &flags);
if (!error && sig) {
error = do_send_sig_info(sig, info, p, false);
/*
* If lock_task_sighand() failed we pretend the task
* dies after receiving the signal. The window is tiny,
* and the signal is private anyway.
*/
if (unlikely(error == -ESRCH))
error = 0;
}
}
rcu_read_unlock();

View File

@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
static void slow_work_oom_timeout(unsigned long);
#ifdef CONFIG_SYSCTL
static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
static int slow_work_min_threads_sysctl(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
#endif
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
* Handle adjustment of the minimum number of threads
*/
static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
int n;
if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
* Handle adjustment of the maximum number of threads
*/
static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
int n;
if (ret == 0) {

View File

@@ -29,8 +29,7 @@ enum {
struct call_function_data {
struct call_single_data csd;
spinlock_t lock;
unsigned int refs;
atomic_t refs;
cpumask_var_t cpumask;
};
@@ -39,9 +38,7 @@ struct call_single_queue {
spinlock_t lock;
};
static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
};
static DEFINE_PER_CPU(struct call_function_data, cfd_data);
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -196,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
spin_lock(&data->lock);
if (!cpumask_test_cpu(cpu, data->cpumask)) {
spin_unlock(&data->lock);
if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
continue;
}
cpumask_clear_cpu(cpu, data->cpumask);
spin_unlock(&data->lock);
data->csd.func(data->csd.info);
spin_lock(&data->lock);
WARN_ON(data->refs == 0);
refs = --data->refs;
refs = atomic_dec_return(&data->refs);
WARN_ON(refs < 0);
if (!refs) {
spin_lock(&call_function.lock);
list_del_rcu(&data->csd.list);
spin_unlock(&call_function.lock);
}
spin_unlock(&data->lock);
if (refs)
continue;
@@ -357,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
generic_exec_single(cpu, data, wait);
}
/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
#ifndef arch_send_call_function_ipi_mask
# define arch_send_call_function_ipi_mask(maskp) \
arch_send_call_function_ipi(*(maskp))
#endif
/**
* smp_call_function_many(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
@@ -419,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
spin_lock_irqsave(&data->lock, flags);
data->csd.func = func;
data->csd.info = info;
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
data->refs = cpumask_weight(data->cpumask);
atomic_set(&data->refs, cpumask_weight(data->cpumask));
spin_lock(&call_function.lock);
spin_lock_irqsave(&call_function.lock, flags);
/*
* Place entry at the _HEAD_ of the list, so that any cpu still
* observing the entry in generic_smp_call_function_interrupt()
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
spin_unlock(&call_function.lock);
spin_unlock_irqrestore(&data->lock, flags);
spin_unlock_irqrestore(&call_function.lock, flags);
/*
* Make the list addition visible before sending the ipi.

View File

@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
touch_all_softlockup_watchdogs();
return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
/*

View File

@@ -14,7 +14,7 @@
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/kexec.h>
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
unsigned long flags;
cputime_t utime, stime;
struct task_cputime cputime;
unsigned long maxrss = 0;
memset((char *) r, 0, sizeof *r);
utime = stime = cputime_zero;
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
utime = task_utime(current);
stime = task_stime(current);
accumulate_thread_rusage(p, r);
maxrss = p->signal->maxrss;
goto out;
}
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
r->ru_majflt = p->signal->cmaj_flt;
r->ru_inblock = p->signal->cinblock;
r->ru_oublock = p->signal->coublock;
maxrss = p->signal->cmaxrss;
if (who == RUSAGE_CHILDREN)
break;
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
r->ru_majflt += p->signal->maj_flt;
r->ru_inblock += p->signal->inblock;
r->ru_oublock += p->signal->oublock;
if (maxrss < p->signal->maxrss)
maxrss = p->signal->maxrss;
t = p;
do {
accumulate_thread_rusage(t, r);
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
out:
cputime_to_timeval(utime, &r->ru_utime);
cputime_to_timeval(stime, &r->ru_stime);
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
if (mm) {
setmax_mm_hiwater_rss(&maxrss, mm);
mmput(mm);
}
}
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
}
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
case PR_TASK_PERF_COUNTERS_DISABLE:
error = perf_counter_task_disable();
case PR_TASK_PERF_EVENTS_DISABLE:
error = perf_event_task_disable();
break;
case PR_TASK_PERF_COUNTERS_ENABLE:
error = perf_counter_task_enable();
case PR_TASK_PERF_EVENTS_ENABLE:
error = perf_event_task_enable();
break;
case PR_GET_TIMERSLACK:
error = current->timer_slack_ns;
@@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
current->timer_slack_ns = arg2;
error = 0;
break;
case PR_MCE_KILL:
if (arg4 | arg5)
return -EINVAL;
switch (arg2) {
case 0:
if (arg3 != 0)
return -EINVAL;
current->flags &= ~PF_MCE_PROCESS;
break;
case 1:
current->flags |= PF_MCE_PROCESS;
if (arg3 != 0)
current->flags |= PF_MCE_EARLY;
else
current->flags &= ~PF_MCE_EARLY;
break;
default:
return -EINVAL;
}
error = 0;
break;
default:
error = -EINVAL;
break;

View File

@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
cond_syscall(compat_sys_sendmsg);
cond_syscall(sys_recvmsg);
cond_syscall(compat_sys_recvmsg);
cond_syscall(compat_sys_recvfrom);
cond_syscall(sys_socketcall);
cond_syscall(sys_futex);
cond_syscall(compat_sys_futex);
@@ -177,4 +178,4 @@ cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
/* performance counters: */
cond_syscall(sys_perf_counter_open);
cond_syscall(sys_perf_event_open);

View File

@@ -26,7 +26,6 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/utsname.h>
#include <linux/kmemcheck.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
@@ -50,7 +49,7 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/slow-work.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -77,6 +76,7 @@ extern int max_threads;
extern int core_uses_pid;
extern int suid_dumpable;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
extern int pid_max;
extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
@@ -106,6 +106,9 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -160,9 +163,9 @@ extern int max_lock_depth;
#endif
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
static int proc_taint(struct ctl_table *table, int write, struct file *filp,
static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
@@ -421,6 +424,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dostring,
.strategy = &sysctl_string,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "core_pipe_limit",
.data = &core_pipe_limit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -722,6 +733,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "printk_delay",
.data = &printk_delay_msec,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &ten_thousand,
},
#endif
{
.ctl_name = KERN_NGROUPS_MAX,
@@ -964,28 +986,28 @@ static struct ctl_table kern_table[] = {
.child = slow_work_sysctls,
},
#endif
#ifdef CONFIG_PERF_COUNTERS
#ifdef CONFIG_PERF_EVENTS
{
.ctl_name = CTL_UNNUMBERED,
.procname = "perf_counter_paranoid",
.data = &sysctl_perf_counter_paranoid,
.maxlen = sizeof(sysctl_perf_counter_paranoid),
.procname = "perf_event_paranoid",
.data = &sysctl_perf_event_paranoid,
.maxlen = sizeof(sysctl_perf_event_paranoid),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "perf_counter_mlock_kb",
.data = &sysctl_perf_counter_mlock,
.maxlen = sizeof(sysctl_perf_counter_mlock),
.procname = "perf_event_mlock_kb",
.data = &sysctl_perf_event_mlock,
.maxlen = sizeof(sysctl_perf_event_mlock),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "perf_counter_max_sample_rate",
.data = &sysctl_perf_counter_sample_rate,
.maxlen = sizeof(sysctl_perf_counter_sample_rate),
.procname = "perf_event_max_sample_rate",
.data = &sysctl_perf_event_sample_rate,
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
@@ -1376,6 +1398,31 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &scan_unevictable_handler,
},
#ifdef CONFIG_MEMORY_FAILURE
{
.ctl_name = CTL_UNNUMBERED,
.procname = "memory_failure_early_kill",
.data = &sysctl_memory_failure_early_kill,
.maxlen = sizeof(sysctl_memory_failure_early_kill),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &one,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "memory_failure_recovery",
.data = &sysctl_memory_failure_recovery,
.maxlen = sizeof(sysctl_memory_failure_recovery),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &one,
},
#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -2204,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
#ifdef CONFIG_PROC_SYSCTL
static int _proc_do_string(void* data, int maxlen, int write,
struct file *filp, void __user *buffer,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
size_t len;
@@ -2265,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
* proc_dostring - read a string sysctl
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2279,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
*
* Returns 0 on success.
*/
int proc_dostring(struct ctl_table *table, int write, struct file *filp,
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return _proc_do_string(table->data, table->maxlen, write, filp,
return _proc_do_string(table->data, table->maxlen, write,
buffer, lenp, ppos);
}
@@ -2307,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
}
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
int write, struct file *filp, void __user *buffer,
int write, void __user *buffer,
size_t *lenp, loff_t *ppos,
int (*conv)(int *negp, unsigned long *lvalp, int *valp,
int write, void *data),
@@ -2414,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
#undef TMPBUFLEN
}
static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
static int do_proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos,
int (*conv)(int *negp, unsigned long *lvalp, int *valp,
int write, void *data),
void *data)
{
return __do_proc_dointvec(table->data, table, write, filp,
return __do_proc_dointvec(table->data, table, write,
buffer, lenp, ppos, conv, data);
}
@@ -2428,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
* proc_dointvec - read a vector of integers
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2438,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
*
* Returns 0 on success.
*/
int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
return do_proc_dointvec(table,write,buffer,lenp,ppos,
NULL,NULL);
}
@@ -2449,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
* Taint values can only be increased
* This means we can safely use a temporary.
*/
static int proc_taint(struct ctl_table *table, int write, struct file *filp,
static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -2461,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
t = *table;
t.data = &tmptaint;
err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
@@ -2513,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
* proc_dointvec_minmax - read a vector of integers with min/max values
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2526,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
*
* Returns 0 on success.
*/
int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct do_proc_dointvec_minmax_conv_param param = {
.min = (int *) table->extra1,
.max = (int *) table->extra2,
};
return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
return do_proc_dointvec(table, write, buffer, lenp, ppos,
do_proc_dointvec_minmax_conv, &param);
}
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
struct file *filp,
void __user *buffer,
size_t *lenp, loff_t *ppos,
unsigned long convmul,
@@ -2643,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
}
static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
struct file *filp,
void __user *buffer,
size_t *lenp, loff_t *ppos,
unsigned long convmul,
unsigned long convdiv)
{
return __do_proc_doulongvec_minmax(table->data, table, write,
filp, buffer, lenp, ppos, convmul, convdiv);
buffer, lenp, ppos, convmul, convdiv);
}
/**
* proc_doulongvec_minmax - read a vector of long integers with min/max values
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2670,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
*
* Returns 0 on success.
*/
int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
int proc_doulongvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
}
/**
* proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2695,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
* Returns 0 on success.
*/
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
struct file *filp,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
return do_proc_doulongvec_minmax(table, write, filp, buffer,
return do_proc_doulongvec_minmax(table, write, buffer,
lenp, ppos, HZ, 1000l);
}
@@ -2775,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
* proc_dointvec_jiffies - read a vector of integers as seconds
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2787,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
*
* Returns 0 on success.
*/
int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
return do_proc_dointvec(table,write,buffer,lenp,ppos,
do_proc_dointvec_jiffies_conv,NULL);
}
@@ -2798,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
* proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: pointer to the file position
@@ -2810,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
*
* Returns 0 on success.
*/
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
return do_proc_dointvec(table,write,buffer,lenp,ppos,
do_proc_dointvec_userhz_jiffies_conv,NULL);
}
@@ -2821,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
* proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
* @filp: the file structure
* @buffer: the user buffer
* @lenp: the size of the user buffer
* @ppos: file position
@@ -2834,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
*
* Returns 0 on success.
*/
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
return do_proc_dointvec(table, write, buffer, lenp, ppos,
do_proc_dointvec_ms_jiffies_conv, NULL);
}
static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct pid *new_pid;
@@ -2850,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
tmp = pid_vnr(cad_pid);
r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
r = __do_proc_dointvec(&tmp, table, write, buffer,
lenp, ppos, NULL, NULL);
if (r || !write)
return r;
@@ -2865,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
#else /* CONFIG_PROC_FS */
int proc_dostring(struct ctl_table *table, int write, struct file *filp,
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
int proc_doulongvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
struct file *filp,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{

View File

@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
* 0 <= tv_nsec < NSEC_PER_SEC
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
{
while (nsec >= NSEC_PER_SEC) {
/*
* The following asm() prevents the compiler from
* optimising this loop into a modulo operation. See
* also __iter_div_u64_rem() in include/linux/time.h
*/
asm("" : "+rm"(nsec));
nsec -= NSEC_PER_SEC;
++sec;
}
while (nsec < 0) {
asm("" : "+rm"(nsec));
nsec += NSEC_PER_SEC;
--sec;
}

View File

@@ -1,4 +1,4 @@
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o

View File

@@ -21,7 +21,6 @@
*
* TODO WishList:
* o Allow clocksource drivers to be unregistered
* o get rid of clocksource_jiffies extern
*/
#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
#include <linux/module.h>
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>
void timecounter_init(struct timecounter *tc,
const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
}
EXPORT_SYMBOL(timecounter_cyc2time);
/* XXX - Would like a better way for initializing curr_clocksource */
extern struct clocksource clocksource_jiffies;
/*[Clocksource internal variables]---------
* curr_clocksource:
* currently selected clocksource. Initialized to clocksource_jiffies.
* next_clocksource:
* pending next selected clocksource.
* currently selected clocksource.
* clocksource_list:
* linked list with the registered clocksources
* clocksource_lock:
* protects manipulations to curr_clocksource and next_clocksource
* and the clocksource_list
* clocksource_mutex:
* protects manipulations to curr_clocksource and the clocksource_list
* override_name:
* Name of the user-specified clocksource.
*/
static struct clocksource *curr_clocksource = &clocksource_jiffies;
static struct clocksource *next_clocksource;
static struct clocksource *clocksource_override;
static struct clocksource *curr_clocksource;
static LIST_HEAD(clocksource_list);
static DEFINE_SPINLOCK(clocksource_lock);
static DEFINE_MUTEX(clocksource_mutex);
static char override_name[32];
static int finished_booting;
/* clocksource_done_booting - Called near the end of core bootup
*
* Hack to avoid lots of clocksource churn at boot time.
* We use fs_initcall because we want this to start before
* device_initcall but after subsys_initcall.
*/
static int __init clocksource_done_booting(void)
{
finished_booting = 1;
return 0;
}
fs_initcall(clocksource_done_booting);
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static cycle_t watchdog_last;
static unsigned long watchdog_resumed;
static int watchdog_running;
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
/*
* Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
/*
 * clocksource_watchdog_work - work callback that starts the watchdog thread
 * @work: the work item being run (not used)
 *
 * Spawns a "kwatchdog" kernel thread running clocksource_watchdog_kthread().
 * The result of kthread_run() is deliberately not checked, see below.
 */
static void clocksource_watchdog_work(struct work_struct *work)
{
	/*
	 * If kthread_run fails the next watchdog scan over the
	 * watchdog_list will find the unstable clock again.
	 */
	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
}
/*
 * Flag @cs as unstable: drop the high-resolution and watchdog-initialised
 * bits, set CLOCK_SOURCE_UNSTABLE and, once booting has finished, schedule
 * watchdog_work so the clocksource gets handled outside of this context.
 */
static void __clocksource_unstable(struct clocksource *cs)
{
	cs->flags &= ~CLOCK_SOURCE_VALID_FOR_HRES;
	cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
	cs->flags |= CLOCK_SOURCE_UNSTABLE;

	/* Before boot has completed the flag alone records the state. */
	if (!finished_booting)
		return;

	schedule_work(&watchdog_work);
}
/*
 * clocksource_unstable - report a clocksource that deviates from the watchdog
 * @cs:    clocksource that failed the comparison
 * @delta: measured deviation, printed as nanoseconds in the warning
 *
 * Logs the deviation and flags @cs via __clocksource_unstable().  The entry
 * is intentionally left on watchdog_list and its rating is not touched here:
 * clocksource_watchdog_kthread() unlinks unstable entries and lowers their
 * rating after taking clocksource_mutex.
 */
static void clocksource_unstable(struct clocksource *cs, int64_t delta)
{
	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
	       cs->name, delta);
	__clocksource_unstable(cs);
}
/**
 * clocksource_mark_unstable - flag a clocksource as unstable via the watchdog
 * @cs:		clocksource to flag
 *
 * Used by cpu hotplug code in place of clocksource_change_rating() so that
 * the clocksource mutex and the cpu hotplug mutex cannot deadlock against
 * each other.  The actual clocksource update is left to the watchdog thread.
 */
void clocksource_mark_unstable(struct clocksource *cs)
{
	unsigned long irqflags;

	spin_lock_irqsave(&watchdog_lock, irqflags);

	/* Nothing to do if it has already been flagged. */
	if (cs->flags & CLOCK_SOURCE_UNSTABLE)
		goto unlock;

	/* Make sure the entry is on watchdog_list before flagging it. */
	if (list_empty(&cs->wd_list))
		list_add(&cs->wd_list, &watchdog_list);
	__clocksource_unstable(cs);

unlock:
	spin_unlock_irqrestore(&watchdog_lock, irqflags);
}
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs, *tmp;
struct clocksource *cs;
cycle_t csnow, wdnow;
int64_t wd_nsec, cs_nsec;
int resumed;
int next_cpu;
spin_lock(&watchdog_lock);
resumed = test_and_clear_bit(0, &watchdog_resumed);
if (!watchdog_running)
goto out;
wdnow = watchdog->read(watchdog);
wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
watchdog->mult, watchdog->shift);
watchdog_last = wdnow;
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
list_for_each_entry(cs, &watchdog_list, wd_list) {
/* Clocksource already marked unstable? */
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
if (finished_booting)
schedule_work(&watchdog_work);
continue;
}
csnow = cs->read(cs);
if (unlikely(resumed)) {
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
cs->flags |= CLOCK_SOURCE_WATCHDOG;
cs->wd_last = csnow;
continue;
}
/* Initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
/*
* We just marked the clocksource as
* highres-capable, notify the rest of the
* system as well so that we transition
* into high-res mode:
*/
tick_clock_notify();
}
cs->flags |= CLOCK_SOURCE_WATCHDOG;
cs->wd_last = csnow;
} else {
cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
cs->wd_last = csnow;
/* Check the delta. Might remove from the list ! */
clocksource_ratewd(cs, cs_nsec - wd_nsec);
/* Check the deviation from the watchdog clocksource. */
cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
cs->mask, cs->mult, cs->shift);
cs->wd_last = csnow;
if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
clocksource_unstable(cs, cs_nsec - wd_nsec);
continue;
}
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
/*
* We just marked the clocksource as highres-capable,
* notify the rest of the system as well so that we
* transition into high-res mode:
*/
tick_clock_notify();
}
}
if (!list_empty(&watchdog_list)) {
/*
* Cycle through CPUs to check if the CPUs stay
* synchronized to each other.
*/
int next_cpu = cpumask_next(raw_smp_processor_id(),
cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
watchdog_timer.expires += WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, next_cpu);
}
/*
* Cycle through CPUs to check if the CPUs stay synchronized
* to each other.
*/
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);
watchdog_timer.expires += WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, next_cpu);
out:
spin_unlock(&watchdog_lock);
}
static void clocksource_resume_watchdog(void)
/*
 * clocksource_start_watchdog - arm the watchdog timer if it is needed
 *
 * Does nothing when the timer is already running, when no watchdog
 * clocksource has been chosen, or when watchdog_list is empty.  Otherwise
 * the timer is initialised, the watchdog's current counter value is recorded
 * in watchdog_last, and the timer is queued WATCHDOG_INTERVAL jiffies from
 * now on the first online CPU.
 *
 * NOTE(review): the visible callers hold watchdog_lock around this call;
 * presumably that is a requirement - confirm.
 */
static inline void clocksource_start_watchdog(void)
{
	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
		return;
	init_timer(&watchdog_timer);
	watchdog_timer.function = clocksource_watchdog;
	watchdog_last = watchdog->read(watchdog);
	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
	watchdog_running = 1;
}
static void clocksource_check_watchdog(struct clocksource *cs)
/*
 * Cancel the watchdog timer when it is running but no longer needed, i.e.
 * when there is no watchdog clocksource or watchdog_list has become empty.
 */
static inline void clocksource_stop_watchdog(void)
{
	if (watchdog_running && (!watchdog || list_empty(&watchdog_list))) {
		del_timer(&watchdog_timer);
		watchdog_running = 0;
	}
}
static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;
list_for_each_entry(cs, &watchdog_list, wd_list)
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}
/*
 * Reset the watchdog state of all watched clocksources while holding
 * watchdog_lock with interrupts disabled.
 */
static void clocksource_resume_watchdog(void)
{
	unsigned long irqflags;

	spin_lock_irqsave(&watchdog_lock, irqflags);
	clocksource_reset_watchdog();
	spin_unlock_irqrestore(&watchdog_lock, irqflags);
}
static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
struct clocksource *cse;
unsigned long flags;
spin_lock_irqsave(&watchdog_lock, flags);
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
int started = !list_empty(&watchdog_list);
/* cs is a clocksource to be watched. */
list_add(&cs->wd_list, &watchdog_list);
if (!started && watchdog) {
watchdog_last = watchdog->read(watchdog);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer,
cpumask_first(cpu_online_mask));
}
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
} else {
/* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
/* Pick the best watchdog. */
if (!watchdog || cs->rating > watchdog->rating) {
if (watchdog)
del_timer(&watchdog_timer);
watchdog = cs;
init_timer(&watchdog_timer);
watchdog_timer.function = clocksource_watchdog;
/* Reset watchdog cycles */
list_for_each_entry(cse, &watchdog_list, wd_list)
cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
/* Start if list is not empty */
if (!list_empty(&watchdog_list)) {
watchdog_last = watchdog->read(watchdog);
watchdog_timer.expires =
jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer,
cpumask_first(cpu_online_mask));
}
clocksource_reset_watchdog();
}
}
/* Check if the watchdog timer needs to be started. */
clocksource_start_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
}
#else
static void clocksource_check_watchdog(struct clocksource *cs)
/*
 * clocksource_dequeue_watchdog - detach a clocksource from the watchdog
 * @cs: clocksource being removed
 *
 * A clocksource flagged CLOCK_SOURCE_MUST_VERIFY is simply unlinked from
 * watchdog_list.  If @cs is the current watchdog, the watched clocksources
 * are reset and the highest-rated remaining clocksource that does not need
 * verification itself becomes the new watchdog (or none, if there is no
 * candidate).  In all cases CLOCK_SOURCE_WATCHDOG is cleared on @cs and the
 * timer is stopped if it is no longer required.
 */
static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
	struct clocksource *cand;
	unsigned long irqflags;

	spin_lock_irqsave(&watchdog_lock, irqflags);

	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
		/* cs is a watched clocksource. */
		list_del_init(&cs->wd_list);
	} else if (cs == watchdog) {
		/* Reset watchdog cycles */
		clocksource_reset_watchdog();

		/* Current watchdog is removed. Find an alternative. */
		watchdog = NULL;
		list_for_each_entry(cand, &clocksource_list, list) {
			if (cand == cs)
				continue;
			if (cand->flags & CLOCK_SOURCE_MUST_VERIFY)
				continue;
			if (watchdog && cand->rating <= watchdog->rating)
				continue;
			watchdog = cand;
		}
	}

	cs->flags &= ~CLOCK_SOURCE_WATCHDOG;

	/* Check if the watchdog timer needs to be stopped. */
	clocksource_stop_watchdog();

	spin_unlock_irqrestore(&watchdog_lock, irqflags);
}
/*
 * clocksource_watchdog_kthread - demote clocksources flagged as unstable
 * @data: thread argument (not used)
 *
 * With clocksource_mutex held, moves every entry of watchdog_list that has
 * CLOCK_SOURCE_UNSTABLE set onto a private list (under watchdog_lock), stops
 * the watchdog timer if nothing is left to watch, and then sets the rating
 * of each collected clocksource to 0 after watchdog_lock has been dropped.
 *
 * Always returns 0.
 */
static int clocksource_watchdog_kthread(void *data)
{
	struct clocksource *cs, *next;
	unsigned long irqflags;
	LIST_HEAD(unstable);

	mutex_lock(&clocksource_mutex);

	spin_lock_irqsave(&watchdog_lock, irqflags);
	list_for_each_entry_safe(cs, next, &watchdog_list, wd_list) {
		if (!(cs->flags & CLOCK_SOURCE_UNSTABLE))
			continue;
		list_del_init(&cs->wd_list);
		list_add(&cs->wd_list, &unstable);
	}
	/* Check if the watchdog timer needs to be stopped. */
	clocksource_stop_watchdog();
	spin_unlock_irqrestore(&watchdog_lock, irqflags);

	/* Needs to be done outside of watchdog lock */
	list_for_each_entry_safe(cs, next, &unstable, wd_list) {
		list_del_init(&cs->wd_list);
		__clocksource_change_rating(cs, 0);
	}

	mutex_unlock(&clocksource_mutex);
	return 0;
}
#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
/*
 * Variant used when CONFIG_CLOCKSOURCE_WATCHDOG is not set: there is nothing
 * to enqueue, a continuous clocksource is marked valid for high-resolution
 * mode right away.
 */
static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
	if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS))
		return;

	cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
#endif
static inline int clocksource_watchdog_kthread(void *data) { return 0; }
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
/**
* clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { }
void clocksource_resume(void)
{
struct clocksource *cs;
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
list_for_each_entry(cs, &clocksource_list, list) {
list_for_each_entry(cs, &clocksource_list, list)
if (cs->resume)
cs->resume();
}
clocksource_resume_watchdog();
spin_unlock_irqrestore(&clocksource_lock, flags);
}
/**
@@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void)
clocksource_resume_watchdog();
}
/**
* clocksource_get_next - Returns the selected clocksource
*
*/
struct clocksource *clocksource_get_next(void)
{
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
if (next_clocksource && finished_booting) {
curr_clocksource = next_clocksource;
next_clocksource = NULL;
}
spin_unlock_irqrestore(&clocksource_lock, flags);
return curr_clocksource;
}
#ifdef CONFIG_GENERIC_TIME
/**
* select_clocksource - Selects the best registered clocksource.
* clocksource_select - Select the best clocksource available
*
* Private function. Must hold clocksource_lock when called.
* Private function. Must hold clocksource_mutex when called.
*
* Select the clocksource with the best rating, or the clocksource,
* which is selected by userspace override.
*/
static struct clocksource *select_clocksource(void)
static void clocksource_select(void)
{
struct clocksource *next;
struct clocksource *best, *cs;
if (list_empty(&clocksource_list))
return NULL;
if (clocksource_override)
next = clocksource_override;
else
next = list_entry(clocksource_list.next, struct clocksource,
list);
if (next == curr_clocksource)
return NULL;
return next;
if (!finished_booting || list_empty(&clocksource_list))
return;
/* First clocksource on the list has the best rating. */
best = list_first_entry(&clocksource_list, struct clocksource, list);
/* Check for the override clocksource. */
list_for_each_entry(cs, &clocksource_list, list) {
if (strcmp(cs->name, override_name) != 0)
continue;
/*
* Check to make sure we don't switch to a non-highres
* capable clocksource if the tick code is in oneshot
* mode (highres or nohz)
*/
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
tick_oneshot_mode_active()) {
/* Override clocksource cannot be used. */
printk(KERN_WARNING "Override clocksource %s is not "
"HRT compatible. Cannot switch while in "
"HRT/NOHZ mode\n", cs->name);
override_name[0] = 0;
} else
/* Override clocksource can be used. */
best = cs;
break;
}
if (curr_clocksource != best) {
printk(KERN_INFO "Switching to clocksource %s\n", best->name);
curr_clocksource = best;
timekeeping_notify(curr_clocksource);
}
}
#else /* CONFIG_GENERIC_TIME */
static inline void clocksource_select(void) { }
#endif
/*
 * clocksource_done_booting - Called near the end of core bootup
 *
 * Hack to avoid lots of clocksource churn at boot time.
 * We use fs_initcall because we want this to start before
 * device_initcall but after subsys_initcall.
 *
 * Sets finished_booting, runs one watchdog pass and then performs a
 * clocksource selection under clocksource_mutex.  Always returns 0.
 */
static int __init clocksource_done_booting(void)
{
/* clocksource_select() returns early while finished_booting is zero. */
finished_booting = 1;
/*
* Run the watchdog first to eliminate unstable clock sources
*/
/* Called before mutex_lock(): the watchdog pass takes clocksource_mutex itself. */
clocksource_watchdog_kthread(NULL);
mutex_lock(&clocksource_mutex);
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
fs_initcall(clocksource_done_booting);
/*
* Enqueue the clocksource sorted by rating
*/
static int clocksource_enqueue(struct clocksource *c)
static void clocksource_enqueue(struct clocksource *cs)
{
struct list_head *tmp, *entry = &clocksource_list;
struct list_head *entry = &clocksource_list;
struct clocksource *tmp;
list_for_each(tmp, &clocksource_list) {
struct clocksource *cs;
cs = list_entry(tmp, struct clocksource, list);
if (cs == c)
return -EBUSY;
list_for_each_entry(tmp, &clocksource_list, list)
/* Keep track of the place, where to insert */
if (cs->rating >= c->rating)
entry = tmp;
}
list_add(&c->list, entry);
if (strlen(c->name) == strlen(override_name) &&
!strcmp(c->name, override_name))
clocksource_override = c;
return 0;
if (tmp->rating >= cs->rating)
entry = &tmp->list;
list_add(&cs->list, entry);
}
/**
@@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c)
*
* Returns -EBUSY if registration fails, zero otherwise.
*/
int clocksource_register(struct clocksource *c)
int clocksource_register(struct clocksource *cs)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&clocksource_lock, flags);
ret = clocksource_enqueue(c);
if (!ret)
next_clocksource = select_clocksource();
spin_unlock_irqrestore(&clocksource_lock, flags);
if (!ret)
clocksource_check_watchdog(c);
return ret;
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_select();
clocksource_enqueue_watchdog(cs);
mutex_unlock(&clocksource_mutex);
return 0;
}
EXPORT_SYMBOL(clocksource_register);
/**
* clocksource_change_rating - Change the rating of a registered clocksource
*
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
list_del(&cs->list);
cs->rating = rating;
clocksource_enqueue(cs);
next_clocksource = select_clocksource();
spin_unlock_irqrestore(&clocksource_lock, flags);
clocksource_select();
}
/**
* clocksource_change_rating - Change the rating of a registered clocksource
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
{
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
/**
* clocksource_unregister - remove a registered clocksource
*/
void clocksource_unregister(struct clocksource *cs)
{
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
mutex_lock(&clocksource_mutex);
clocksource_dequeue_watchdog(cs);
list_del(&cs->list);
if (clocksource_override == cs)
clocksource_override = NULL;
next_clocksource = select_clocksource();
spin_unlock_irqrestore(&clocksource_lock, flags);
clocksource_select();
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_unregister);
#ifdef CONFIG_SYSFS
/**
@@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
{
ssize_t count = 0;
spin_lock_irq(&clocksource_lock);
mutex_lock(&clocksource_mutex);
count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
spin_unlock_irq(&clocksource_lock);
mutex_unlock(&clocksource_mutex);
return count;
}
@@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
struct sysdev_attribute *attr,
const char *buf, size_t count)
{
struct clocksource *ovr = NULL;
size_t ret = count;
int len;
/* strings from sysfs write are not 0 terminated! */
if (count >= sizeof(override_name))
@@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
if (buf[count-1] == '\n')
count--;
spin_lock_irq(&clocksource_lock);
mutex_lock(&clocksource_mutex);
if (count > 0)
memcpy(override_name, buf, count);
override_name[count] = 0;
clocksource_select();
len = strlen(override_name);
if (len) {
struct clocksource *cs;
ovr = clocksource_override;
/* try to select it: */
list_for_each_entry(cs, &clocksource_list, list) {
if (strlen(cs->name) == len &&
!strcmp(cs->name, override_name))
ovr = cs;
}
}
/*
* Check to make sure we don't switch to a non-highres capable
* clocksource if the tick code is in oneshot mode (highres or nohz)
*/
if (tick_oneshot_mode_active() && ovr &&
!(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
printk(KERN_WARNING "%s clocksource is not HRT compatible. "
"Cannot switch while in HRT/NOHZ mode\n", ovr->name);
ovr = NULL;
override_name[0] = 0;
}
/* Reselect, when the override name has changed */
if (ovr != clocksource_override) {
clocksource_override = ovr;
next_clocksource = select_clocksource();
}
spin_unlock_irq(&clocksource_lock);
mutex_unlock(&clocksource_mutex);
return ret;
}
@@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
struct clocksource *src;
ssize_t count = 0;
spin_lock_irq(&clocksource_lock);
mutex_lock(&clocksource_mutex);
list_for_each_entry(src, &clocksource_list, list) {
/*
* Don't show non-HRES clocksource if the tick code is
@@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
"%s ", src->name);
}
spin_unlock_irq(&clocksource_lock);
mutex_unlock(&clocksource_mutex);
count += snprintf(buf + count,
max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs);
*/
static int __init boot_override_clocksource(char* str)
{
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
mutex_lock(&clocksource_mutex);
if (str)
strlcpy(override_name, str, sizeof(override_name));
spin_unlock_irqrestore(&clocksource_lock, flags);
mutex_unlock(&clocksource_mutex);
return 1;
}

View File

@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
.shift = JIFFIES_SHIFT,
};
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
}
core_initcall(init_jiffies_clocksource);
/*
 * clocksource_default_clock - default clocksource accessor
 *
 * Returns a pointer to clocksource_jiffies.  The definition is marked
 * __weak, so another definition of the same symbol can replace it.
 */
struct clocksource * __init __weak clocksource_default_clock(void)
{
return &clocksource_jiffies;
}

View File

@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
case TIME_OK:
break;
case TIME_INS:
xtime.tv_sec--;
wall_to_monotonic.tv_sec++;
timekeeping_leap_insert(-1);
time_state = TIME_OOP;
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
res = HRTIMER_RESTART;
break;
case TIME_DEL:
xtime.tv_sec++;
timekeeping_leap_insert(1);
time_tai--;
wall_to_monotonic.tv_sec--;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_state = TIME_OK;
break;
}
update_vsyscall(&xtime, clock);
write_sequnlock(&xtime_lock);

127
kernel/time/timeconv.c Normal file
View File

@@ -0,0 +1,127 @@
/*
* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
* This file is part of the GNU C Library.
* Contributed by Paul Eggert (eggert@twinsun.com).
*
* The GNU C Library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* The GNU C Library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the GNU C Library; see the file COPYING.LIB. If not,
* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/*
* Converts the calendar time to broken-down time representation
* Based on code from glibc-2.6
*
* 2009-7-14:
* Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
*/
#include <linux/time.h>
#include <linux/module.h>
/*
 * Gregorian leap-year test.  Returns 1 when YEAR is divisible by 4 and is
 * either not divisible by 100 or is divisible by 400; returns 0 otherwise.
 */
static int __isleap(long year)
{
	if (year % 4 != 0)
		return 0;
	if (year % 100 != 0)
		return 1;
	return year % 400 == 0;
}
/*
 * Divide a by b and lower the truncated quotient by one whenever the C
 * remainder a % b is negative.  For a positive divisor this yields the
 * mathematical floor of a / b.
 */
static long math_div(long a, long b)
{
	long quotient = a / b;

	if (a % b < 0)
		quotient -= 1;
	return quotient;
}
/*
 * Number of leap years from y1 up to, but not including, y2.  Intended for
 * y1 <= y2; the value is the leap-year count before y2 minus the leap-year
 * count before y1, so swapping the arguments negates the result.
 */
static long leaps_between(long y1, long y2)
{
	long last1 = y1 - 1;
	long last2 = y2 - 1;
	long before_y1, before_y2;

	before_y1 = math_div(last1, 4) - math_div(last1, 100)
		+ math_div(last1, 400);
	before_y2 = math_div(last2, 4) - math_div(last2, 100)
		+ math_div(last2, 400);

	return before_y2 - before_y1;
}
/*
 * How many days come before each month (0-12).
 *
 * First index: 0 for a normal year, 1 for a leap year (time_to_tm() indexes
 * it with the result of __isleap()).  Second index: month number starting at
 * 0; the extra entry 12 holds the total number of days in the year.
 */
static const unsigned short __mon_yday[2][13] = {
/* Normal years. */
{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
/* Leap years. */
{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
};
#define SECS_PER_HOUR (60 * 60)
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
/**
 * time_to_tm - convert a calendar time to broken-down time
 *
 * @totalsecs:	number of seconds elapsed since 00:00:00 on January 1, 1970,
 *		Coordinated Universal Time (UTC).
 * @offset:	number of seconds added to @totalsecs before conversion.
 * @result:	struct tm that receives the broken-down time; the fields
 *		tm_hour, tm_min, tm_sec, tm_wday, tm_year (years since 1900),
 *		tm_yday, tm_mon (0-11) and tm_mday (1-31) are written.
 */
void time_to_tm(time_t totalsecs, int offset, struct tm *result)
{
	const unsigned short *month_start;
	long days, secs, year, wday;
	int mon;

	/* Split into whole days and seconds within the day, then shift. */
	days = totalsecs / SECS_PER_DAY;
	secs = totalsecs % SECS_PER_DAY;
	secs += offset;

	/* Carry or borrow whole days until secs is in [0, SECS_PER_DAY). */
	for (; secs < 0; secs += SECS_PER_DAY)
		days--;
	for (; secs >= SECS_PER_DAY; secs -= SECS_PER_DAY)
		days++;

	result->tm_hour = secs / SECS_PER_HOUR;
	secs %= SECS_PER_HOUR;
	result->tm_min = secs / 60;
	result->tm_sec = secs % 60;

	/* Day 0 (January 1, 1970) was a Thursday, i.e. weekday 4. */
	wday = (4 + days) % 7;
	if (wday < 0)
		wday += 7;
	result->tm_wday = wday;

	/*
	 * Iterate until "days" is a valid day-of-year index for "year".
	 * Each round guesses a year assuming 365-day years and then
	 * corrects "days" for the leap days that were skipped over.
	 */
	year = 1970;
	while (!(days >= 0 && days < (__isleap(year) ? 366 : 365))) {
		long guess = year + math_div(days, 365);

		days -= (guess - year) * 365 + leaps_between(year, guess);
		year = guess;
	}

	result->tm_year = year - 1900;
	result->tm_yday = days;

	/* Walk back from December to the last month starting on/before days. */
	month_start = __mon_yday[__isleap(year)];
	mon = 11;
	while (days < month_start[mon])
		mon--;

	result->tm_mon = mon;
	result->tm_mday = days - month_start[mon] + 1;
}
EXPORT_SYMBOL(time_to_tm);

View File

@@ -18,7 +18,117 @@
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
/*
 * Structure holding internal timekeeping values.
 *
 * All fields are (re)initialised by timekeeper_setup_internals() for the
 * clocksource passed to it.
 */
struct timekeeper {
/* Current clocksource used for timekeeping. */
struct clocksource *clock;
/* The shift value of the current clocksource. */
int shift;
/* Number of clock cycles in one NTP interval. */
cycle_t cycle_interval;
/* Number of clock shifted nano seconds in one NTP interval. */
u64 xtime_interval;
/* Raw nano seconds accumulated per NTP interval. */
u32 raw_interval;
/* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
u64 xtime_nsec;
/* Difference between accumulated time and NTP time in ntp
* shifted nano seconds. */
s64 ntp_error;
/* Shift conversion between clock shifted nano seconds and
* ntp shifted nano seconds. */
int ntp_error_shift;
/* NTP adjusted clock multiplier */
u32 mult;
};
/* The single timekeeper instance used by the functions in this file. */
struct timekeeper timekeeper;
/**
 * timekeeper_setup_internals - Set up internals to use clocksource clock.
 *
 * @clock: Pointer to clocksource.
 *
 * Makes @clock the timekeeper's clocksource, samples its current counter
 * into clock->cycle_last and derives the fixed cycle and nanosecond
 * intervals that correspond to NTP_INTERVAL_LENGTH from clock->mult and
 * clock->shift.  Also resets the accumulated xtime_nsec remainder and the
 * NTP error to zero.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void timekeeper_setup_internals(struct clocksource *clock)
{
cycle_t interval;
u64 tmp;
timekeeper.clock = clock;
/* Start measuring cycle deltas from "now". */
clock->cycle_last = clock->read(clock);
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
tmp <<= clock->shift;
/* Adding half the divisor rounds the division below to nearest. */
tmp += clock->mult/2;
do_div(tmp, clock->mult);
/* Never use a zero-cycle interval. */
if (tmp == 0)
tmp = 1;
interval = (cycle_t) tmp;
timekeeper.cycle_interval = interval;
/* Go back from cycles -> shifted ns */
timekeeper.xtime_interval = (u64) interval * clock->mult;
/* Same product shifted down by clock->shift: plain nanoseconds. */
timekeeper.raw_interval =
((u64) interval * clock->mult) >> clock->shift;
timekeeper.xtime_nsec = 0;
timekeeper.shift = clock->shift;
timekeeper.ntp_error = 0;
timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
timekeeper.mult = clock->mult;
}
/* Timekeeper helper functions. */
/*
 * Nanoseconds elapsed on the current clocksource since clock->cycle_last,
 * converted with the timekeeper's own (NTP adjusted) mult and shift.
 */
static inline s64 timekeeping_get_ns(void)
{
	struct clocksource *cs = timekeeper.clock;
	cycle_t now = cs->read(cs);
	/* Masking keeps the delta within the counter's valid bits. */
	cycle_t elapsed = (now - cs->cycle_last) & cs->mask;

	return clocksource_cyc2ns(elapsed, timekeeper.mult, timekeeper.shift);
}
/*
 * Nanoseconds elapsed on the current clocksource since clock->cycle_last,
 * converted with the clocksource's own mult and shift (clock->mult and
 * clock->shift) rather than the timekeeper's NTP adjusted multiplier.
 */
static inline s64 timekeeping_get_ns_raw(void)
{
	struct clocksource *cs = timekeeper.clock;
	cycle_t now = cs->read(cs);
	/* Masking keeps the delta within the counter's valid bits. */
	cycle_t elapsed = (now - cs->cycle_last) & cs->mask;

	return clocksource_cyc2ns(elapsed, cs->mult, cs->shift);
}
/*
* This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
*/
struct timespec xtime __attribute__ ((aligned (16)));
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static unsigned long total_sleep_time; /* seconds */
static struct timespec total_sleep_time;
/*
* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
*/
struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
timespec_add_ns(&xtime_cache, nsec);
}
struct clocksource *clock;
/*
 * timekeeping_leap_insert - shift the wall clock by whole seconds
 * @leapsecond: signed number of seconds added to xtime.tv_sec
 *
 * The same amount is subtracted from wall_to_monotonic.tv_sec, so the sum
 * of xtime and wall_to_monotonic is unchanged.  The updated xtime and the
 * current clocksource are then passed to update_vsyscall().
 *
 * must hold xtime_lock
 */
void timekeeping_leap_insert(int leapsecond)
{
xtime.tv_sec += leapsecond;
wall_to_monotonic.tv_sec -= leapsecond;
update_vsyscall(&xtime, timekeeper.clock);
}
#ifdef CONFIG_GENERIC_TIME
/**
* clocksource_forward_now - update clock to the current time
* timekeeping_forward_now - update clock to the current time
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
static void clocksource_forward_now(void)
static void timekeeping_forward_now(void)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
s64 nsec;
cycle_now = clocksource_read(clock);
clock = timekeeper.clock;
cycle_now = clock->read(clock);
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
clock->cycle_last = cycle_now;
nsec = cyc2ns(clock, cycle_delta);
nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
timekeeper.shift);
/* If arch requires, add in gettimeoffset() */
nsec += arch_gettimeoffset();
timespec_add_ns(&xtime, nsec);
nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
clock->raw_time.tv_nsec += nsec;
nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
timespec_add_ns(&raw_time, nsec);
}
/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
*/
void getnstimeofday(struct timespec *ts)
{
cycle_t cycle_now, cycle_delta;
unsigned long seq;
s64 nsecs;
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
seq = read_seqbegin(&xtime_lock);
*ts = xtime;
/* read clocksource: */
cycle_now = clocksource_read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
/* convert to nanoseconds: */
nsecs = cyc2ns(clock, cycle_delta);
nsecs = timekeeping_get_ns();
/* If arch requires, add in gettimeoffset() */
nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
EXPORT_SYMBOL(getnstimeofday);
/*
 * ktime_get - monotonic time as a ktime_t
 *
 * Sums xtime, wall_to_monotonic and the nanoseconds reported by
 * timekeeping_get_ns(), all sampled inside one xtime_lock read section.
 * Warns if called while timekeeping_suspended is set.
 */
ktime_t ktime_get(void)
{
unsigned int seq;
s64 secs, nsecs;
WARN_ON(timekeeping_suspended);
/* Repeat the sampling until read_seqretry() reports no retry is needed. */
do {
seq = read_seqbegin(&xtime_lock);
secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
nsecs += timekeeping_get_ns();
} while (read_seqretry(&xtime_lock, seq));
/*
* Use ktime_set/ktime_add_ns to create a proper ktime on
* 32-bit architectures without CONFIG_KTIME_SCALAR.
*/
return ktime_add_ns(ktime_set(secs, 0), nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
/**
 * ktime_get_ts - get the monotonic clock in timespec format
 * @ts: pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec format in the variable pointed to by @ts.
 *
 * Warns if called while timekeeping_suspended is set.
 */
void ktime_get_ts(struct timespec *ts)
{
struct timespec tomono;
unsigned int seq;
s64 nsecs;
WARN_ON(timekeeping_suspended);
/* Sample xtime, the offset and the clocksource delta in one read section. */
do {
seq = read_seqbegin(&xtime_lock);
*ts = xtime;
tomono = wall_to_monotonic;
nsecs = timekeeping_get_ns();
} while (read_seqretry(&xtime_lock, seq));
/* Add the three parts; normalization handles the tv_nsec overflow. */
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
ts->tv_nsec + tomono.tv_nsec + nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
/**
* do_gettimeofday - Returns the time of day in a timeval
* @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
write_seqlock_irqsave(&xtime_lock, flags);
clocksource_forward_now();
timekeeping_forward_now();
ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
update_xtime_cache(0);
clock->error = 0;
timekeeper.ntp_error = 0;
ntp_clear();
update_vsyscall(&xtime, clock);
update_vsyscall(&xtime, timekeeper.clock);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
*
* Accumulates current time interval and initializes new clocksource
*/
static void change_clocksource(void)
static int change_clocksource(void *data)
{
struct clocksource *new, *old;
new = clocksource_get_next();
new = (struct clocksource *) data;
if (clock == new)
return;
clocksource_forward_now();
if (clocksource_enable(new))
return;
new->raw_time = clock->raw_time;
old = clock;
clock = new;
clocksource_disable(old);
clock->cycle_last = 0;
clock->cycle_last = clocksource_read(clock);
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
tick_clock_notify();
/*
* We're holding xtime lock and waking up klogd would deadlock
* us on enqueue. So no printing!
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
*/
timekeeping_forward_now();
if (!new->enable || new->enable(new) == 0) {
old = timekeeper.clock;
timekeeper_setup_internals(new);
if (old->disable)
old->disable(old);
}
return 0;
}
#else
static inline void clocksource_forward_now(void) { }
static inline void change_clocksource(void) { }
#endif
/**
 * timekeeping_notify - Install a new clock source
 * @clock: pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 *
 * Does nothing when @clock is already the timekeeper's clocksource.
 */
void timekeeping_notify(struct clocksource *clock)
{
if (timekeeper.clock == clock)
return;
/* The switch itself is done by change_clocksource(), run via stop_machine(). */
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
}
#else /* GENERIC_TIME */
static inline void timekeeping_forward_now(void) { }
/**
 * ktime_get - get the monotonic time in ktime_t format
 *
 * Variant built when GENERIC_TIME is not set: obtains the time through
 * ktime_get_ts() and converts the timespec with timespec_to_ktime().
 *
 * returns the time in ktime_t format
 */
ktime_t ktime_get(void)
{
struct timespec now;
ktime_get_ts(&now);
return timespec_to_ktime(now);
}
EXPORT_SYMBOL_GPL(ktime_get);
/**
 * ktime_get_ts - get the monotonic clock in timespec format
 * @ts: pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec format in the variable pointed to by @ts.
 *
 * Variant built when GENERIC_TIME is not set: the realtime clock is read
 * with getnstimeofday().
 */
void ktime_get_ts(struct timespec *ts)
{
struct timespec tomono;
unsigned long seq;
/* Read wall time and the offset inside the same xtime_lock read section. */
do {
seq = read_seqbegin(&xtime_lock);
getnstimeofday(ts);
tomono = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
ts->tv_nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
#endif /* !GENERIC_TIME */
/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 *
 * Reads the time with getnstimeofday() and converts the resulting
 * timespec with timespec_to_ktime().
 *
 * returns the time in ktime_t format
 */
ktime_t ktime_get_real(void)
{
struct timespec now;
getnstimeofday(&now);
return timespec_to_ktime(now);
}
EXPORT_SYMBOL_GPL(ktime_get_real);
/**
* getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
{
unsigned long seq;
s64 nsecs;
cycle_t cycle_now, cycle_delta;
do {
seq = read_seqbegin(&xtime_lock);
/* read clocksource: */
cycle_now = clocksource_read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
/* convert to nanoseconds: */
nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
*ts = clock->raw_time;
nsecs = timekeeping_get_ns_raw();
*ts = raw_time;
} while (read_seqretry(&xtime_lock, seq));
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqbegin(&xtime_lock);
ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqretry(&xtime_lock, seq));
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
}
/**
* read_persistent_clock - Return time in seconds from the persistent clock.
* read_persistent_clock - Return time from the persistent clock.
*
* Weak dummy function for arches that do not yet support it.
* Returns seconds from epoch using the battery backed persistent clock.
* Returns zero if unsupported.
* Reads the time from the battery backed persistent clock.
* Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
*
* XXX - Do be sure to remove it once all arches implement it.
*/
unsigned long __attribute__((weak)) read_persistent_clock(void)
void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
{
return 0;
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
/**
 * read_boot_clock - Return time of the system start.
 * @ts: timespec that receives the result
 *
 * Weak dummy function for arches that do not yet support it.
 * Function to read the exact time the system has been started.
 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
 * This default implementation always stores that zero value in @ts.
 *
 * XXX - Do be sure to remove it once all arches implement it.
 */
void __attribute__((weak)) read_boot_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
*/
void __init timekeeping_init(void)
{
struct clocksource *clock;
unsigned long flags;
unsigned long sec = read_persistent_clock();
struct timespec now, boot;
read_persistent_clock(&now);
read_boot_clock(&boot);
write_seqlock_irqsave(&xtime_lock, flags);
ntp_init();
clock = clocksource_get_next();
clocksource_enable(clock);
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clock->cycle_last = clocksource_read(clock);
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
timekeeper_setup_internals(clock);
xtime.tv_sec = sec;
xtime.tv_nsec = 0;
xtime.tv_sec = now.tv_sec;
xtime.tv_nsec = now.tv_nsec;
raw_time.tv_sec = 0;
raw_time.tv_nsec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
boot.tv_sec = xtime.tv_sec;
boot.tv_nsec = xtime.tv_nsec;
}
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
-boot.tv_sec, -boot.tv_nsec);
update_xtime_cache(0);
total_sleep_time = 0;
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
}
/* time in seconds when suspend began */
static unsigned long timekeeping_suspend_time;
static struct timespec timekeeping_suspend_time;
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
static int timekeeping_resume(struct sys_device *dev)
{
unsigned long flags;
unsigned long now = read_persistent_clock();
struct timespec ts;
read_persistent_clock(&ts);
clocksource_resume();
write_seqlock_irqsave(&xtime_lock, flags);
if (now && (now > timekeeping_suspend_time)) {
unsigned long sleep_length = now - timekeeping_suspend_time;
xtime.tv_sec += sleep_length;
wall_to_monotonic.tv_sec -= sleep_length;
total_sleep_time += sleep_length;
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
ts = timespec_sub(ts, timekeeping_suspend_time);
xtime = timespec_add_safe(xtime, ts);
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
total_sleep_time = timespec_add_safe(total_sleep_time, ts);
}
update_xtime_cache(0);
/* re-base the last cycle value */
clock->cycle_last = 0;
clock->cycle_last = clocksource_read(clock);
clock->error = 0;
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
{
unsigned long flags;
timekeeping_suspend_time = read_persistent_clock();
read_persistent_clock(&timekeeping_suspend_time);
write_seqlock_irqsave(&xtime_lock, flags);
clocksource_forward_now();
timekeeping_forward_now();
timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
s64 *offset)
{
s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = abs(error2);
for (look_ahead = 0; error2 > 0; look_ahead++)
error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
tick_error -= clock->xtime_interval >> 1;
tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
tick_error -= timekeeper.xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;
/* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* this is optimized for the most common adjustments of -1,0,1,
* for other values we can do a bit more work.
*/
static void clocksource_adjust(s64 offset)
static void timekeeping_adjust(s64 offset)
{
s64 error, interval = clock->cycle_interval;
s64 error, interval = timekeeper.cycle_interval;
int adj;
error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
if (error > interval) {
error >>= 2;
if (likely(error <= interval))
adj = 1;
else
adj = clocksource_bigadjust(error, &interval, &offset);
adj = timekeeping_bigadjust(error, &interval, &offset);
} else if (error < -interval) {
error >>= 2;
if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
interval = -interval;
offset = -offset;
} else
adj = clocksource_bigadjust(error, &interval, &offset);
adj = timekeeping_bigadjust(error, &interval, &offset);
} else
return;
clock->mult += adj;
clock->xtime_interval += interval;
clock->xtime_nsec -= offset;
clock->error -= (interval - offset) <<
(NTP_SCALE_SHIFT - clock->shift);
timekeeper.mult += adj;
timekeeper.xtime_interval += interval;
timekeeper.xtime_nsec -= offset;
timekeeper.ntp_error -= (interval - offset) <<
timekeeper.ntp_error_shift;
}
/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
*/
void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
u64 nsecs;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
clock = timekeeper.clock;
#ifdef CONFIG_GENERIC_TIME
offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#else
offset = clock->cycle_interval;
offset = timekeeper.cycle_interval;
#endif
clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
while (offset >= clock->cycle_interval) {
/* accumulate one interval */
offset -= clock->cycle_interval;
clock->cycle_last += clock->cycle_interval;
while (offset >= timekeeper.cycle_interval) {
u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
clock->xtime_nsec += clock->xtime_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
/* accumulate one interval */
offset -= timekeeper.cycle_interval;
clock->cycle_last += timekeeper.cycle_interval;
timekeeper.xtime_nsec += timekeeper.xtime_interval;
if (timekeeper.xtime_nsec >= nsecps) {
timekeeper.xtime_nsec -= nsecps;
xtime.tv_sec++;
second_overflow();
}
clock->raw_time.tv_nsec += clock->raw_interval;
if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
clock->raw_time.tv_nsec -= NSEC_PER_SEC;
clock->raw_time.tv_sec++;
raw_time.tv_nsec += timekeeper.raw_interval;
if (raw_time.tv_nsec >= NSEC_PER_SEC) {
raw_time.tv_nsec -= NSEC_PER_SEC;
raw_time.tv_sec++;
}
/* accumulate error between NTP and clock interval */
clock->error += tick_length;
clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
timekeeper.ntp_error += tick_length;
timekeeper.ntp_error -= timekeeper.xtime_interval <<
timekeeper.ntp_error_shift;
}
/* correct the clock when NTP error is too big */
clocksource_adjust(offset);
timekeeping_adjust(offset);
/*
* Since in the loop above, we accumulate any amount of time
* in xtime_nsec over a second into xtime.tv_sec, its possible for
* xtime_nsec to be fairly small after the loop. Further, if we're
* slightly speeding the clocksource up in clocksource_adjust(),
* slightly speeding the clocksource up in timekeeping_adjust(),
* its possible the required corrective factor to xtime_nsec could
* cause it to underflow.
*
@@ -550,24 +792,25 @@ void update_wall_time(void)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
if (unlikely((s64)clock->xtime_nsec < 0)) {
s64 neg = -(s64)clock->xtime_nsec;
clock->xtime_nsec = 0;
clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
s64 neg = -(s64)timekeeper.xtime_nsec;
timekeeper.xtime_nsec = 0;
timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
}
/* store full nanoseconds into xtime after rounding it up and
* add the remainder to the error difference.
*/
xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
timekeeper.ntp_error += timekeeper.xtime_nsec <<
timekeeper.ntp_error_shift;
update_xtime_cache(cyc2ns(clock, offset));
nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
update_xtime_cache(nsecs);
/* check to see if there is a new clocksource to use */
change_clocksource();
update_vsyscall(&xtime, clock);
update_vsyscall(&xtime, timekeeper.clock);
}
/**
@@ -583,9 +826,12 @@ void update_wall_time(void)
*/
void getboottime(struct timespec *ts)
{
set_normalized_timespec(ts,
- (wall_to_monotonic.tv_sec + total_sleep_time),
- wall_to_monotonic.tv_nsec);
struct timespec boottime = {
.tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
.tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
};
set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
}
/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
*/
void monotonic_to_bootbased(struct timespec *ts)
{
ts->tv_sec += total_sleep_time;
*ts = timespec_add_safe(*ts, total_sleep_time);
}
unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
}
EXPORT_SYMBOL(get_seconds);
/*
 * __current_kernel_time - return the cached wall-clock time
 *
 * Returns a copy of xtime_cache. This function does not take the
 * xtime_lock sequence itself.
 * NOTE(review): presumably callers must already be serialized against
 * updates of xtime_cache - confirm against the call sites.
 */
struct timespec __current_kernel_time(void)
{
return xtime_cache;
}
struct timespec current_kernel_time(void)
{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
return now;
}
EXPORT_SYMBOL(current_kernel_time);
struct timespec get_monotonic_coarse(void)
{
struct timespec now, mono;
unsigned long seq;
do {
seq = read_seqbegin(&xtime_lock);
now = xtime_cache;
mono = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
return now;
}

View File

@@ -37,7 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
@@ -46,6 +46,9 @@
#include <asm/timex.h>
#include <asm/io.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
@@ -72,6 +75,7 @@ struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
unsigned long timer_jiffies;
unsigned long next_timer;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
#endif
/*
 * Timer initialisation hook: runs the timer debug hook
 * (debug_timer_init) and then emits the timer_init trace event
 * for @timer.
 */
static inline void debug_init(struct timer_list *timer)
{
debug_timer_init(timer);
trace_timer_init(timer);
}
/*
 * Timer activation hook: runs the timer debug hook
 * (debug_timer_activate) and then emits the timer_start trace event.
 * @timer:   timer being queued
 * @expires: expiry value passed through to the trace event
 */
static inline void
debug_activate(struct timer_list *timer, unsigned long expires)
{
debug_timer_activate(timer);
trace_timer_start(timer, expires);
}
/*
 * Timer deactivation hook: runs the timer debug hook
 * (debug_timer_deactivate) and then emits the timer_cancel trace event
 * for @timer.
 */
static inline void debug_deactivate(struct timer_list *timer)
{
debug_timer_deactivate(timer);
trace_timer_cancel(timer);
}
static void __init_timer(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
debug_timer_init(timer);
debug_init(timer);
__init_timer(timer, name, key);
}
EXPORT_SYMBOL(init_timer_key);
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
{
struct list_head *entry = &timer->entry;
debug_timer_deactivate(timer);
debug_deactivate(timer);
__list_del(entry->prev, entry->next);
if (clear_pending)
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
if (timer_pending(timer)) {
detach_timer(timer, 0);
if (timer->expires == base->next_timer &&
!tbase_get_deferrable(timer->base))
base->next_timer = base->timer_jiffies;
ret = 1;
} else {
if (pending_only)
goto out_unlock;
}
debug_timer_activate(timer);
debug_activate(timer, expires);
new_base = __get_cpu_var(tvec_bases);
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
if (time_before(timer->expires, base->next_timer) &&
!tbase_get_deferrable(timer->base))
base->next_timer = timer->expires;
internal_add_timer(base, timer);
out_unlock:
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
debug_timer_activate(timer);
debug_activate(timer, timer->expires);
if (time_before(timer->expires, base->next_timer) &&
!tbase_get_deferrable(timer->base))
base->next_timer = timer->expires;
internal_add_timer(base, timer);
/*
* Check whether the other CPU is idle and needs to be
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
if (timer_pending(timer)) {
detach_timer(timer, 1);
if (timer->expires == base->next_timer &&
!tbase_get_deferrable(timer->base))
base->next_timer = base->timer_jiffies;
ret = 1;
}
spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
ret = 0;
if (timer_pending(timer)) {
detach_timer(timer, 1);
if (timer->expires == base->next_timer &&
!tbase_get_deferrable(timer->base))
base->next_timer = base->timer_jiffies;
ret = 1;
}
out:
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
*/
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
fn(data);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
#ifdef CONFIG_NO_HZ
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a cpus is idle.
* This functions needs to be called disabled.
* is used on S/390 to stop all activity when a CPU is idle.
* This function needs to be called with interrupts disabled.
*/
static unsigned long __next_timer_interrupt(struct tvec_base *base)
{
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
unsigned long expires;
spin_lock(&base->lock);
expires = __next_timer_interrupt(base);
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
expires = base->next_timer;
spin_unlock(&base->lock);
if (time_before_eq(expires, now))
@@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
perf_counter_do_pending();
perf_event_do_pending();
hrtimer_run_pending();
@@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
INIT_LIST_HEAD(base->tv1.vec + j);
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
return 0;
}
@@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
timer = list_first_entry(head, struct timer_list, entry);
detach_timer(timer, 0);
timer_set_base(timer, new_base);
if (time_before(timer->expires, new_base->next_timer) &&
!tbase_get_deferrable(timer->base))
new_base->next_timer = timer->expires;
internal_add_timer(new_base, timer);
}
}

View File

@@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
# This allows those options to appear when no other tracer is selected. But the
# options do not appear when something else selects it. We need the two options
# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
# hidding of the automatic options options.
# hiding of the automatic options.
config TRACING
bool

View File

@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += power-traces.o
libftrace-y := ftrace.o

View File

@@ -225,7 +225,11 @@ static void ftrace_update_pid_func(void)
if (ftrace_trace_function == ftrace_stub)
return;
#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
func = ftrace_trace_function;
#else
func = __ftrace_trace_function;
#endif
if (ftrace_pid_trace) {
set_ftrace_pid_function(func);
@@ -1520,7 +1524,7 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations show_ftrace_seq_ops = {
static const struct seq_operations show_ftrace_seq_ops = {
.start = t_start,
.next = t_next,
.stop = t_stop,
@@ -1621,8 +1625,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
if (!ret) {
struct seq_file *m = file->private_data;
m->private = iter;
} else
} else {
trace_parser_put(&iter->parser);
kfree(iter);
}
} else
file->private_data = iter;
mutex_unlock(&ftrace_regex_lock);
@@ -2148,7 +2154,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
struct trace_parser *parser;
ssize_t ret, read;
if (!cnt || cnt < 0)
if (!cnt)
return 0;
mutex_lock(&ftrace_regex_lock);
@@ -2162,7 +2168,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
parser = &iter->parser;
read = trace_get_user(parser, ubuf, cnt, ppos);
if (trace_parser_loaded(parser) &&
if (read >= 0 && trace_parser_loaded(parser) &&
!trace_parser_cont(parser)) {
ret = ftrace_process_regex(parser->buffer,
parser->idx, enable);
@@ -2360,11 +2366,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
unsigned long *array = m->private;
if (*pos >= ftrace_graph_count)
return NULL;
return &array[*pos];
return &ftrace_graph_funcs[*pos];
}
static void *
@@ -2407,7 +2411,7 @@ static int g_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations ftrace_graph_seq_ops = {
static const struct seq_operations ftrace_graph_seq_ops = {
.start = g_start,
.next = g_next,
.stop = g_stop,
@@ -2428,17 +2432,11 @@ ftrace_graph_open(struct inode *inode, struct file *file)
ftrace_graph_count = 0;
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
if (file->f_mode & FMODE_READ) {
ret = seq_open(file, &ftrace_graph_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = ftrace_graph_funcs;
}
} else
file->private_data = ftrace_graph_funcs;
mutex_unlock(&graph_lock);
if (file->f_mode & FMODE_READ)
ret = seq_open(file, &ftrace_graph_seq_ops);
return ret;
}
@@ -2506,9 +2504,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
unsigned long *array;
size_t read = 0;
ssize_t ret;
ssize_t read, ret;
if (!cnt || cnt < 0)
return 0;
@@ -2517,35 +2513,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
ret = -EBUSY;
goto out;
goto out_unlock;
}
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
array = m->private;
} else
array = file->private_data;
if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
ret = -ENOMEM;
goto out;
goto out_unlock;
}
read = trace_get_user(&parser, ubuf, cnt, ppos);
if (trace_parser_loaded((&parser))) {
if (read >= 0 && trace_parser_loaded((&parser))) {
parser.buffer[parser.idx] = 0;
/* we allow only one expression at a time */
ret = ftrace_set_func(array, &ftrace_graph_count,
ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
parser.buffer);
if (ret)
goto out;
goto out_free;
}
ret = read;
out:
out_free:
trace_parser_put(&parser);
out_unlock:
mutex_unlock(&graph_lock);
return ret;
@@ -2976,7 +2968,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *lenp,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
@@ -2986,7 +2978,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_lock(&ftrace_lock);
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;

View File

@@ -0,0 +1,20 @@
/*
* Power trace points
*
* Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
*/
#include <linux/string.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/slab.h>
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);

View File

@@ -201,8 +201,6 @@ int tracing_is_on(void)
}
EXPORT_SYMBOL_GPL(tracing_is_on);
#include "trace.h"
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)

View File

@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;
static int tracing_set_tracer(const char *buf);
#define BOOTUP_TRACER_SIZE 100
static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static int __init set_ftrace(char *str)
{
strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
ring_buffer_expanded = 1;
@@ -241,13 +241,6 @@ static struct tracer *trace_types __read_mostly;
/* current_trace points to the tracer that is currently active */
static struct tracer *current_trace __read_mostly;
/*
* max_tracer_type_len is used to simplify the allocating of
* buffers to read userspace tracer names. We keep track of
* the longest tracer name registered.
*/
static int max_tracer_type_len;
/*
* trace_types_lock is used to protect the trace_types list.
* This lock is also used to keep user access serialized.
@@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
*/
void trace_wake_up(void)
{
int cpu;
if (trace_flags & TRACE_ITER_BLOCK)
return;
/*
* The runqueue_is_locked() can fail, but this is the best we
* have for now:
*/
if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
cpu = get_cpu();
if (!runqueue_is_locked(cpu))
wake_up(&trace_wait);
put_cpu();
}
static int __init set_buf_size(char *str)
@@ -416,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* read the non-space input */
while (cnt && !isspace(ch)) {
if (parser->idx < parser->size)
if (parser->idx < parser->size - 1)
parser->buffer[parser->idx++] = ch;
else {
ret = -EINVAL;
@@ -619,7 +618,6 @@ __releases(kernel_lock)
__acquires(kernel_lock)
{
struct tracer *t;
int len;
int ret = 0;
if (!type->name) {
@@ -627,6 +625,11 @@ __acquires(kernel_lock)
return -1;
}
if (strlen(type->name) > MAX_TRACER_SIZE) {
pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
return -1;
}
/*
* When this gets called we hold the BKL which means that
* preemption is disabled. Various trace selftests however
@@ -641,7 +644,7 @@ __acquires(kernel_lock)
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
pr_info("Trace %s already registered\n",
pr_info("Tracer %s already registered\n",
type->name);
ret = -1;
goto out;
@@ -692,9 +695,6 @@ __acquires(kernel_lock)
type->next = trace_types;
trace_types = type;
len = strlen(type->name);
if (len > max_tracer_type_len)
max_tracer_type_len = len;
out:
tracing_selftest_running = false;
@@ -703,7 +703,7 @@ __acquires(kernel_lock)
if (ret || !default_bootup_tracer)
goto out_unlock;
if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
goto out_unlock;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -725,14 +725,13 @@ __acquires(kernel_lock)
void unregister_tracer(struct tracer *type)
{
struct tracer **t;
int len;
mutex_lock(&trace_types_lock);
for (t = &trace_types; *t; t = &(*t)->next) {
if (*t == type)
goto found;
}
pr_info("Trace %s not registered\n", type->name);
pr_info("Tracer %s not registered\n", type->name);
goto out;
found:
@@ -745,17 +744,7 @@ void unregister_tracer(struct tracer *type)
current_trace->stop(&global_trace);
current_trace = &nop_trace;
}
if (strlen(type->name) != max_tracer_type_len)
goto out;
max_tracer_type_len = 0;
for (t = &trace_types; *t; t = &(*t)->next) {
len = strlen((*t)->name);
if (len > max_tracer_type_len)
max_tracer_type_len = len;
}
out:
out:
mutex_unlock(&trace_types_lock);
}
@@ -1960,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations tracer_seq_ops = {
static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
.stop = s_stop,
@@ -1995,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file)
if (current_trace)
*iter->trace = *current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
cpumask_clear(iter->started);
if (current_trace && current_trace->print_max)
iter->tr = &max_tr;
else
@@ -2174,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations show_traces_seq_ops = {
static const struct seq_operations show_traces_seq_ops = {
.start = t_start,
.next = t_next,
.stop = t_stop,
@@ -2604,7 +2591,7 @@ static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[max_tracer_type_len+2];
char buf[MAX_TRACER_SIZE+2];
int r;
mutex_lock(&trace_types_lock);
@@ -2754,15 +2741,15 @@ static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[max_tracer_type_len+1];
char buf[MAX_TRACER_SIZE+1];
int i;
size_t ret;
int err;
ret = cnt;
if (cnt > max_tracer_type_len)
cnt = max_tracer_type_len;
if (cnt > MAX_TRACER_SIZE)
cnt = MAX_TRACER_SIZE;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
@@ -4400,7 +4387,7 @@ __init static int tracer_alloc_buffers(void)
if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
goto out_free_tracing_cpumask;
/* To save memory, keep the ring buffer size to its minimum */
@@ -4411,7 +4398,6 @@ __init static int tracer_alloc_buffers(void)
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
cpumask_copy(tracing_cpumask, cpu_all_mask);
cpumask_clear(tracing_reader_cpumask);
/* TODO: make the number of buffers hot pluggable with CPUS */
global_trace.buffer = ring_buffer_alloc(ring_buf_size,

View File

@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <trace/boot.h>
#include <linux/kmemtrace.h>
#include <trace/power.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
@@ -37,7 +36,6 @@ enum trace_type {
TRACE_HW_BRANCHES,
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
TRACE_POWER,
TRACE_BLK,
__TRACE_LAST_TYPE,
@@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \

View File

@@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
F_printk("from: %llx to: %llx", __entry->from, __entry->to)
);
FTRACE_ENTRY(power, trace_power,
TRACE_POWER,
F_STRUCT(
__field_struct( struct power_trace, state_data )
__field_desc( s64, state_data, stamp )
__field_desc( s64, state_data, end )
__field_desc( int, state_data, type )
__field_desc( int, state_data, state )
),
F_printk("%llx->%llx type:%u state:%u",
__entry->stamp, __entry->end,
__entry->type, __entry->state)
);
FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
TRACE_KMEM_ALLOC,

View File

@@ -8,6 +8,57 @@
#include <linux/module.h>
#include "trace.h"
/*
* We can't use a size but a type in alloc_percpu()
* So let's create a dummy type that matches the desired size
*/
typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
char *trace_profile_buf;
EXPORT_SYMBOL_GPL(trace_profile_buf);
char *trace_profile_buf_nmi;
EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
/* Count the events in use (per event id, not per instance) */
static int total_profile_count;
static int ftrace_profile_enable_event(struct ftrace_event_call *event)
{
char *buf;
int ret = -ENOMEM;
if (atomic_inc_return(&event->profile_count))
return 0;
if (!total_profile_count++) {
buf = (char *)alloc_percpu(profile_buf_t);
if (!buf)
goto fail_buf;
rcu_assign_pointer(trace_profile_buf, buf);
buf = (char *)alloc_percpu(profile_buf_t);
if (!buf)
goto fail_buf_nmi;
rcu_assign_pointer(trace_profile_buf_nmi, buf);
}
ret = event->profile_enable();
if (!ret)
return 0;
kfree(trace_profile_buf_nmi);
fail_buf_nmi:
kfree(trace_profile_buf);
fail_buf:
total_profile_count--;
atomic_dec(&event->profile_count);
return ret;
}
int ftrace_profile_enable(int event_id)
{
struct ftrace_event_call *event;
@@ -17,7 +68,7 @@ int ftrace_profile_enable(int event_id)
list_for_each_entry(event, &ftrace_events, list) {
if (event->id == event_id && event->profile_enable &&
try_module_get(event->mod)) {
ret = event->profile_enable(event);
ret = ftrace_profile_enable_event(event);
break;
}
}
@@ -26,6 +77,33 @@ int ftrace_profile_enable(int event_id)
return ret;
}
/*
 * Drop one profiling reference on @event. When the reference count goes
 * negative the event's profile_disable() callback is invoked, and when
 * the last profiled event overall is gone the two shared per-cpu
 * buffers are unpublished and freed.
 */
static void ftrace_profile_disable_event(struct ftrace_event_call *event)
{
char *buf, *nmi_buf;
/* Only the decrement that drives profile_count negative proceeds. */
if (!atomic_add_negative(-1, &event->profile_count))
return;
event->profile_disable();
/* Last profiled event: clear the global pointers before freeing. */
if (!--total_profile_count) {
buf = trace_profile_buf;
rcu_assign_pointer(trace_profile_buf, NULL);
nmi_buf = trace_profile_buf_nmi;
rcu_assign_pointer(trace_profile_buf_nmi, NULL);
/*
* Ensure every event being profiled has finished before
* releasing the buffers
*/
synchronize_sched();
free_percpu(buf);
free_percpu(nmi_buf);
}
}
void ftrace_profile_disable(int event_id)
{
struct ftrace_event_call *event;
@@ -33,7 +111,7 @@ void ftrace_profile_disable(int event_id)
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
if (event->id == event_id) {
event->profile_disable(event);
ftrace_profile_disable_event(event);
module_put(event->mod);
break;
}

View File

@@ -232,10 +232,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
size_t read = 0;
ssize_t ret;
ssize_t read, ret;
if (!cnt || cnt < 0)
if (!cnt)
return 0;
ret = tracing_update_buffers();
@@ -247,7 +246,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
read = trace_get_user(&parser, ubuf, cnt, ppos);
if (trace_parser_loaded((&parser))) {
if (read >= 0 && trace_parser_loaded((&parser))) {
int set = 1;
if (*parser.buffer == '!')
@@ -271,42 +270,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct list_head *list = m->private;
struct ftrace_event_call *call;
struct ftrace_event_call *call = v;
(*pos)++;
for (;;) {
if (list == &ftrace_events)
return NULL;
call = list_entry(list, struct ftrace_event_call, list);
list_for_each_entry_continue(call, &ftrace_events, list) {
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
if (call->regfunc)
break;
list = list->next;
return call;
}
m->private = list->next;
return call;
return NULL;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_event_call *call = NULL;
struct ftrace_event_call *call;
loff_t l;
mutex_lock(&event_mutex);
m->private = ftrace_events.next;
call = list_entry(&ftrace_events, struct ftrace_event_call, list);
for (l = 0; l <= *pos; ) {
call = t_next(m, NULL, &l);
call = t_next(m, call, &l);
if (!call)
break;
}
@@ -316,37 +305,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
struct list_head *list = m->private;
struct ftrace_event_call *call;
struct ftrace_event_call *call = v;
(*pos)++;
retry:
if (list == &ftrace_events)
return NULL;
call = list_entry(list, struct ftrace_event_call, list);
if (!call->enabled) {
list = list->next;
goto retry;
list_for_each_entry_continue(call, &ftrace_events, list) {
if (call->enabled)
return call;
}
m->private = list->next;
return call;
return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_event_call *call = NULL;
struct ftrace_event_call *call;
loff_t l;
mutex_lock(&event_mutex);
m->private = ftrace_events.next;
call = list_entry(&ftrace_events, struct ftrace_event_call, list);
for (l = 0; l <= *pos; ) {
call = s_next(m, NULL, &l);
call = s_next(m, call, &l);
if (!call)
break;
}

View File

@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
seq_print_ip_sym(seq, it->from, symflags) &&
trace_seq_printf(seq, "\n"))
return TRACE_TYPE_HANDLED;
return TRACE_TYPE_PARTIAL_LINE;;
return TRACE_TYPE_PARTIAL_LINE;
}
return TRACE_TYPE_UNHANDLED;
}

View File

@@ -1,218 +0,0 @@
/*
* ring buffer based C-state tracer
*
* Arjan van de Ven <arjan@linux.intel.com>
* Copyright (C) 2008 Intel Corporation
*
* Much is borrowed from trace_boot.c which is
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*
*/
#include <linux/init.h>
#include <linux/debugfs.h>
#include <trace/power.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include "trace.h"
#include "trace_output.h"
static struct trace_array *power_trace;
static int __read_mostly trace_power_enabled;
static void probe_power_start(struct power_trace *it, unsigned int type,
unsigned int level)
{
if (!trace_power_enabled)
return;
memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
}
static void probe_power_end(struct power_trace *it)
{
struct ftrace_event_call *call = &event_power;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct trace_power *entry;
struct trace_array_cpu *data;
struct trace_array *tr = power_trace;
if (!trace_power_enabled)
return;
buffer = tr->buffer;
preempt_disable();
it->end = ktime_get();
data = tr->data[smp_processor_id()];
event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
entry->state_data = *it;
if (!filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, 0, 0);
out:
preempt_enable();
}
static void probe_power_mark(struct power_trace *it, unsigned int type,
unsigned int level)
{
struct ftrace_event_call *call = &event_power;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct trace_power *entry;
struct trace_array_cpu *data;
struct trace_array *tr = power_trace;
if (!trace_power_enabled)
return;
buffer = tr->buffer;
memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
preempt_disable();
it->end = it->stamp;
data = tr->data[smp_processor_id()];
event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
entry->state_data = *it;
if (!filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, 0, 0);
out:
preempt_enable();
}
static int tracing_power_register(void)
{
int ret;
ret = register_trace_power_start(probe_power_start);
if (ret) {
pr_info("power trace: Couldn't activate tracepoint"
" probe to trace_power_start\n");
return ret;
}
ret = register_trace_power_end(probe_power_end);
if (ret) {
pr_info("power trace: Couldn't activate tracepoint"
" probe to trace_power_end\n");
goto fail_start;
}
ret = register_trace_power_mark(probe_power_mark);
if (ret) {
pr_info("power trace: Couldn't activate tracepoint"
" probe to trace_power_mark\n");
goto fail_end;
}
return ret;
fail_end:
unregister_trace_power_end(probe_power_end);
fail_start:
unregister_trace_power_start(probe_power_start);
return ret;
}
static void start_power_trace(struct trace_array *tr)
{
trace_power_enabled = 1;
}
static void stop_power_trace(struct trace_array *tr)
{
trace_power_enabled = 0;
}
static void power_trace_reset(struct trace_array *tr)
{
trace_power_enabled = 0;
unregister_trace_power_start(probe_power_start);
unregister_trace_power_end(probe_power_end);
unregister_trace_power_mark(probe_power_mark);
}
static int power_trace_init(struct trace_array *tr)
{
power_trace = tr;
trace_power_enabled = 1;
tracing_power_register();
tracing_reset_online_cpus(tr);
return 0;
}
static enum print_line_t power_print_line(struct trace_iterator *iter)
{
int ret = 0;
struct trace_entry *entry = iter->ent;
struct trace_power *field ;
struct power_trace *it;
struct trace_seq *s = &iter->seq;
struct timespec stamp;
struct timespec duration;
trace_assign_type(field, entry);
it = &field->state_data;
stamp = ktime_to_timespec(it->stamp);
duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
if (entry->type == TRACE_POWER) {
if (it->type == POWER_CSTATE)
ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
stamp.tv_sec,
stamp.tv_nsec,
it->state, iter->cpu,
duration.tv_sec,
duration.tv_nsec);
if (it->type == POWER_PSTATE)
ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
stamp.tv_sec,
stamp.tv_nsec,
it->state, iter->cpu);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
return TRACE_TYPE_UNHANDLED;
}
/*
 * Tracer .print_header callback: write the two-line column header of the
 * power tracer output to @s, one seq_puts() call per line.
 */
static void power_print_header(struct seq_file *s)
{
	static const char * const header[] = {
		"# TIMESTAMP STATE EVENT\n",
		"# | | |\n",
	};
	unsigned int i;

	for (i = 0; i < sizeof(header) / sizeof(header[0]); i++)
		seq_puts(s, header[i]);
}
/*
 * Descriptor of the "power" tracer: its name plus the callbacks defined
 * in this file.  It is passed to register_tracer() by init_power_trace().
 */
static struct tracer power_tracer __read_mostly = {
	.name		= "power",
	.init		= power_trace_init,
	.start		= start_power_trace,
	.stop		= stop_power_trace,
	.reset		= power_trace_reset,
	.print_line	= power_print_line,
	.print_header	= power_print_header,
};
/*
 * Initcall that registers the power tracer.
 *
 * Returns whatever register_tracer() returns for power_tracer.
 */
static int init_power_trace(void)
{
	int ret = register_tracer(&power_tracer);

	return ret;
}
device_initcall(init_power_trace);

View File

@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>

View File

@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
int
stack_trace_sysctl(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *lenp,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&stack_sysctl_mutex);
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write ||
(last_stack_tracer_enabled == !!stack_tracer_enabled))

View File

@@ -2,7 +2,7 @@
#include <trace/events/syscalls.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_counter.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>
#include "trace_output.h"
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
static void prof_syscall_enter(struct pt_regs *regs, long id)
{
struct syscall_trace_enter *rec;
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
unsigned long flags;
char *raw_data;
int syscall_nr;
int size;
int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
do {
char raw_data[size];
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
return;
/* zero the dead bytes from align to not leak stack to user */
*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
/* Protect the per cpu buffer, begin the rcu read side */
local_irq_save(flags);
rec = (struct syscall_trace_enter *) raw_data;
tracing_generic_entry_update(&rec->ent, 0, 0);
rec->ent.type = sys_data->enter_id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
} while(0);
cpu = smp_processor_id();
if (in_nmi())
raw_data = rcu_dereference(trace_profile_buf_nmi);
else
raw_data = rcu_dereference(trace_profile_buf);
if (!raw_data)
goto end;
raw_data = per_cpu_ptr(raw_data, cpu);
/* zero the dead bytes from align to not leak stack to user */
*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
rec = (struct syscall_trace_enter *) raw_data;
tracing_generic_entry_update(&rec->ent, 0, 0);
rec->ent.type = sys_data->enter_id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
end:
local_irq_restore(flags);
}
int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
static void prof_syscall_exit(struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit rec;
struct syscall_trace_exit *rec;
unsigned long flags;
int syscall_nr;
char *raw_data;
int size;
int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
if (!sys_data)
return;
tracing_generic_entry_update(&rec.ent, 0, 0);
rec.ent.type = sys_data->exit_id;
rec.nr = syscall_nr;
rec.ret = syscall_get_return_value(current, regs);
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
/*
* Impossible, but be paranoid with the future
* How to put this check outside runtime?
*/
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"exit event has grown above profile buffer size"))
return;
/* Protect the per cpu buffer, begin the rcu read side */
local_irq_save(flags);
cpu = smp_processor_id();
if (in_nmi())
raw_data = rcu_dereference(trace_profile_buf_nmi);
else
raw_data = rcu_dereference(trace_profile_buf);
if (!raw_data)
goto end;
raw_data = per_cpu_ptr(raw_data, cpu);
/* zero the dead bytes from align to not leak stack to user */
*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
rec = (struct syscall_trace_exit *)raw_data;
tracing_generic_entry_update(&rec->ent, 0, 0);
rec->ent.type = sys_data->exit_id;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
end:
local_irq_restore(flags);
}
int reg_prof_syscall_exit(char *name)

View File

@@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
/*
* Note about RCU :
* It is used to to delay the free of multiple probes array until a quiescent
* It is used to delay the free of multiple probes array until a quiescent
* state is reached.
* Tracepoint entries modifications are protected by the tracepoints_mutex.
*/

View File

@@ -4,7 +4,6 @@
*/
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

View File

@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
static int proc_do_uts_string(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
memcpy(&uts_table, table, sizeof(uts_table));
uts_table.data = get_uts(table, write);
r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
put_uts(table, write, uts_table.data);
return r;
}