Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6

Conflicts:

	sound/core/memalloc.c
This commit is contained in:
David S. Miller
2008-10-11 12:39:35 -07:00
Current commit 56c5d900db
3711 changed files with 190093 additions and 87746 deletions

View file

@@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
*/
void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
{
struct cgroup *oldcgrp, *newcgrp;
struct cgroup *oldcgrp, *newcgrp = NULL;
if (need_mm_owner_callback) {
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
oldcgrp = task_cgroup(old, ss->subsys_id);
newcgrp = task_cgroup(new, ss->subsys_id);
if (new)
newcgrp = task_cgroup(new, ss->subsys_id);
if (oldcgrp == newcgrp)
continue;
if (ss->mm_owner_changed)
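
Note: the rewritten hunk initializes newcgrp to NULL and only calls task_cgroup() when new is non-NULL; mm_update_next_owner() (see the exit.c hunk below) can now invoke this callback with new == NULL, which the old code would have dereferenced. A minimal userspace sketch of the guard, with hypothetical stand-in types rather than the kernel API:

    #include <stddef.h>
    #include <stdio.h>

    struct cgroup { int id; };
    struct task { struct cgroup *cg; };

    /* hypothetical stand-in for task_cgroup() */
    static struct cgroup *task_cgroup(struct task *t) { return t->cg; }

    static void owner_changed(struct task *old, struct task *new)
    {
        struct cgroup *oldcg, *newcg = NULL;   /* initialize: new may be NULL */

        oldcg = task_cgroup(old);
        if (new)                               /* the guard added by the fix */
            newcg = task_cgroup(new);
        if (oldcg == newcg)
            return;
        printf("owner moved out of cgroup %d\n", oldcg->id);
    }

    int main(void)
    {
        struct cgroup cg = { 1 };
        struct task old = { &cg };
        owner_changed(&old, NULL);             /* safe: no NULL dereference */
        return 0;
    }
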

View file

@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
struct take_cpu_down_param *param = _param;
int err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
@@ -453,6 +454,25 @@ out:
}
#endif /* CONFIG_PM_SLEEP_SMP */
/**
* notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
* @cpu: cpu that just started
*
* This function calls the cpu_chain notifiers with CPU_STARTING.
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
void notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
#ifdef CONFIG_PM_SLEEP_SMP
if (cpu_isset(cpu, frozen_cpus))
val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
}
#endif /* CONFIG_SMP */
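
notify_cpu_starting() is a thin wrapper that fires CPU_STARTING (or CPU_STARTING_FROZEN during resume) down the raw notifier chain on the new CPU, before that CPU enables interrupts. A self-contained, single-threaded sketch of the chain mechanics it relies on, with illustrative names only (not the kernel implementation):

    #include <stdio.h>

    /* a minimal sketch of a raw notifier chain */
    struct notifier_block {
        int (*call)(unsigned long val, void *data);
        struct notifier_block *next;
    };

    static struct notifier_block *cpu_chain;

    static void chain_register(struct notifier_block *nb)
    {
        nb->next = cpu_chain;
        cpu_chain = nb;
    }

    static void chain_call(unsigned long val, void *data)
    {
        struct notifier_block *nb;
        for (nb = cpu_chain; nb; nb = nb->next)
            nb->call(val, data);
    }

    #define CPU_STARTING 1UL

    static int starting_cb(unsigned long val, void *data)
    {
        if (val == CPU_STARTING)
            printf("cpu %ld starting\n", (long)data);
        return 0;
    }

    int main(void)
    {
        struct notifier_block nb = { starting_cb, NULL };
        chain_register(&nb);
        /* arch boot code would call this on the new cpu, IRQs still off */
        chain_call(CPU_STARTING, (void *)(long)1);
        return 0;
    }
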
/*

View file

@@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* that has tasks along with an empty 'mems'. But if we did see such
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
*/
static void scan_for_empty_cpusets(const struct cpuset *root)
static void scan_for_empty_cpusets(struct cpuset *root)
{
LIST_HEAD(queue);
struct cpuset *cp; /* scans cpusets being updated */

View file

@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
}
return (mem != NULL);
}
EXPORT_SYMBOL(dma_alloc_from_coherent);
/**
* dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
}
return 0;
}
EXPORT_SYMBOL(dma_release_from_coherent);

View file

@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
* If there are other users of the mm and the owner (us) is exiting
* we need to find a new owner to take on the responsibility.
*/
if (!mm)
return 0;
if (atomic_read(&mm->mm_users) <= 1)
return 0;
if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
} while_each_thread(g, c);
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL,
* so that subsystems can understand the callback and take action.
*/
down_write(&mm->mmap_sem);
cgroup_mm_owner_callbacks(mm->owner, NULL);
mm->owner = NULL;
up_write(&mm->mmap_sem);
return;
assign_new_owner:
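
The new block publishes a NULL mm owner under the mmap_sem write lock, after notifying subsystems, so racing readers (swapoff, /proc, ptrace, page migration) see either the old owner or NULL, never a stale pointer. A userspace analogue of the same publish-under-writer-lock shape, using a pthread rwlock (illustrative names, link with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    struct task { int pid; };
    struct mm {
        pthread_rwlock_t lock;
        struct task *owner;
    };

    static void owner_callback(struct task *old, struct task *new)
    {
        printf("owner %d -> %s\n", old->pid, new ? "task" : "NULL");
    }

    static void drop_owner(struct mm *mm)
    {
        pthread_rwlock_wrlock(&mm->lock);
        owner_callback(mm->owner, NULL);  /* let subsystems react first */
        mm->owner = NULL;                 /* then publish the NULL owner */
        pthread_rwlock_unlock(&mm->lock);
    }

    int main(void)
    {
        struct task t = { 42 };
        struct mm mm = { PTHREAD_RWLOCK_INITIALIZER, &t };
        drop_owner(&mm);
        return 0;
    }
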

View file

@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
*/
BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
return 1;
case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
case HRTIMER_CB_IRQSAFE_PERCPU:
case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
* This is solely for the sched tick emulation with
* dynamic tick support to ensure that we do not
* restart the tick right on the edge and end up with
* the tick timer in the softirq ! The calling site
* takes care of this.
* takes care of this. Also used for the hrtimer sleeper!
*/
debug_hrtimer_deactivate(timer);
return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
timer_stats_account_hrtimer(timer);
fn = timer->function;
if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
/*
* Used for scheduler timers, avoid lock inversion with
* rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
#endif
}
@@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base)
static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base, int dcpu)
{
struct hrtimer *timer;
struct rb_node *node;
int raise = 0;
while ((node = rb_first(&old_base->active))) {
timer = rb_entry(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_hrtimer_deactivate(timer);
__remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
/*
* Should not happen. Per CPU timers should be
* canceled _before_ the migration code is called
*/
if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
__remove_hrtimer(timer, old_base,
HRTIMER_STATE_INACTIVE, 0);
WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
timer, timer->function, dcpu);
continue;
}
/*
* Mark it as STATE_MIGRATE not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
timer->base = new_base;
/*
* Enqueue the timer. Allow reprogramming of the event device
*/
enqueue_hrtimer(timer, new_base, 1);
#ifdef CONFIG_HIGH_RES_TIMERS
/*
* Happens with high res enabled when the timer was
* already expired and the callback mode is
* HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
* enqueue code does not move them to the soft irq
* pending list for performance/latency reasons, but
* in the migration state, we need to do that
* otherwise we end up with a stale timer.
*/
if (timer->state == HRTIMER_STATE_MIGRATE) {
timer->state = HRTIMER_STATE_PENDING;
list_add_tail(&timer->cb_entry,
&new_base->cpu_base->cb_pending);
raise = 1;
}
#endif
/* Clear the migration state bit */
timer->state &= ~HRTIMER_STATE_MIGRATE;
}
return raise;
}
#ifdef CONFIG_HIGH_RES_TIMERS
static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
struct hrtimer_cpu_base *new_base)
{
struct hrtimer *timer;
int raise = 0;
while (!list_empty(&old_base->cb_pending)) {
timer = list_entry(old_base->cb_pending.next,
struct hrtimer, cb_entry);
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
timer->base = &new_base->clock_base[timer->base->index];
list_add_tail(&timer->cb_entry, &new_base->cb_pending);
raise = 1;
}
return raise;
}
#else
static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
struct hrtimer_cpu_base *new_base)
{
return 0;
}
#endif
static void migrate_hrtimers(int cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
int i;
int i, raise = 0;
BUG_ON(cpu_online(cpu));
old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
&new_base->clock_base[i]);
if (migrate_hrtimer_list(&old_base->clock_base[i],
&new_base->clock_base[i], cpu))
raise = 1;
}
if (migrate_hrtimer_pending(old_base, new_base))
raise = 1;
spin_unlock(&old_base->lock);
spin_unlock(&new_base->lock);
local_irq_enable();
put_cpu_var(hrtimer_bases);
if (raise)
hrtimer_raise_softirq();
}
#endif /* CONFIG_HOTPLUG_CPU */
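
The migration path above marks each timer HRTIMER_STATE_MIGRATE while it is off both bases, so a concurrent observer never sees it as inactive, parks already-expired unlocked timers on the pending list, and collects a raise flag so the softirq is raised once at the end. A condensed, single-threaded sketch of that shape (plain lists instead of rbtrees, names illustrative):

    #include <stdio.h>

    enum state { INACTIVE, ENQUEUED, MIGRATE, PENDING };

    struct timer {
        enum state state;
        int expired_unlocked;     /* stands in for the IRQSAFE_UNLOCKED case */
        struct timer *next;
    };

    /* move timers from a dead base to the new one; return 1 if the caller
     * must raise the softirq for timers parked on the pending list */
    static int migrate_list(struct timer **old, struct timer **new_base,
                            struct timer **pending)
    {
        int raise = 0;
        while (*old) {
            struct timer *t = *old;
            *old = t->next;
            t->state = MIGRATE;           /* never visibly INACTIVE */
            if (t->expired_unlocked) {
                t->state = PENDING;       /* park it for the softirq */
                t->next = *pending;
                *pending = t;
                raise = 1;
                continue;
            }
            t->state = ENQUEUED;
            t->next = *new_base;
            *new_base = t;
        }
        return raise;
    }

    int main(void)
    {
        struct timer a = { ENQUEUED, 1, NULL }, b = { ENQUEUED, 0, &a };
        struct timer *old = &b, *nb = NULL, *pend = NULL;
        if (migrate_list(&old, &nb, &pend))
            printf("raise softirq\n");
        return 0;
    }
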

View file

@@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
set_balance_irq_affinity(irq, cpumask);
#ifdef CONFIG_GENERIC_PENDING_IRQ
set_pending_irq(irq, cpumask);
if (desc->status & IRQ_MOVE_PCNTXT) {
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
desc->chip->set_affinity(irq, cpumask);
spin_unlock_irqrestore(&desc->lock, flags);
} else
set_pending_irq(irq, cpumask);
#else
desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
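
With IRQ_MOVE_PCNTXT set, the affinity change is applied immediately from process context under desc->lock; otherwise it is deferred to the next interrupt via set_pending_irq(). A small sketch of that immediate-versus-deferred decision (stand-in types, not the genirq API):

    #include <stdio.h>

    #define MOVE_IN_PROCESS_CONTEXT 0x1   /* stands in for IRQ_MOVE_PCNTXT */

    struct irq_desc {
        unsigned int status;
        unsigned long pending_mask;
        unsigned long affinity;
    };

    static void apply_affinity(struct irq_desc *d, unsigned long mask)
    {
        d->affinity = mask;               /* chip->set_affinity() stand-in */
    }

    static void set_affinity(struct irq_desc *d, unsigned long mask)
    {
        if (d->status & MOVE_IN_PROCESS_CONTEXT)
            apply_affinity(d, mask);      /* safe now: do it immediately */
        else
            d->pending_mask = mask;       /* defer to the next interrupt */
    }

    int main(void)
    {
        struct irq_desc d = { MOVE_IN_PROCESS_CONTEXT, 0, 0 };
        set_affinity(&d, 0x2);
        printf("affinity %#lx pending %#lx\n", d.affinity, d.pending_mask);
        return 0;
    }
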

View file

@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
*old = addr | (*old & ~PAGE_MASK);
/* The old page I have found cannot be a
* destination page, so return it.
* destination page, so return it if its
* gfp_flags honor the ones passed in.
*/
if (!(gfp_mask & __GFP_HIGHMEM) &&
PageHighMem(old_page)) {
kimage_free_pages(old_page);
continue;
}
addr = old_addr;
page = old_page;
break;
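
The added check makes the recycled destination page honor the caller's allocation constraints: a highmem page cannot be handed back when the caller did not pass __GFP_HIGHMEM, so it is freed and the loop retries. A tiny sketch of that validity test (hypothetical flag value):

    #include <stdio.h>

    #define GFP_HIGHMEM 0x1

    struct page { int highmem; };

    static int page_ok_for(unsigned int gfp_mask, struct page *p)
    {
        /* a highmem page only qualifies if the caller allowed highmem */
        if (!(gfp_mask & GFP_HIGHMEM) && p->highmem)
            return 0;
        return 1;
    }

    int main(void)
    {
        struct page high = { 1 };
        printf("%d\n", page_ok_for(0, &high));           /* 0: reject, retry */
        printf("%d\n", page_ok_for(GFP_HIGHMEM, &high)); /* 1: accept */
        return 0;
    }
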

View file

@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
if (err)
return err;
if (CACHE_FLUSH_IS_SAFE)
flush_icache_range(addr, addr + length + 1);
flush_icache_range(addr, addr + length);
return 0;
}
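
The flush_icache_range() fix is an off-by-one: the end address is exclusive, so addr + length + 1 flushed one byte too many and could touch past the valid region. A trivial illustration of the exclusive-end convention:

    #include <stdio.h>

    /* exclusive-end convention: [start, end) covers exactly end - start bytes */
    static unsigned long range_bytes(unsigned long start, unsigned long end)
    {
        return end - start;
    }

    int main(void)
    {
        unsigned long addr = 0x1000, length = 16;
        printf("%lu\n", range_bytes(addr, addr + length));     /* 16: correct */
        printf("%lu\n", range_bytes(addr, addr + length + 1)); /* 17: one too many */
        return 0;
    }
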
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
/* Signal the primary CPU that we are done: */
atomic_set(&cpu_in_kgdb[cpu], 0);
touch_softlockup_watchdog();
clocksource_touch_watchdog();
local_irq_restore(flags);
}
@@ -1432,6 +1433,7 @@ acquirelock:
atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
atomic_set(&kgdb_active, -1);
touch_softlockup_watchdog();
clocksource_touch_watchdog();
local_irq_restore(flags);
@@ -1462,7 +1464,7 @@ acquirelock:
* Get the passive CPU lock which will hold all the non-primary
* CPU in a spin state while the debugger is active
*/
if (!kgdb_single_step || !kgdb_contthread) {
if (!kgdb_single_step) {
for (i = 0; i < NR_CPUS; i++)
atomic_set(&passive_cpu_wait[i], 1);
}
@@ -1475,7 +1477,7 @@ acquirelock:
#ifdef CONFIG_SMP
/* Signal the other CPUs to enter kgdb_wait() */
if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
if ((!kgdb_single_step) && kgdb_do_roundup)
kgdb_roundup_cpus(flags);
#endif
@@ -1494,7 +1496,7 @@ acquirelock:
kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
kgdb_deactivate_sw_breakpoints();
kgdb_single_step = 0;
kgdb_contthread = NULL;
kgdb_contthread = current;
exception_level = 0;
/* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1510,7 @@ acquirelock:
kgdb_info[ks->cpu].task = NULL;
atomic_set(&cpu_in_kgdb[ks->cpu], 0);
if (!kgdb_single_step || !kgdb_contthread) {
if (!kgdb_single_step) {
for (i = NR_CPUS-1; i >= 0; i--)
atomic_set(&passive_cpu_wait[i], 0);
/*
@@ -1524,6 +1526,7 @@ acquirelock:
kgdb_restore:
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
touch_softlockup_watchdog();
clocksource_touch_watchdog();
local_irq_restore(flags);

View file

@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
kmem_cache_free(posix_timers_cache, tmr);
tmr = NULL;
return NULL;
}
memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
return tmr;
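
This one-liner fixes a NULL dereference: the old code set tmr = NULL and then fell through to the memset() of tmr->sigq->info. Returning early skips the memset. A self-contained userspace sketch of the corrected allocation shape (illustrative types):

    #include <stdlib.h>
    #include <string.h>

    struct sigq { char info[16]; };
    struct k_itimer { struct sigq *sigq; };

    /* early return on a failed sub-allocation; falling through and using
     * tmr after setting it to NULL was the bug the hunk fixes */
    static struct k_itimer *alloc_timer(void)
    {
        struct k_itimer *tmr = malloc(sizeof(*tmr));
        if (!tmr)
            return NULL;
        tmr->sigq = malloc(sizeof(*tmr->sigq));
        if (!tmr->sigq) {
            free(tmr);
            return NULL;        /* don't fall through to the memset below */
        }
        memset(tmr->sigq->info, 0, sizeof(tmr->sigq->info));
        return tmr;
    }

    int main(void)
    {
        struct k_itimer *t = alloc_timer();
        if (t) { free(t->sigq); free(t); }
        return 0;
    }
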

View file

@@ -47,6 +47,7 @@
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
.pending = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
.pending = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
{
int cpu;
cpumask_t cpumask;
unsigned long flags;
set_need_resched();
spin_lock_irqsave(&rcp->lock, flags);
if (unlikely(!rcp->signaled)) {
rcp->signaled = 1;
/*
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
for_each_cpu_mask_nr(cpu, cpumask)
smp_send_reschedule(cpu);
}
spin_unlock_irqrestore(&rcp->lock, flags);
}
#else
static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
}
#endif
static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
long batch;
head->next = NULL;
smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
/*
* Determine the batch number of this callback.
*
* Using ACCESS_ONCE to avoid the following error when gcc eliminates
* local variable "batch" and emits code like this:
* 1) rdp->batch = rcp->cur + 1 # gets old value
* ......
* 2) rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
* then [*nxttail[0], *nxttail[1]) may contain callbacks
* that batch# = rdp->batch, see the comment of struct rcu_data.
*/
batch = ACCESS_ONCE(rcp->cur) + 1;
if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
/* process callbacks */
rdp->nxttail[0] = rdp->nxttail[1];
rdp->nxttail[1] = rdp->nxttail[2];
if (rcu_batch_after(batch - 1, rdp->batch))
rdp->nxttail[0] = rdp->nxttail[2];
}
rdp->batch = batch;
*rdp->nxttail[2] = head;
rdp->nxttail[2] = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_ctrlblk);
}
}
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
{
rcp->gp_start = jiffies;
rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
}
static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
{
int cpu;
long delta;
unsigned long flags;
/* Only let one CPU complain about others per time interval. */
spin_lock_irqsave(&rcp->lock, flags);
delta = jiffies - rcp->jiffies_stall;
if (delta < 2 || rcp->cur != rcp->completed) {
spin_unlock_irqrestore(&rcp->lock, flags);
return;
}
rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
spin_unlock_irqrestore(&rcp->lock, flags);
/* OK, time to rat on our buddy... */
printk(KERN_ERR "RCU detected CPU stalls:");
for_each_possible_cpu(cpu) {
if (cpu_isset(cpu, rcp->cpumask))
printk(" %d", cpu);
}
printk(" (detected by %d, t=%ld jiffies)\n",
smp_processor_id(), (long)(jiffies - rcp->gp_start));
}
static void print_cpu_stall(struct rcu_ctrlblk *rcp)
{
unsigned long flags;
printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
smp_processor_id(), jiffies,
jiffies - rcp->gp_start);
dump_stack();
spin_lock_irqsave(&rcp->lock, flags);
if ((long)(jiffies - rcp->jiffies_stall) >= 0)
rcp->jiffies_stall =
jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
spin_unlock_irqrestore(&rcp->lock, flags);
set_need_resched(); /* kick ourselves to get things going. */
}
static void check_cpu_stall(struct rcu_ctrlblk *rcp)
{
long delta;
delta = jiffies - rcp->jiffies_stall;
if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rcp);
} else if (rcp->cur != rcp->completed && delta >= 2) {
/* They had two seconds to dump stack, so complain. */
print_other_cpu_stall(rcp);
}
}
#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
{
}
static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
{
}
#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_ctrlblk);
}
__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_bh_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_bh_ctrlblk);
}
__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
static inline void raise_rcu_softirq(void)
{
raise_softirq(RCU_SOFTIRQ);
/*
* The smp_mb() here is required to ensure that this cpu's
* __rcu_process_callbacks() reads the most recently updated
* value of rcu->cur.
*/
smp_mb();
}
/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *next, *list;
int count = 0;
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
rdp->donelist = list;
local_irq_disable();
local_irq_save(flags);
rdp->qlen -= count;
local_irq_enable();
local_irq_restore(flags);
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
* period (if necessary).
*/
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
if (rcp->next_pending &&
if (rcp->cur != rcp->pending &&
rcp->completed == rcp->cur) {
rcp->next_pending = 0;
/*
* next_pending == 0 must be visible in
* __rcu_process_callbacks() before it can see new value of cur.
*/
smp_wmb();
rcp->cur++;
record_gp_stall_check_time(rcp);
/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
unsigned long flags;
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
return;
rdp->qs_pending = 0;
spin_lock(&rcp->lock);
spin_lock_irqsave(&rcp->lock, flags);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp);
spin_unlock(&rcp->lock);
spin_unlock_irqrestore(&rcp->lock, flags);
}
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
* which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
struct rcu_head **tail)
struct rcu_head **tail, long batch)
{
local_irq_disable();
*this_rdp->nxttail = list;
if (list)
this_rdp->nxttail = tail;
local_irq_enable();
unsigned long flags;
if (list) {
local_irq_save(flags);
this_rdp->batch = batch;
*this_rdp->nxttail[2] = list;
this_rdp->nxttail[2] = tail;
local_irq_restore(flags);
}
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* if the cpu going offline owns the grace period
unsigned long flags;
/*
* if the cpu going offline owns the grace period
* we can block indefinitely waiting for it, so flush
* it here
*/
spin_lock_bh(&rcp->lock);
spin_lock_irqsave(&rcp->lock, flags);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp);
spin_unlock_bh(&rcp->lock);
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
spin_unlock(&rcp->lock);
local_irq_disable();
this_rdp->qlen += rdp->qlen;
local_irq_enable();
local_irq_restore(flags);
}
static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
*rdp->donetail = rdp->curlist;
rdp->donetail = rdp->curtail;
rdp->curlist = NULL;
rdp->curtail = &rdp->curlist;
}
unsigned long flags;
long completed_snap;
if (rdp->nxtlist && !rdp->curlist) {
local_irq_disable();
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
rdp->nxttail = &rdp->nxtlist;
local_irq_enable();
if (rdp->nxtlist) {
local_irq_save(flags);
completed_snap = ACCESS_ONCE(rcp->completed);
/*
* start the next batch of callbacks
* move the other grace-period-completed entries to
* [rdp->nxtlist, *rdp->nxttail[0]) temporarily
*/
if (!rcu_batch_before(completed_snap, rdp->batch))
rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
rdp->nxttail[0] = rdp->nxttail[1];
/* determine batch number */
rdp->batch = rcp->cur + 1;
/* see the comment and corresponding wmb() in
* the rcu_start_batch()
/*
* the grace period for entries in
* [rdp->nxtlist, *rdp->nxttail[0]) has completed and
* move these entries to donelist
*/
smp_rmb();
if (rdp->nxttail[0] != &rdp->nxtlist) {
*rdp->donetail = rdp->nxtlist;
rdp->donetail = rdp->nxttail[0];
rdp->nxtlist = *rdp->nxttail[0];
*rdp->donetail = NULL;
if (rdp->nxttail[1] == rdp->nxttail[0])
rdp->nxttail[1] = &rdp->nxtlist;
if (rdp->nxttail[2] == rdp->nxttail[0])
rdp->nxttail[2] = &rdp->nxtlist;
rdp->nxttail[0] = &rdp->nxtlist;
}
local_irq_restore(flags);
if (rcu_batch_after(rdp->batch, rcp->pending)) {
unsigned long flags2;
if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
spin_lock(&rcp->lock);
rcp->next_pending = 1;
rcu_start_batch(rcp);
spin_unlock(&rcp->lock);
spin_lock_irqsave(&rcp->lock, flags2);
if (rcu_batch_after(rdp->batch, rcp->pending)) {
rcp->pending = rdp->batch;
rcu_start_batch(rcp);
}
spin_unlock_irqrestore(&rcp->lock, flags2);
}
}
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
static void rcu_process_callbacks(struct softirq_action *unused)
{
/*
* Memory references from any prior RCU read-side critical sections
* executed by the interrupted code must be seen before any RCU
* grace-period manipulations below.
*/
smp_mb(); /* See above block comment. */
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
/*
* Memory references from any later RCU read-side critical sections
* executed by the interrupted code must be seen after any RCU
* grace-period manipulations above.
*/
smp_mb(); /* See above block comment. */
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
return 1;
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rcp);
/* This cpu has no pending entries, but there are new entries */
if (!rdp->curlist && rdp->nxtlist)
return 1;
if (rdp->nxtlist) {
long completed_snap = ACCESS_ONCE(rcp->completed);
/*
* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
if (!rcu_batch_before(completed_snap, rdp->batch))
return 1;
if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
rdp->nxttail[0] != rdp->nxttail[1])
return 1;
if (rdp->nxttail[0] != &rdp->nxtlist)
return 1;
/*
* This cpu has pending rcu entries and the new batch
* for them hasn't been started or scheduled to start
*/
if (rcu_batch_after(rdp->batch, rcp->pending))
return 1;
}
/* This cpu has finished callbacks to invoke */
if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
}
/*
* Top-level function driving RCU grace-period detection, normally
* invoked from the scheduler-clock interrupt. This function simply
* increments counters that are read only from softirq by this same
* CPU, so there are no memory barriers required.
*/
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
unsigned long flags;
spin_lock_irqsave(&rcp->lock, flags);
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
rdp->blimit = blimit;
spin_unlock_irqrestore(&rcp->lock, flags);
}
static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
*/
void __init __rcu_init(void)
{
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */

View file

@@ -58,14 +58,6 @@
#include <linux/cpumask.h>
#include <linux/rcupreempt_trace.h>
/*
* Macro that prevents the compiler from reordering accesses, but does
* absolutely -nothing- to prevent CPUs from reordering. This is used
* only to mediate communication between mainline code and hardware
* interrupt and NMI handlers.
*/
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
/*
* PREEMPT_RCU data structures.
*/
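
The hunk deletes rcupreempt's private copy of ACCESS_ONCE() now that a common definition exists. The macro forces the compiler to perform exactly one load or store through a volatile-qualified pointer, which is what the __call_rcu() batch computation above depends on. A minimal demonstration (typeof is a GCC/Clang extension, as in the kernel):

    #include <stdio.h>

    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    static long cur = 5;

    int main(void)
    {
        /* read cur exactly once; without the volatile cast the compiler is
         * free to re-read cur later and observe a different value */
        long batch = ACCESS_ONCE(cur) + 1;
        printf("%ld\n", batch);
        return 0;
    }
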

View file

@@ -308,11 +308,16 @@ out:
static int __init rcupreempt_trace_init(void)
{
int ret;
mutex_init(&rcupreempt_trace_mutex);
rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
if (!rcupreempt_trace_buf)
return 1;
return rcupreempt_debugfs_init();
ret = rcupreempt_debugfs_init();
if (ret)
kfree(rcupreempt_trace_buf);
return ret;
}
static void __exit rcupreempt_trace_cleanup(void)

View file

@@ -516,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
return result;
}
static void __init __reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
{
struct resource *parent = root;
struct resource *conflict;
struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
if (!res)
return;
res->name = name;
res->start = start;
res->end = end;
res->flags = IORESOURCE_BUSY;
for (;;) {
conflict = __request_resource(parent, res);
if (!conflict)
break;
if (conflict != parent) {
parent = conflict;
if (!(conflict->flags & IORESOURCE_BUSY))
continue;
}
/* Uhhuh, that didn't work out.. */
kfree(res);
res = NULL;
break;
}
if (!res) {
printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
conflict->name, conflict->start, conflict->end,
name, start, end);
/* failed, split and try again */
/* conflict covered the whole area */
if (conflict->start <= start && conflict->end >= end)
return;
if (conflict->start > start)
__reserve_region_with_split(root, start, conflict->start-1, name);
if (!(conflict->flags & IORESOURCE_BUSY)) {
resource_size_t common_start, common_end;
common_start = max(conflict->start, start);
common_end = min(conflict->end, end);
if (common_start < common_end)
__reserve_region_with_split(root, common_start, common_end, name);
}
if (conflict->end < end)
__reserve_region_with_split(root, conflict->end+1, end, name);
}
}
void reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
{
write_lock(&resource_lock);
__reserve_region_with_split(root, start, end, name);
write_unlock(&resource_lock);
}
EXPORT_SYMBOL(adjust_resource);
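
__reserve_region_with_split() recurses around each conflict: if the conflict covers the whole range it gives up, otherwise it reserves the uncovered pieces on either side (and, for non-busy conflicts, the overlap itself). A simplified userspace sketch of the split recursion against a fixed conflict table (illustrative, busy conflicts only):

    #include <stdio.h>

    struct busy { unsigned long start, end; };

    static struct busy conflicts[] = { { 40, 60 } };

    static struct busy *find_conflict(unsigned long s, unsigned long e)
    {
        unsigned int i;
        for (i = 0; i < sizeof(conflicts) / sizeof(conflicts[0]); i++)
            if (conflicts[i].start <= e && conflicts[i].end >= s)
                return &conflicts[i];
        return NULL;
    }

    static void reserve_with_split(unsigned long start, unsigned long end)
    {
        struct busy *c = find_conflict(start, end);
        if (!c) {
            printf("reserved [%lu, %lu]\n", start, end);
            return;
        }
        if (c->start <= start && c->end >= end)
            return;                     /* conflict covers the whole range */
        if (c->start > start)           /* recurse into the left piece */
            reserve_with_split(start, c->start - 1);
        if (c->end < end)               /* recurse into the right piece */
            reserve_with_split(c->end + 1, end);
    }

    int main(void)
    {
        reserve_with_split(0, 100);     /* prints [0, 39] and [61, 100] */
        return 0;
    }
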
/**

View file

@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
}
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
if (rt_b->rt_runtime == RUNTIME_INF)
if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_FAIR_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
@@ -604,9 +609,9 @@ struct rq {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
rq->curr->sched_class->check_preempt_curr(rq, p);
rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
static inline int cpu_of(struct rq *rq)
@@ -1087,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_DONE;
}
static void init_hrtick(void)
static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
static void init_hrtick(void)
static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
@@ -1119,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
#else
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
static inline void init_hrtick(void)
{
}
#endif
#endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
@@ -1380,6 +1385,51 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
update_load_sub(&rq->load, load);
}
#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
rcu_read_lock();
parent = &root_task_group;
down:
ret = (*down)(parent, data);
if (ret)
goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
ret = (*up)(parent, data);
if (ret)
goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
out_unlock:
rcu_read_unlock();
return ret;
}
static int tg_nop(struct task_group *tg, void *data)
{
return 0;
}
#endif
#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
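
walk_tg_tree() generalizes the old shares-only walker: down() runs when a node is first entered, up() when it is left for the last time, and a nonzero return aborts the walk. The kernel version is iterative with gotos to avoid recursion; a recursive rendering of the same contract (illustrative tree type, not struct task_group):

    #include <stdio.h>

    struct node {
        const char *name;
        struct node *child, *sibling;  /* first child / next sibling */
    };

    typedef int (*visitor)(struct node *, void *);

    /* call down() on entry and up() on final exit; stop on nonzero */
    static int walk(struct node *n, visitor down, visitor up, void *data)
    {
        struct node *c;
        int ret = down(n, data);
        if (ret)
            return ret;
        for (c = n->child; c; c = c->sibling) {
            ret = walk(c, down, up, data);
            if (ret)
                return ret;
        }
        return up(n, data);
    }

    static int nop(struct node *n, void *d) { (void)n; (void)d; return 0; }
    static int print_up(struct node *n, void *d)
    {
        (void)d;
        printf("up %s\n", n->name);    /* children before parents */
        return 0;
    }

    int main(void)
    {
        struct node leaf = { "leaf", NULL, NULL };
        struct node root = { "root", &leaf, NULL };
        /* like walk_tg_tree(tg_nop, tg_shares_up, sd): a bottom-up pass */
        return walk(&root, nop, print_up, NULL);
    }
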
@@ -1397,37 +1447,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
static void
walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
{
struct task_group *parent, *child;
rcu_read_lock();
parent = &root_task_group;
down:
(*down)(parent, cpu, sd);
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
(*up)(parent, cpu, sd);
child = parent;
parent = parent->parent;
if (parent)
goto up;
rcu_read_unlock();
}
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
static void
tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
__update_group_shares_cpu(tg, i, shares, rq_weight);
spin_unlock_irqrestore(&rq->lock, flags);
}
return 0;
}
/*
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
static void
tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
}
tg->cfs_rq[cpu]->h_load = load;
}
static void
tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
{
return 0;
}
static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
spin_lock(&rq->lock);
}
static void update_h_load(int cpu)
static void update_h_load(long cpu)
{
walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
if (!match_state || p->state == match_state) {
ncsw = p->nivcsw + p->nvcsw;
if (unlikely(!ncsw))
ncsw = 1;
}
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
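
The new ncsw encoding packs a "state matched" flag into the sign bit of the context-switch count: p->nvcsw | LONG_MIN is nonzero even when the count is zero, so one long replaces the old count-plus-fixup logic. A tiny demonstration (assumes two's complement, as the kernel does):

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
        unsigned long nvcsw = 0;               /* even a zero count works */
        long ncsw = nvcsw | LONG_MIN;          /* sets the MSB: "matched" */

        if (ncsw)                              /* nonzero even when count is 0 */
            printf("matched, count=%ld\n", ncsw & LONG_MAX);
        return 0;
    }
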
/*
@@ -2285,7 +2300,7 @@ out_running:
trace_mark(kernel_sched_wakeup,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
trace_mark(kernel_sched_wakeup_new,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
check_preempt_curr(this_rq, p);
check_preempt_curr(this_rq, p, 0);
}
/*
@@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/**
* complete: - signals a single thread waiting on this completion
* @x: holds the state of this particular completion
*
* This will wake up a single thread waiting on this completion. Threads will be
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
*/
void complete(struct completion *x)
{
unsigned long flags;
@@ -4638,6 +4662,12 @@ void complete(struct completion *x)
}
EXPORT_SYMBOL(complete);
/**
* complete_all: - signals all threads waiting on this completion
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
*/
void complete_all(struct completion *x)
{
unsigned long flags;
@@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
if ((state == TASK_INTERRUPTIBLE &&
signal_pending(current)) ||
(state == TASK_KILLABLE &&
fatal_signal_pending(current))) {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
@@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
return timeout;
}
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout.
*
* See also similar routines (i.e. wait_for_completion_timeout()) with timeout
* and interrupt capability. Also see complete().
*/
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
/**
* wait_for_completion_timeout: - waits for completion of a task (w/timeout)
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
@@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
* This waits for completion of a specific task to be signaled. It is
* interruptible.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
/**
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*/
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
@@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
/**
* wait_for_completion_killable: - waits for completion of a task (killable)
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -5121,7 +5189,8 @@ recheck:
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
@@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p);
check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
@@ -6282,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(12);
struct ctl_table *table = sd_alloc_ctl_entry(13);
if (table == NULL)
return NULL;
@@ -6310,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
/* &table[11] is terminator */
set_table_entry(&table[11], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[12] is terminator */
return table;
}
@@ -7194,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type) sd->name = #type
#else
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
#define SD_INIT(sd, type) sd_init_##type(sd)
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
memset(sd, 0, sizeof(*sd)); \
*sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
SD_INIT_NAME(sd, type); \
}
SD_INIT_FUNC(CPU)
@@ -8242,20 +8321,25 @@ void __might_sleep(char *file, int line)
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((in_atomic() || irqs_disabled()) &&
system_state == SYSTEM_RUNNING && !oops_in_progress) {
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
printk(KERN_ERR "BUG: sleeping function called from invalid"
" context at %s:%d\n", file, line);
printk("in_atomic():%d, irqs_disabled():%d\n",
in_atomic(), irqs_disabled());
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
dump_stack();
}
if ((!in_atomic() && !irqs_disabled()) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
printk(KERN_ERR
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
printk(KERN_ERR
"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
in_atomic(), irqs_disabled(),
current->pid, current->comm);
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
@@ -8753,75 +8837,97 @@ static DEFINE_MUTEX(rt_constraints_mutex);
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
return 1ULL << 16;
return 1ULL << 20;
return div64_u64(runtime << 16, period);
return div64_u64(runtime << 20, period);
}
#ifdef CONFIG_CGROUP_SCHED
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
struct task_group *tgi, *parent = tg->parent;
unsigned long total = 0;
if (!parent) {
if (global_rt_period() < period)
return 0;
return to_ratio(period, runtime) <
to_ratio(global_rt_period(), global_rt_runtime());
}
if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
return 0;
rcu_read_lock();
list_for_each_entry_rcu(tgi, &parent->children, siblings) {
if (tgi == tg)
continue;
total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
tgi->rt_bandwidth.rt_runtime);
}
rcu_read_unlock();
return total + to_ratio(period, runtime) <=
to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
parent->rt_bandwidth.rt_runtime);
}
#elif defined CONFIG_USER_SCHED
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
struct task_group *tgi;
unsigned long total = 0;
unsigned long global_ratio =
to_ratio(global_rt_period(), global_rt_runtime());
rcu_read_lock();
list_for_each_entry_rcu(tgi, &task_groups, list) {
if (tgi == tg)
continue;
total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
tgi->rt_bandwidth.rt_runtime);
}
rcu_read_unlock();
return total + to_ratio(period, runtime) < global_ratio;
}
#endif
/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
do_each_thread(g, p) {
if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
return 1;
} while_each_thread(g, p);
return 0;
}
struct rt_schedulable_data {
struct task_group *tg;
u64 rt_period;
u64 rt_runtime;
};
static int tg_schedulable(struct task_group *tg, void *data)
{
struct rt_schedulable_data *d = data;
struct task_group *child;
unsigned long total, sum = 0;
u64 period, runtime;
period = ktime_to_ns(tg->rt_bandwidth.rt_period);
runtime = tg->rt_bandwidth.rt_runtime;
if (tg == d->tg) {
period = d->rt_period;
runtime = d->rt_runtime;
}
/*
* Cannot have more runtime than the period.
*/
if (runtime > period && runtime != RUNTIME_INF)
return -EINVAL;
/*
* Ensure we don't starve existing RT tasks.
*/
if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
return -EBUSY;
total = to_ratio(period, runtime);
/*
* Nobody can have more than the global setting allows.
*/
if (total > to_ratio(global_rt_period(), global_rt_runtime()))
return -EINVAL;
/*
* The sum of our children's runtime should not exceed our own.
*/
list_for_each_entry_rcu(child, &tg->children, siblings) {
period = ktime_to_ns(child->rt_bandwidth.rt_period);
runtime = child->rt_bandwidth.rt_runtime;
if (child == d->tg) {
period = d->rt_period;
runtime = d->rt_runtime;
}
sum += to_ratio(period, runtime);
}
if (sum > total)
return -EINVAL;
return 0;
}
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
struct rt_schedulable_data data = {
.tg = tg,
.rt_period = period,
.rt_runtime = runtime,
};
return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
static int tg_set_bandwidth(struct task_group *tg,
u64 rt_period, u64 rt_runtime)
{
@@ -8829,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
err = -EBUSY;
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
goto unlock;
}
if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
err = -EINVAL;
goto unlock;
}
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8905,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
static int sched_rt_global_constraints(void)
{
struct task_group *tg = &root_task_group;
u64 rt_runtime, rt_period;
u64 runtime, period;
int ret = 0;
rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
rt_runtime = tg->rt_bandwidth.rt_runtime;
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
runtime = global_rt_runtime();
period = global_rt_period();
/*
* Sanity check on the sysctl variables.
*/
if (runtime > period && runtime != RUNTIME_INF)
return -EINVAL;
mutex_lock(&rt_constraints_mutex);
if (!__rt_schedulable(tg, rt_period, rt_runtime))
ret = -EINVAL;
read_lock(&tasklist_lock);
ret = __rt_schedulable(NULL, 0, 0);
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
@@ -8925,6 +9035,9 @@ static int sched_rt_global_constraints(void)
unsigned long flags;
int i;
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8985,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
@@ -8994,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
/* Bind the cgroup to task_group object we just created */
tg->css.cgroup = cgrp;
return &tg->css;
}

View file

@@ -408,64 +408,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
return __sched_period(nr_running);
}
/*
* The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
* that it favours >=0 over <0.
*
* -20 |
* |
* 0 --------+-------
* .'
* 19 .'
*
*/
static unsigned long
calc_delta_asym(unsigned long delta, struct sched_entity *se)
{
struct load_weight lw = {
.weight = NICE_0_LOAD,
.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
};
for_each_sched_entity(se) {
struct load_weight *se_lw = &se->load;
unsigned long rw = cfs_rq_of(se)->load.weight;
#ifdef CONFIG_FAIR_SCHED_GROUP
struct cfs_rq *cfs_rq = se->my_q;
struct task_group *tg = NULL
if (cfs_rq)
tg = cfs_rq->tg;
if (tg && tg->shares < NICE_0_LOAD) {
/*
* scale shares to what it would have been had
* tg->weight been NICE_0_LOAD:
*
* weight = 1024 * shares / tg->weight
*/
lw.weight *= se->load.weight;
lw.weight /= tg->shares;
lw.inv_weight = 0;
se_lw = &lw;
rw += lw.weight - se->load.weight;
} else
#endif
if (se->load.weight < NICE_0_LOAD) {
se_lw = &lw;
rw += NICE_0_LOAD - se->load.weight;
}
delta = calc_delta_mine(delta, rw, se_lw);
}
return delta;
}
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se))
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, se->load.weight);
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
}
static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se))
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
se->on_rq = 0;
list_del_init(&se->group_node);
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
long more_w;
if (!tg->parent)
return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
if (!wl && sched_feat(ASYM_EFF_LOAD))
return wl;
/*
* Instead of using this increment, also add the difference
* between when the shares were last updated and now.
*/
more_w = se->my_q->load.weight - se->my_q->rq_weight;
wl += more_w;
wg += more_w;
for_each_sched_entity(se) {
#define D(n) (likely(n) ? (n) : 1)
long S, rw, s, a, b;
long more_w;
/*
* Instead of using this increment, also add the difference
* between when the shares were last updated and now.
*/
more_w = se->my_q->load.weight - se->my_q->rq_weight;
wl += more_w;
wg += more_w;
S = se->my_q->tg->shares;
s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
a = S*(rw + wl);
b = S*rw + s*wg;
wl = s*(a-b)/D(b);
wl = s*(a-b);
if (likely(b))
wl /= b;
/*
* Assume the group is already running and will
* thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
* alter the group weight.
*/
wg = 0;
#undef D
}
return wl;
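
In effective_load(), the D(b) macro (which silently substituted 1 for a zero denominator) is replaced by an explicit likely(b) test: the multiply happens first, and the division is simply skipped when b is zero, giving the same result without the macro. A worked numeric check of the per-level update (values chosen for illustration; exact division):

    #include <stdio.h>

    int main(void)
    {
        long S = 1024, rw = 2048, s = 512, wl = 100, wg = 0;
        long a = S * (rw + wl);
        long b = S * rw + s * wg;

        wl = s * (a - b);
        if (b)                       /* replaces the old D(b) macro */
            wl /= b;
        printf("wl=%ld\n", wl);      /* 512 * 102400 / 2097152 = 25 */
        return 0;
    }
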
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
#endif
static int
wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
if (!sync && sched_feat(SYNC_WAKEUPS) &&
curr->se.avg_overlap < sysctl_sched_migration_cost &&
p->se.avg_overlap < sysctl_sched_migration_cost)
sync = 1;
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
* a reasonable amount of time then attract this newly
* woken task:
*/
if (sync && balanced) {
if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
p->se.avg_overlap < sysctl_sched_migration_cost)
return 1;
}
if (sync && balanced)
return 1;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
balanced) {
if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
struct sched_domain *sd, *this_sd = NULL;
int prev_cpu, this_cpu, new_cpu;
unsigned long load, this_load;
struct rq *rq, *this_rq;
struct rq *this_rq;
unsigned int imbalance;
int idx;
prev_cpu = task_cpu(p);
rq = task_rq(p);
this_cpu = smp_processor_id();
this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu;
if (prev_cpu == this_cpu)
goto out;
/*
* 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
load, this_load, imbalance))
return this_cpu;
if (prev_cpu == this_cpu)
goto out;
/*
* Start passive balancing when half the imbalance_pct
* limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
* + nice tasks.
*/
if (sched_feat(ASYM_GRAN))
gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
else
gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
return gran;
}
/*
* Should 'se' preempt 'curr'.
*
* |s1
* |s2
* |s3
* g
* |<--->|c
*
* w(c, s1) = -1
* w(c, s2) = 0
* w(c, s3) = 1
*
*/
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
if (vdiff < 0)
return -1;
gran = wakeup_gran(curr);
if (vdiff > gran)
return 1;
return 0;
}
/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
int depth = 0;
for_each_sched_entity(se)
depth++;
return depth;
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
int se_depth, pse_depth;
s64 delta_exec;
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
@@ -1350,6 +1253,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
cfs_rq_of(pse)->next = pse;
/*
* We can come here with TIF_NEED_RESCHED already set from the new task
* wake up path.
*/
if (test_tsk_need_resched(curr))
return;
/*
* Batch tasks do not preempt (their preemption is driven by
* the tick):
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
if (!sched_feat(WAKEUP_PREEMPT))
return;
/*
* preemption test can be made between sibling entities who are in the
* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
* both tasks until we find their ancestors who are siblings of common
* parent.
*/
/* First walk up until both entities are at same depth */
se_depth = depth_se(se);
pse_depth = depth_se(pse);
while (se_depth > pse_depth) {
se_depth--;
se = parent_entity(se);
if (sched_feat(WAKEUP_OVERLAP) && (sync ||
(se->avg_overlap < sysctl_sched_migration_cost &&
pse->avg_overlap < sysctl_sched_migration_cost))) {
resched_task(curr);
return;
}
while (pse_depth > se_depth) {
pse_depth--;
pse = parent_entity(pse);
}
while (!is_same_group(se, pse)) {
se = parent_entity(se);
pse = parent_entity(pse);
}
if (wakeup_preempt_entity(se, pse) == 1)
delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
if (delta_exec > wakeup_gran(pse))
resched_task(curr);
}
@@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
if (next == &cfs_rq->tasks)
return NULL;
/* Skip over entities that are not tasks */
do {
se = list_entry(next, struct sched_entity, group_node);
next = next->next;
} while (next != &cfs_rq->tasks && !entity_is_task(se));
if (next == &cfs_rq->tasks)
return NULL;
cfs_rq->balance_iterator = next;
if (entity_is_task(se))
p = task_of(se);
se = list_entry(next, struct sched_entity, group_node);
p = task_of(se);
cfs_rq->balance_iterator = next->next;
return p;
}
@@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
rcu_read_lock();
update_h_load(busiest_cpu);
list_for_each_entry(tg, &task_groups, list) {
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
resched_task(rq->curr);
}
enqueue_task_fair(rq, p, 0);
resched_task(rq->curr);
}
/*
@@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
}
/*
@@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
}
/* Account for a task changing its policy or group.

View file

@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)

View file

@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
{
resched_task(rq->idle);
}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
}
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
check_preempt_curr(rq, p, 0);
}
/*

View file

@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
struct sched_rt_entity *rt_se = rt_rq->rt_se;
if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
enqueue_rt_entity(rt_se);
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
if (rt_rq->highest_prio < curr->prio)
resched_task(curr);
}
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
* We ran out of runtime; see if we can borrow some from our neighbours.
*/
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
continue;
spin_lock(&iter->rt_runtime_lock);
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
* indicate it's been disabled and disallow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
/*
* From runqueues with spare time, take 1/n part of their
* spare time, but no more than our period.
*/
diff = iter->rt_runtime - iter->rt_time;
if (diff > 0) {
diff = div_u64((u64)diff, weight);
@@ -274,6 +286,9 @@ next:
return more;
}
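
The borrowing rule in numbers: with n CPUs in the root domain span, a neighbour with spare time gives up 1/n of its spare (its runtime minus what it has consumed), so no single queue is drained. A worked standalone example (hypothetical figures, nanoseconds):

	#include <stdio.h>

	int main(void)
	{
		long long weight = 4;			/* CPUs in rd->span */
		long long iter_runtime = 950000000;	/* neighbour's budget */
		long long iter_rt_time = 350000000;	/* already consumed */

		long long diff = iter_runtime - iter_rt_time;	/* 600ms spare */
		if (diff > 0)
			diff /= weight;			/* borrow 1/4: 150ms */

		printf("borrow %lld ns\n", diff);
		return 0;
	}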
/*
* Ensure this RQ takes back all the runtime it lent to its neighbours.
*/
static void __disable_runtime(struct rq *rq)
{
struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
spin_lock(&rt_b->rt_runtime_lock);
spin_lock(&rt_rq->rt_runtime_lock);
/*
* Either we're all inf and nobody needs to borrow, or we're
* already disabled and thus have nothing to do, or we have
* exactly the right amount of runtime to take out.
*/
if (rt_rq->rt_runtime == RUNTIME_INF ||
rt_rq->rt_runtime == rt_b->rt_runtime)
goto balanced;
spin_unlock(&rt_rq->rt_runtime_lock);
/*
* Calculate the difference between what we started out with
* and what we currently have; that's the amount of runtime
* we lent out and now have to reclaim.
*/
want = rt_b->rt_runtime - rt_rq->rt_runtime;
/*
* Greedy reclaim, take back as much as we can.
*/
for_each_cpu_mask(i, rd->span) {
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
s64 diff;
/*
* Can't reclaim from ourselves or disabled runqueues.
*/
if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
continue;
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
}
spin_lock(&rt_rq->rt_runtime_lock);
/*
* We cannot be left wanting - that would mean some runtime
* leaked out of the system.
*/
BUG_ON(want);
balanced:
/*
* Disable all the borrow logic by pretending we have inf
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
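
The reclaim side of the same ledger: want is exactly what this runqueue lent out, and the greedy loop pulls it back from whichever neighbours still hold spare runtime; the BUG_ON(want) above then asserts that nothing leaked. A toy standalone model (made-up holdings):

	#include <stdio.h>

	int main(void)
	{
		long long want = 200000000;	/* total we lent out, ns */
		long long holders[] = { 120000000, 50000000, 30000000 };
		int i;

		for (i = 0; i < 3 && want > 0; i++) {
			long long diff = holders[i] < want ? holders[i] : want;

			holders[i] -= diff;	/* take back what we can */
			want -= diff;
		}
		printf("left wanting: %lld\n", want);	/* 0, or we'd BUG() */
		return 0;
	}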
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
if (unlikely(!scheduler_running))
return;
/*
* Reset each runqueue's bandwidth settings
*/
for_each_leaf_rt_rq(rt_rq, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -350,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_runtime = rt_b->rt_runtime;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -388,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int i, idle = 1;
cpumask_t span;
if (rt_b->rt_runtime == RUNTIME_INF)
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return 1;
span = sched_rt_period_mask();
@@ -486,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
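
rt_bandwidth_enabled() is introduced elsewhere in this series and is not shown in these hunks. A sketch of its expected shape, assuming it simply tests the global sysctl, where -1 means RT tasks run unthrottled:

	/* sketch only: the real definition lives in kernel/sched.c */
	extern int sysctl_sched_rt_runtime;

	static inline int rt_bandwidth_enabled(void)
	{
		return sysctl_sched_rt_runtime >= 0;
	}

With bandwidth disabled, update_curr_rt() can now return before walking the rt_se hierarchy and taking the per-rq runtime locks.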
@@ -783,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
{
if (p->prio < rq->curr->prio) {
resched_task(rq->curr);

View file

@@ -71,6 +71,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
}
}
/**
* clockevents_shutdown - shutdown the device and clear next_event
* @dev: device to shutdown
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
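
Besides deduplicating the call sites changed below, the helper pairs the mode switch with a next_event reset, so a shut-down device cannot present a stale pending expiry if it is later revived. A userspace model of that invariant (hypothetical struct layout, abbreviated mode constants):

	#include <assert.h>
	#include <stdint.h>

	#define KTIME_MAX INT64_MAX

	enum { CLOCK_EVT_MODE_SHUTDOWN = 1, CLOCK_EVT_MODE_ONESHOT = 3 };

	struct clock_event_device {
		int mode;
		int64_t next_event;	/* pending expiry, ns */
	};

	static void clockevents_shutdown_model(struct clock_event_device *dev)
	{
		dev->mode = CLOCK_EVT_MODE_SHUTDOWN;
		dev->next_event = KTIME_MAX;	/* no stale expiry survives */
	}

	int main(void)
	{
		struct clock_event_device dev = { CLOCK_EVT_MODE_ONESHOT, 12345 };

		clockevents_shutdown_model(&dev);
		assert(dev.next_event == KTIME_MAX);
		return 0;
	}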
/**
* clockevents_program_event - Reprogram the clock event device.
* @expires: absolute expiry time (monotonic clock)
@@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
if (new) {
BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
clockevents_shutdown(new);
}
local_irq_restore(flags);
}

View file

@@ -235,9 +235,9 @@ static void tick_do_broadcast_on_off(void *why)
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
if (!cpu_isset(cpu, tick_broadcast_mask)) {
cpu_set(cpu, tick_broadcast_mask);
if (td->mode == TICKDEV_MODE_PERIODIC)
clockevents_set_mode(dev,
CLOCK_EVT_MODE_SHUTDOWN);
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
tick_broadcast_force = 1;
@@ -246,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why)
if (!tick_broadcast_force &&
cpu_isset(cpu, tick_broadcast_mask)) {
cpu_clear(cpu, tick_broadcast_mask);
if (td->mode == TICKDEV_MODE_PERIODIC)
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
tick_setup_periodic(dev, 0);
}
break;
@@ -254,7 +255,7 @@ static void tick_do_broadcast_on_off(void *why)
if (cpus_empty(tick_broadcast_mask)) {
if (!bc_stopped)
clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
clockevents_shutdown(bc);
} else if (bc_stopped) {
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
@@ -306,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
if (bc && cpus_empty(tick_broadcast_mask))
clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
clockevents_shutdown(bc);
}
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -321,7 +322,7 @@ void tick_suspend_broadcast(void)
bc = tick_broadcast_device.evtdev;
if (bc)
clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
clockevents_shutdown(bc);
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -576,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
/*
* Check whether the broadcast device is in oneshot mode
*/
int tick_broadcast_oneshot_active(void)
{
return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
}
#endif

View file

@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
int tick_do_timer_cpu __read_mostly = -1;
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
DEFINE_SPINLOCK(tick_device_lock);
/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if (!tick_device_is_functional(dev))
return;
if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
} else {
unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
if (tick_do_timer_cpu == -1) {
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
tick_do_timer_cpu = cpu;
tick_next_period = ktime_get();
tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -249,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
* not give it back to the clockevents layer !
*/
if (tick_is_broadcast_device(curdev)) {
clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
clockevents_shutdown(curdev);
curdev = NULL;
}
clockevents_exchange_device(curdev, newdev);
@@ -300,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
if (*cpup == tick_do_timer_cpu) {
int cpu = first_cpu(cpu_online_map);
tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
TICK_DO_TIMER_NONE;
}
spin_unlock_irqrestore(&tick_device_lock, flags);
}
@@ -311,7 +313,7 @@ static void tick_suspend(void)
unsigned long flags;
spin_lock_irqsave(&tick_device_lock, flags);
clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
clockevents_shutdown(td->evtdev);
spin_unlock_irqrestore(&tick_device_lock, flags);
}
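
The named states replace a bare -1 that previously did double duty. Using the TICK_DO_TIMER_* values from the tick-internal.h hunk further down, a standalone model of the hand-off: the first device setup claims jiffies duty, an idling owner releases it, and the next CPU whose tick fires picks it up:

	#include <stdio.h>

	#define TICK_DO_TIMER_NONE	-1
	#define TICK_DO_TIMER_BOOT	-2

	static int tick_do_timer_cpu = TICK_DO_TIMER_BOOT;

	static void setup_device(int cpu)
	{
		/* only the very first setup grabs the duty */
		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT)
			tick_do_timer_cpu = cpu;
	}

	static void nohz_idle(int cpu)
	{
		/* an idling owner releases the duty */
		if (cpu == tick_do_timer_cpu)
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
	}

	static void tick_handler(int cpu)
	{
		/* a live tick takes over a released duty */
		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
			tick_do_timer_cpu = cpu;
	}

	int main(void)
	{
		setup_device(0);	/* boot CPU claims it */
		nohz_idle(0);		/* CPU0 goes idle: NONE */
		tick_handler(2);	/* CPU2 picks it up */
		printf("%d\n", tick_do_timer_cpu);	/* 2 */
		return 0;
	}

The distinct BOOT state matters because, before the first claim, the duty must not be grabbed by whichever CPU's tick handler happens to fire first; the handlers only test for NONE.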

View file

@@ -1,6 +1,10 @@
/*
* tick internal variable and functions used by low/high res code
*/
#define TICK_DO_TIMER_NONE -1
#define TICK_DO_TIMER_BOOT -2
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern spinlock_t tick_device_lock;
extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void clockevents_shutdown(struct clock_event_device *dev);
/*
* NO_HZ / high resolution timer shared code
*/
@@ -29,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
extern int tick_broadcast_oneshot_active(void);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -37,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
# endif /* !BROADCAST */
#else /* !ONESHOT */
@@ -66,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
return 0;
}
static inline int tick_broadcast_oneshot_active(void) { return 0; }
#endif /* !TICK_ONESHOT */
/*

View file

@@ -20,6 +20,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/module.h>
#include <asm/irq_regs.h>
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
incr * ticks);
}
do_timer(++ticks);
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
}
write_sequnlock(&xtime_lock);
}
@@ -187,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
*last_update_time = ktime_to_us(ts->idle_lastupdate);
if (!tick_nohz_enabled)
return -1;
if (ts->idle_active)
*last_update_time = ktime_to_us(ts->idle_lastupdate);
else
*last_update_time = ktime_to_us(ktime_get());
return ktime_to_us(ts->idle_sleeptime);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
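
With this fix the function reports a usable timestamp in both the idle and busy cases, and -1 when NOHZ is disabled, so callers must handle both. A hedged consumer sketch (hypothetical helper, not in the tree) that turns two successive readings into an idle percentage:

	#include <linux/tick.h>
	#include <linux/math64.h>

	/* hypothetical: percent of wall time spent idle since last call */
	static int cpu_idle_pct(int cpu, u64 *prev_idle_us, u64 *prev_stamp_us)
	{
		u64 now_us, idle_us;
		int pct;

		idle_us = get_cpu_idle_time_us(cpu, &now_us);
		if (idle_us == (u64)-1)
			return -1;	/* NOHZ off: no idle book-keeping */
		if (now_us == *prev_stamp_us)
			return 0;	/* no time elapsed yet */

		pct = (int)div64_u64((idle_us - *prev_idle_us) * 100,
				     now_us - *prev_stamp_us);

		*prev_idle_us = idle_us;
		*prev_stamp_us = now_us;
		return pct;
	}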
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -221,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle)
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = -1;
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -303,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* invoked.
*/
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = -1;
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->idle_sleeps++;
@@ -468,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
if (unlikely(tick_do_timer_cpu == -1))
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
tick_do_timer_cpu = cpu;
/* Check, if the jiffies need an update */
@@ -570,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
if (unlikely(tick_do_timer_cpu == -1))
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
tick_do_timer_cpu = cpu;
#endif
@@ -622,7 +634,7 @@ void tick_setup_sched_timer(void)
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
/* Get the next period (per cpu) */
ts->sched_timer.expires = tick_init_jiffy_update();

View file

@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = stack_trace_timer_fn;
hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
}

View file

@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
{
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
}
static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
unsigned long rt_runtime;
int rc;
sscanf(buf, "%lu", &rt_runtime);
sscanf(buf, "%ld", &rt_runtime);
rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
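
The signedness matters because sched_group_rt_runtime() can legitimately return -1 (unlimited runtime): %lu prints that as a huge unsigned number, while %ld round-trips it. A standalone demonstration:

	#include <stdio.h>

	int main(void)
	{
		long rt_runtime = -1;	/* "unlimited", as the sysfs file sees it */

		printf("%lu\n", (unsigned long)rt_runtime); /* 18446744073709551615 on 64-bit */
		printf("%ld\n", rt_runtime);                /* -1 */

		sscanf("-1", "%ld", &rt_runtime);	/* reads back as -1 */
		return 0;
	}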