Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - tickless load average calculation enhancements (Byungchul Park)

   - vtime handling enhancements (Frederic Weisbecker)

   - scalability improvement via properly aligning a key structure field (Jiri Olsa)

   - various stop_machine() fixes (Oleg Nesterov)

   - sched/numa enhancement (Rik van Riel)

   - various fixes and improvements (Andi Kleen, Dietmar Eggemann, Geliang Tang, Hiroshi Shimamoto, Joonwoo Park, Peter Zijlstra, Waiman Long, Wanpeng Li, Yuyang Du)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()
  sched/core: Move sched_entity::avg into separate cache line
  x86/fpu: Properly align size in CHECK_MEMBER_AT_END_OF() macro
  sched/deadline: Fix the earliest_dl.next logic
  sched/fair: Disable the task group load_avg update for the root_task_group
  sched/fair: Move the cache-hot 'load_avg' variable into its own cacheline
  sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats()
  sched/core: Move the sched_to_prio[] arrays out of line
  sched/cputime: Convert vtime_seqlock to seqcount
  sched/cputime: Introduce vtime accounting check for readers
  sched/cputime: Rename vtime_accounting_enabled() to vtime_accounting_cpu_enabled()
  sched/cputime: Correctly handle task guest time on housekeepers
  sched/cputime: Clarify vtime symbols and document them
  sched/cputime: Remove extra cost in task_cputime()
  sched/fair: Make it possible to account fair load avg consistently
  sched/fair: Modify the comment about lock assumptions in migrate_task_rq_fair()
  stop_machine: Clean up the usage of the preemption counter in cpu_stopper_thread()
  stop_machine: Shift the 'done != NULL' check from cpu_stop_signal_done() to callers
  stop_machine: Kill cpu_stop_done->executed
  stop_machine: Change __stop_cpus() to rely on cpu_stop_queue_work()
  ...
@@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
ag = autogroup_task_get(p);

down_write(&ag->lock);
err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
if (!err)
ag->nice = nice;
up_write(&ag->lock);

@@ -731,7 +731,7 @@ bool sched_can_stop_tick(void)
if (current->policy == SCHED_RR) {
struct sched_rt_entity *rt_se = &current->rt;

return rt_se->run_list.prev == rt_se->run_list.next;
return list_is_singular(&rt_se->run_list);
}

/*

@@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p)
return;
}

load->weight = scale_load(prio_to_weight[prio]);
load->inv_weight = prio_to_wmult[prio];
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)

@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);

dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, 0);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);

@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new

raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);

return rq;

@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);

/*
* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
* because schedstat_wait_{start,end} rebase migrating task's wait_start
* time relying on p->on_rq.
*/
WARN_ON_ONCE(p->state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing

@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);

p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*

@@ -2194,6 +2205,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif

#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

@@ -7442,6 +7457,9 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);

/* Cacheline aligned slab cache for task_group */
static struct kmem_cache *task_group_cache __read_mostly;
#endif

DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);

@@ -7499,11 +7517,12 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
task_group_cache = KMEM_CACHE(task_group, 0);

list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);

#endif /* CONFIG_CGROUP_SCHED */

for_each_possible_cpu(i) {

@@ -7784,7 +7803,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
kfree(tg);
kmem_cache_free(task_group_cache, tg);
}

/* allocate runqueue etc for a new task group */

@@ -7792,7 +7811,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;

tg = kzalloc(sizeof(*tg), GFP_KERNEL);
tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);

@@ -8697,3 +8716,44 @@ void dump_cpu_task(int cpu)
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}

/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
const int sched_prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};

/*
* Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
const u32 sched_prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
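The relationship between the two tables above can be checked in isolation: each sched_prio_to_wmult[] entry is roughly 2^32 divided by the corresponding sched_prio_to_weight[] entry, which is what lets the scheduler turn a division by the weight into a multiplication and a 32-bit shift. A minimal user-space sketch of that arithmetic (illustrative only, not the kernel code; the constants are copied from the tables above):

#include <stdint.h>
#include <stdio.h>

/* Nice 0 and nice 1 entries from sched_prio_to_weight[] / sched_prio_to_wmult[]. */
#define WEIGHT_NICE_0	1024ULL
#define WMULT_NICE_0	4194304ULL	/* ~2^32 / 1024 */
#define WEIGHT_NICE_1	820ULL

/* delta / weight expressed as (delta * wmult) >> 32: no division needed. */
static uint64_t scale_by_inverse(uint64_t delta, uint64_t wmult)
{
	return (delta * wmult) >> 32;
}

int main(void)
{
	uint64_t delta = 6000000;	/* e.g. a 6 ms slice, in nanoseconds */

	printf("divide: %llu  multiply+shift: %llu\n",
	       (unsigned long long)(delta / WEIGHT_NICE_0),
	       (unsigned long long)scale_by_inverse(delta, WMULT_NICE_0));

	/* Adjacent nice levels differ by ~25% in weight: 1024 / 820 ~= 1.25. */
	printf("weight ratio nice 0 / nice 1: %.3f\n",
	       (double)WEIGHT_NICE_0 / (double)WEIGHT_NICE_1);
	return 0;
}

The in-kernel users layer overflow handling on top of this, but the precalculated inverse is what keeps the per-tick weight arithmetic cheap.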
@@ -466,7 +466,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();

if (vtime_accounting_enabled())
if (vtime_accounting_cpu_enabled())
return;

if (sched_clock_irqtime) {

@@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long long delta = vtime_delta(tsk);

WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap += delta;

/* CHECKME: always safe to convert nsecs to cputime? */

@@ -696,37 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk)

void vtime_account_system(struct task_struct *tsk)
{
write_seqlock(&tsk->vtime_seqlock);
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock);
write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
write_seqlock(&tsk->vtime_seqlock);
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock);
write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;

write_seqlock(&tsk->vtime_seqlock);
write_seqcount_begin(&tsk->vtime_seqcount);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
write_sequnlock(&tsk->vtime_seqlock);
write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
write_seqlock(&tsk->vtime_seqlock);
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock);
write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)

@@ -738,19 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk)
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
write_seqlock(&tsk->vtime_seqlock);
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
write_seqlock(&tsk->vtime_seqlock);
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags &= ~PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock);
write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

@@ -763,24 +763,26 @@ void vtime_account_idle(struct task_struct *tsk)

void arch_vtime_task_switch(struct task_struct *prev)
{
write_seqlock(&prev->vtime_seqlock);
prev->vtime_snap_whence = VTIME_SLEEPING;
write_sequnlock(&prev->vtime_seqlock);
write_seqcount_begin(&prev->vtime_seqcount);
prev->vtime_snap_whence = VTIME_INACTIVE;
write_seqcount_end(&prev->vtime_seqcount);

write_seqlock(&current->vtime_seqlock);
write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
current->vtime_snap = sched_clock_cpu(smp_processor_id());
write_sequnlock(&current->vtime_seqlock);
write_seqcount_end(&current->vtime_seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;

write_seqlock_irqsave(&t->vtime_seqlock, flags);
local_irq_save(flags);
write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
t->vtime_snap = sched_clock_cpu(cpu);
write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
write_seqcount_end(&t->vtime_seqcount);
local_irq_restore(flags);
}

cputime_t task_gtime(struct task_struct *t)

@@ -788,17 +790,17 @@ cputime_t task_gtime(struct task_struct *t)
unsigned int seq;
cputime_t gtime;

if (!context_tracking_is_enabled())
if (!vtime_accounting_enabled())
return t->gtime;

do {
seq = read_seqbegin(&t->vtime_seqlock);
seq = read_seqcount_begin(&t->vtime_seqcount);

gtime = t->gtime;
if (t->flags & PF_VCPU)
if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
gtime += vtime_delta(t);

} while (read_seqretry(&t->vtime_seqlock, seq));
} while (read_seqcount_retry(&t->vtime_seqcount, seq));

return gtime;
}

@@ -821,7 +823,7 @@ fetch_task_cputime(struct task_struct *t,
*udelta = 0;
*sdelta = 0;

seq = read_seqbegin(&t->vtime_seqlock);
seq = read_seqcount_begin(&t->vtime_seqcount);

if (u_dst)
*u_dst = *u_src;

@@ -829,7 +831,7 @@ fetch_task_cputime(struct task_struct *t,
*s_dst = *s_src;

/* Task is sleeping, nothing to add */
if (t->vtime_snap_whence == VTIME_SLEEPING ||
if (t->vtime_snap_whence == VTIME_INACTIVE ||
is_idle_task(t))
continue;

@@ -845,7 +847,7 @@ fetch_task_cputime(struct task_struct *t,
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
} while (read_seqretry(&t->vtime_seqlock, seq));
} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}

@@ -853,6 +855,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
cputime_t udelta, sdelta;

if (!vtime_accounting_enabled()) {
if (utime)
*utime = t->utime;
if (stime)
*stime = t->stime;
return;
}

fetch_task_cputime(t, utime, stime, &t->utime,
&t->stime, &udelta, &sdelta);
if (utime)

@@ -866,6 +876,14 @@ void task_cputime_scaled(struct task_struct *t,
{
cputime_t udelta, sdelta;

if (!vtime_accounting_enabled()) {
if (utimescaled)
*utimescaled = t->utimescaled;
if (stimescaled)
*stimescaled = t->stimescaled;
return;
}

fetch_task_cputime(t, utimescaled, stimescaled,
&t->utimescaled, &t->stimescaled, &udelta, &sdelta);
if (utimescaled)
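The reader-side rule that the new vtime_seqcount protects can be stated on its own: a reader returns the accounted total plus the not-yet-flushed delta since vtime_snap, and, per the housekeeper fix in this series, guest time only accrues that delta while the task is accounting system time and running vCPU code. A stand-alone sketch of that calculation with the seqcount and cputime conversions stripped out (struct and function names here are illustrative, not the kernel's):

#include <stdio.h>

enum whence { VTIME_INACTIVE, VTIME_USER, VTIME_SYS };

/* Minimal stand-in for the vtime fields normally read under vtime_seqcount. */
struct vtask {
	enum whence whence;		/* what the task is currently accounting */
	int vcpu;			/* PF_VCPU equivalent: running guest code */
	unsigned long long snap;	/* last accounting snapshot */
	unsigned long long gtime;	/* guest time accounted so far */
};

/* Guest time as a reader sees it: the accounted total plus the tickless
 * delta, but only while the task is in system context *and* running a
 * guest -- a housekeeping CPU ticking for a remote vCPU gets no delta. */
static unsigned long long task_gtime_sketch(const struct vtask *t,
					    unsigned long long now)
{
	unsigned long long gtime = t->gtime;

	if (t->whence == VTIME_SYS && t->vcpu)
		gtime += now - t->snap;
	return gtime;
}

int main(void)
{
	struct vtask t = { .whence = VTIME_SYS, .vcpu = 1,
			   .snap = 1000, .gtime = 500 };

	printf("vcpu running: gtime at now=1800 is %llu\n",
	       task_gtime_sketch(&t, 1800));
	t.vcpu = 0;	/* e.g. a housekeeper accounting on behalf of others */
	printf("not a vcpu:   gtime at now=1800 is %llu\n",
	       task_gtime_sketch(&t, 1800));
	return 0;
}

In the kernel this read sits inside a read_seqcount_begin()/read_seqcount_retry() loop so the snapshot fields are seen consistently against concurrent writers.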
@@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
}
}

if (leftmost)
if (leftmost) {
dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
dl_rq->earliest_dl.next = p->dl.deadline;
}

rb_link_node(&p->pushable_dl_tasks, parent, link);
rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);

@@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)

next_node = rb_next(&p->pushable_dl_tasks);
dl_rq->pushable_dl_tasks_leftmost = next_node;
if (next_node) {
dl_rq->earliest_dl.next = rb_entry(next_node,
struct task_struct, pushable_dl_tasks)->dl.deadline;
}
}

rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);

@@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq)

#ifdef CONFIG_SMP

static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);

static inline u64 next_deadline(struct rq *rq)
{
struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);

if (next && dl_prio(next->prio))
return next->dl.deadline;
else
return 0;
}

static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);

if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
/*
* If the dl_rq had no -deadline tasks, or if the new task
* has shorter deadline than the current one on dl_rq, we
* know that the previous earliest becomes our next earliest,
* as the new task becomes the earliest itself.
*/
dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
} else if (dl_rq->earliest_dl.next == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.next)) {
/*
* On the other hand, if the new -deadline task has a
* a later deadline than the earliest one on dl_rq, but
* it is earlier than the next (if any), we must
* recompute the next-earliest.
*/
dl_rq->earliest_dl.next = next_deadline(rq);
}
}

@@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)

entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
dl_rq->earliest_dl.next = next_deadline(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
}
}

@@ -1274,28 +1251,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
return 0;
}

/* Returns the second earliest -deadline task, NULL otherwise */
static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
{
struct rb_node *next_node = rq->dl.rb_leftmost;
struct sched_dl_entity *dl_se;
struct task_struct *p = NULL;

next_node:
next_node = rb_next(next_node);
if (next_node) {
dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
p = dl_task_of(dl_se);

if (pick_dl_task(rq, p, cpu))
return p;

goto next_node;
}

return NULL;
}

/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}

#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
u64 wait_start = rq_clock(rq_of(cfs_rq));

if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
likely(wait_start > se->statistics.wait_start))
wait_start -= se->statistics.wait_start;

se->statistics.wait_start = wait_start;
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;

if (entity_is_task(se)) {
p = task_of(se);
if (task_on_rq_migrating(p)) {
/*
* Preserve migrating task's wait time so wait_start
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
se->statistics.wait_start = delta;
return;
}
trace_sched_stat_wait(p, delta);
}

se->statistics.wait_max = max(se->statistics.wait_max, delta);
se->statistics.wait_count++;
se->statistics.wait_sum += delta;
se->statistics.wait_start = 0;
}
#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}

static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
#endif

/*
* Task is being enqueued - update stats:
*/

@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{

@@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;

@@ -2277,6 +2305,17 @@ out:
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);

/*
* Make sure tasks use at least 32x as much time to run other code
* than they used here, to limit NUMA PTE scanning overhead to 3% max.
* Usually update_task_scan_period slows down scanning enough; on an
* overloaded system we need to limit overhead on a per task basis.
*/
if (unlikely(p->se.sum_exec_runtime != runtime)) {
u64 diff = p->se.sum_exec_runtime - runtime;
p->node_stamp += 32 * diff;
}
}

/*

@@ -2670,12 +2709,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

/*
* No need to update load_avg for root_task_group as it is not used.
*/
if (cfs_rq->tg == &root_task_group)
return;

if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}

/*
* Called within set_task_rq() right before setting a task's cpu. The
* caller only guarantees p->pi_lock is held; no other assumptions,
* including the state of rq->lock, should be made.
*/
void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next)
{
if (!sched_feat(ATTACH_AGE_LOAD))
return;

/*
* We are supposed to update the task to "current" time, then its up to
* date and ready to go to new CPU/cfs_rq. But we have difficulty in
* getting what current time is, so simply throw away the out-of-date
* time. This will result in the wakee task is less decayed, but giving
* the wakee more load sounds not bad.
*/
if (se->avg.last_update_time && prev) {
u64 p_last_update_time;
u64 n_last_update_time;

#ifndef CONFIG_64BIT
u64 p_last_update_time_copy;
u64 n_last_update_time_copy;

do {
p_last_update_time_copy = prev->load_last_update_time_copy;
n_last_update_time_copy = next->load_last_update_time_copy;

smp_rmb();

p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;

} while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy);
#else
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
&se->avg, 0, 0, NULL);
se->avg.last_update_time = n_last_update_time;
}
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */

@@ -2809,6 +2900,27 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}

#ifndef CONFIG_64BIT
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
u64 last_update_time_copy;
u64 last_update_time;

do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);

return last_update_time;
}
#else
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.last_update_time;
}
#endif

/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).

@@ -2818,41 +2930,20 @@ void remove_entity_load_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;

#ifndef CONFIG_64BIT
u64 last_update_time_copy;
/*
* Newly created task or never used group entity should not be removed
* from its (source) cfs_rq
*/
if (se->avg.last_update_time == 0)
return;

do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
#else
last_update_time = cfs_rq->avg.last_update_time;
#endif
last_update_time = cfs_rq_last_update_time(cfs_rq);

__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}

/*
* Update the rq's load with the elapsed running time before entering
* idle. if the last scheduled task is not a CFS task, idle_enter will
* be the only way to update the runnable statistic.
*/
void idle_enter_fair(struct rq *this_rq)
{
}

/*
* Update the rq's load with the elapsed idle time before a task is
* scheduled. if the newly scheduled task is not a CFS task, idle_exit will
* be the only way to update the runnable statistic.
*/
void idle_exit_fair(struct rq *this_rq)
{
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->runnable_load_avg;

@@ -4240,42 +4331,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/

/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
* The exact cpuload calculated at every tick would be:
*
* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
* on nth tick when cpu may be busy, then we have:
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
* load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
* If a cpu misses updates for n ticks (as it was idle) and update gets
* called on the n+1-th tick when cpu may be busy, then we have:
*
* load_n = (1 - 1/2^i)^n * load_0
* load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
*
* decay_load_missed() below does efficient calculation of
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
*
* load' = (1 - 1/2^i)^n * load
*
* Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
* This allows us to precompute the above in said factors, thereby allowing the
* reduction of an arbitrary n in O(log_2 n) steps. (See also
* fixed_power_int())
*
* The calculation is approximated on a 128 point scale.
* degrade_zero_ticks is the number of ticks after which load at any
* particular idx is approximated to be zero.
* degrade_factor is a precomputed table, a row for each load idx.
* Each column corresponds to degradation factor for a power of two ticks,
* based on 128 point scale.
* Example:
* row 2, col 3 (=12) says that the degradation at load idx 2 after
* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
*
* With this power of 2 load factors, we can degrade the load n times
* by looking at 1 bits in n and doing as many mult/shift instead of
* n mult/shifts needed by the exact degradation.
*/
#define DEGRADE_SHIFT 7
static const unsigned char
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const unsigned char
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
{0, 0, 0, 0, 0, 0, 0, 0},
{64, 32, 8, 0, 0, 0, 0, 0},
{96, 72, 40, 12, 1, 0, 0},
{112, 98, 75, 43, 15, 1, 0},
{120, 112, 98, 76, 45, 16, 2} };

static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
{ 0, 0, 0, 0, 0, 0, 0, 0 },
{ 64, 32, 8, 0, 0, 0, 0, 0 },
{ 96, 72, 40, 12, 1, 0, 0, 0 },
{ 112, 98, 75, 43, 15, 1, 0, 0 },
{ 120, 112, 98, 76, 45, 16, 2, 0 }
};

/*
* Update cpu_load for any missed ticks, due to tickless idle. The backlog

@@ -4306,14 +4392,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
return load;
}

/*
/**
* __update_cpu_load - update the rq->cpu_load[] statistics
* @this_rq: The rq to update statistics for
* @this_load: The current load
* @pending_updates: The number of missed updates
* @active: !0 for NOHZ_FULL
*
* Update rq->cpu_load[] statistics. This function is usually called every
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
* every tick. We fix it up based on jiffies.
* scheduler tick (TICK_NSEC).
*
* This function computes a decaying average:
*
* load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
*
* Because of NOHZ it might not get called on every tick which gives need for
* the @pending_updates argument.
*
* load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
* = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
* = A * (A * load[i]_n-2 + B) + B
* = A * (A * (A * load[i]_n-3 + B) + B) + B
* = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
* = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
* = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
* = (1 - 1/2^i)^n * (load[i]_0 - load) + load
*
* In the above we've assumed load_n := load, which is true for NOHZ_FULL as
* any change in load would have resulted in the tick being turned back on.
*
* For regular NOHZ, this reduces to:
*
* load[i]_n = (1 - 1/2^i)^n * load[i]_0
*
* see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
* term. See the @active paramter.
*/
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
unsigned long pending_updates)
unsigned long pending_updates, int active)
{
unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
int i, scale;

this_rq->nr_load_updates++;
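The O(log n) decay described in the comment blocks above can be reproduced in a few lines: decompose the number of missed ticks into its set bits and apply the precomputed 128-point factor for each power of two. A small user-space sketch under those assumptions (it copies the idx = 2 row of the degrade_factor[] table shown earlier; it is not the kernel's decay_load_missed()):

#include <stdio.h>

/* idx = 2 row of degrade_factor[]: column j holds (3/4)^(2^j) on a 128 scale. */
static const unsigned char degrade_idx2[8] = { 96, 72, 40, 12, 1, 0, 0, 0 };

/* Decay 'load' over 'missed' idle ticks by walking the set bits of 'missed',
 * i.e. one multiply-and-shift per bit instead of one per missed tick. */
static unsigned long decay_missed_idx2(unsigned long load, unsigned long missed)
{
	int j = 0;

	if (missed >= 32)	/* degrade_zero_ticks[2]: factor underflows */
		return 0;

	while (missed) {
		if (missed & 1)
			load = (load * degrade_idx2[j]) >> 7;	/* * f / 128 */
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1024, n = 9, exact = load, i;

	/* Naive reference: multiply by 3/4 once per missed tick. */
	for (i = 0; i < n; i++)
		exact = exact * 3 / 4;

	/* The two agree to within the 128-point approximation. */
	printf("table-based: %lu  naive: %lu\n",
	       decay_missed_idx2(load, n), exact);
	return 0;
}

This walk over set bits is what lets __update_cpu_load() fold an arbitrary number of missed NOHZ ticks into a handful of multiplications.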
@@ -4325,8 +4443,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,

/* scale is effectively 1 << i now, and >> i divides by scale */

old_load = this_rq->cpu_load[i];
old_load = this_rq->cpu_load[i] - tickless_load;
old_load = decay_load_missed(old_load, pending_updates - 1, i);
old_load += tickless_load;
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This

@@ -4381,16 +4500,17 @@ static void update_idle_cpu_load(struct rq *this_rq)
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
this_rq->last_load_update_tick = curr_jiffies;

__update_cpu_load(this_rq, load, pending_updates);
__update_cpu_load(this_rq, load, pending_updates, 0);
}

/*
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
*/
void update_cpu_load_nohz(void)
void update_cpu_load_nohz(int active)
{
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
unsigned long pending_updates;

if (curr_jiffies == this_rq->last_load_update_tick)

@@ -4401,10 +4521,11 @@ void update_cpu_load_nohz(void)
if (pending_updates) {
this_rq->last_load_update_tick = curr_jiffies;
/*
* We were idle, this means load 0, the current load might be
* !0 due to remote wakeups and the sort.
* In the regular NOHZ case, we were idle, this means load 0.
* In the NOHZ_FULL case, we were non-idle, we should consider
* its weighted load.
*/
__update_cpu_load(this_rq, 0, pending_updates);
__update_cpu_load(this_rq, load, pending_updates, active);
}
raw_spin_unlock(&this_rq->lock);
}

@@ -4420,7 +4541,7 @@ void update_cpu_load_active(struct rq *this_rq)
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, load, 1);
__update_cpu_load(this_rq, load, 1, 1);
}

/*

@@ -5007,8 +5128,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
* other assumptions, including the state of rq->lock, should be made.
* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{

@@ -5721,8 +5841,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);

deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
}

@@ -5855,8 +5975,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);

BUG_ON(task_rq(p) != rq);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}

@@ -6302,7 +6422,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *overload)
{
unsigned long load;
int i;
int i, nr_running;

memset(sgs, 0, sizeof(*sgs));

@@ -6319,7 +6439,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;

if (rq->nr_running > 1)
nr_running = rq->nr_running;
if (nr_running > 1)
*overload = true;

#ifdef CONFIG_NUMA_BALANCING

@@ -6327,7 +6448,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
/*
* No need to call idle_cpu() if nr_running is not 0
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
}

@@ -7248,8 +7372,6 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;

idle_enter_fair(this_rq);

/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.

@@ -7330,10 +7452,8 @@ out:
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;

if (pulled_task) {
idle_exit_fair(this_rq);
if (pulled_task)
this_rq->idle_stamp = 0;
}

return pulled_task;
}
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
@@ -248,7 +248,12 @@ struct task_group {
unsigned long shares;

#ifdef CONFIG_SMP
atomic_long_t load_avg;
/*
* load_avg can be heavily contended at clock tick time, so put
* it in its own cacheline separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
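The effect of the ____cacheline_aligned annotation above is easy to sanity-check with offsetof(): the tick-hot field starts on its own cache-line boundary, so writers updating it no longer invalidate the line that holds the read-mostly fields next to it. A toy user-space sketch of the layout idea (the struct and field names are illustrative, not the kernel's task_group; a 64-byte line size is assumed):

#include <stddef.h>
#include <stdio.h>

#define CACHELINE_SIZE 64	/* assumed L1 cache line size */

/* Read-mostly fields share one line; the frequently written counter
 * is pushed onto its own line, mirroring the hunk above. */
struct group_stats {
	unsigned long shares;		/* read-mostly */
	unsigned long period;		/* read-mostly */
	long load_avg __attribute__((aligned(CACHELINE_SIZE)));	/* written every tick */
};

int main(void)
{
	printf("offsetof(load_avg) = %zu, sizeof(struct group_stats) = %zu\n",
	       offsetof(struct group_stats, load_avg),
	       sizeof(struct group_stats));
	/* Prints 64 and 128 on a typical LP64 build: updates to load_avg and
	 * reads of shares/period now touch different cache lines. */
	return 0;
}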
@@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);

#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
#endif

#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
#else /* !CONFIG_SMP */
static inline void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next) { }
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#else /* CONFIG_CGROUP_SCHED */

@@ -933,6 +946,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif

@@ -1113,46 +1127,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765

/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
static const int prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};

/*
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
static const u32 prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
extern const int sched_prio_to_weight[40];
extern const u32 sched_prio_to_wmult[40];

#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_HEAD 0x02

@@ -1252,16 +1228,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);

extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);

#else

static inline void idle_enter_fair(struct rq *rq) { }
static inline void idle_exit_fair(struct rq *rq) { }

#endif

#ifdef CONFIG_CPU_IDLE