Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Move the nohz kick code out of the scheduler tick to a dedicated IPI,
   from Frederic Weisbecker.

  This necessiated quite some background infrastructure rework,
  including:

   * Clean up some irq-work internals
   * Implement remote irq-work
   * Implement nohz kick on top of remote irq-work
   * Move full dynticks timer enqueue notification to new kick
   * Move multi-task notification to new kick
   * Remove unecessary barriers on multi-task notification

 - Remove proliferation of wait_on_bit() action functions and allow
   wait_on_bit_action() functions to support a timeout.  (Neil Brown)

 - Another round of sched/numa improvements, cleanups and fixes.  (Rik
   van Riel)

 - Implement fast idling of CPUs when the system is partially loaded,
   for better scalability.  (Tim Chen)

 - Restructure and fix the CPU hotplug handling code that may leave
   cfs_rq and rt_rq's throttled when tasks are migrated away from a dead
   cpu.  (Kirill Tkhai)

 - Robustify the sched topology setup code.  (Peterz Zijlstra)

 - Improve sched_feat() handling wrt.  static_keys (Jason Baron)

 - Misc fixes.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
  sched/fair: Fix 'make xmldocs' warning caused by missing description
  sched: Use macro for magic number of -1 for setparam
  sched: Robustify topology setup
  sched: Fix sched_setparam() policy == -1 logic
  sched: Allow wait_on_bit_action() functions to support a timeout
  sched: Remove proliferation of wait_on_bit() action functions
  sched/numa: Revert "Use effective_load() to balance NUMA loads"
  sched: Fix static_key race with sched_feat()
  sched: Remove extra static_key*() function indirection
  sched/rt: Fix replenish_dl_entity() comments to match the current upstream code
  sched: Transform resched_task() into resched_curr()
  sched/deadline: Kill task_struct->pi_top_task
  sched: Rework check_for_tasks()
  sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime()
  sched/fair: Disable runtime_enabled on dying rq
  sched/numa: Change scan period code to match intent
  sched/numa: Rework best node setting in task_numa_migrate()
  sched/numa: Examine a task move when examining a task swap
  sched/numa: Simplify task_numa_compare()
  sched/numa: Use effective_load() to balance NUMA loads
  ...
This commit is contained in:
Linus Torvalds
2014-08-04 16:23:30 -07:00
57 changed files with 591 additions and 563 deletions

View File

@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
char buf[64];
char *cmp;
int i;
struct inode *inode;
if (cnt > 63)
cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
cmp = strstrip(buf);
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
mutex_lock(&inode->i_mutex);
i = sched_feat_set(cmp);
mutex_unlock(&inode->i_mutex);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
/*
* resched_task - mark a task 'to be rescheduled now'.
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
void resched_task(struct task_struct *p)
void resched_curr(struct rq *rq)
{
struct task_struct *curr = rq->curr;
int cpu;
lockdep_assert_held(&task_rq(p)->lock);
lockdep_assert_held(&rq->lock);
if (test_tsk_need_resched(p))
if (test_tsk_need_resched(curr))
return;
cpu = task_cpu(p);
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(p);
set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
if (set_nr_and_not_polling(p))
if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
return;
resched_task(cpu_curr(cpu));
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
static bool wake_up_full_nohz_cpu(int cpu)
{
/*
* We just need the target to call irq_exit() and re-evaluate
* the next tick. The nohz full kick at least implies that.
* If needed we can still optimize that later with an
* empty IRQ.
*/
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
smp_send_reschedule(cpu);
tick_nohz_full_kick_cpu(cpu);
return true;
}
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
struct rq *rq;
/*
* More than one running task need preemption.
* nr_running update is assumed to be visible
* after IPI is sent from wakers.
*/
if (this_rq()->nr_running > 1)
return false;
rq = this_rq();
/* Make sure rq->nr_running update is visible after the IPI */
smp_rmb();
/* More than one running task need preemption */
if (rq->nr_running > 1)
return false;
return true;
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
if (class == rq->curr->sched_class)
break;
if (class == p->sched_class) {
resched_task(rq->curr);
resched_curr(rq);
break;
}
}
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
return;
/*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
u64 ns = 0;
if (task_current(rq, p)) {
/*
* Must be ->curr _and_ ->on_rq. If dequeued, we would
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && p->on_rq) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we race with it leaving cpu, we'll take a lock. So we're correct.
* If we race with it entering cpu, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
*/
if (!p->on_cpu)
if (!p->on_cpu || !p->on_rq)
return p->se.sum_exec_runtime;
#endif
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
}
trace_sched_pi_setprio(p, prio);
p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
resched_curr(rq);
}
out_unlock:
task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_yielded = 0;
}
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
*/
#define SETPARAM_POLICY -1
static void __setscheduler_params(struct task_struct *p,
const struct sched_attr *attr)
{
int policy = attr->sched_policy;
if (policy == -1) /* setparam */
if (policy == SETPARAM_POLICY)
policy = p->policy;
p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
/*
* Fixup the legacy SCHED_RESET_ON_FORK hack
*/
if (policy & SCHED_RESET_ON_FORK) {
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
policy &= ~SCHED_RESET_ON_FORK;
attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
return do_sched_setscheduler(pid, -1, param);
return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
/**
@@ -4285,7 +4304,7 @@ again:
* fairness.
*/
if (preempt && rq != p_rq)
resched_task(p_rq->curr);
resched_curr(p_rq);
}
out_unlock:
@@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
sd->child = child;
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
}
}
set_domain_attribute(sd, attr);
@@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
__setscheduler(rq, p, &attr);
if (on_rq) {
enqueue_task(rq, p, 0);
resched_task(rq->curr);
resched_curr(rq);
}
check_class_changed(rq, p, prev_class, old_prio);
@@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
get_online_cpus();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
}
raw_spin_unlock_irq(&cfs_b->lock);
for_each_possible_cpu(i) {
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
@@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
put_online_cpus();
return ret;
}