Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Move the nohz kick code out of the scheduler tick to a dedicated IPI,
   from Frederic Weisbecker. This necessitated quite some background
   infrastructure rework, including:

   * Clean up some irq-work internals
   * Implement remote irq-work
   * Implement nohz kick on top of remote irq-work
   * Move full dynticks timer enqueue notification to new kick
   * Move multi-task notification to new kick
   * Remove unnecessary barriers on multi-task notification

 - Remove proliferation of wait_on_bit() action functions and allow
   wait_on_bit_action() functions to support a timeout. (Neil Brown)

 - Another round of sched/numa improvements, cleanups and fixes. (Rik
   van Riel)

 - Implement fast idling of CPUs when the system is partially loaded,
   for better scalability. (Tim Chen)

 - Restructure and fix the CPU hotplug handling code that may leave
   cfs_rq and rt_rq's throttled when tasks are migrated away from a
   dead cpu. (Kirill Tkhai)

 - Robustify the sched topology setup code. (Peter Zijlstra)

 - Improve sched_feat() handling wrt. static_keys. (Jason Baron)

 - Misc fixes.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
  sched/fair: Fix 'make xmldocs' warning caused by missing description
  sched: Use macro for magic number of -1 for setparam
  sched: Robustify topology setup
  sched: Fix sched_setparam() policy == -1 logic
  sched: Allow wait_on_bit_action() functions to support a timeout
  sched: Remove proliferation of wait_on_bit() action functions
  sched/numa: Revert "Use effective_load() to balance NUMA loads"
  sched: Fix static_key race with sched_feat()
  sched: Remove extra static_key*() function indirection
  sched/rt: Fix replenish_dl_entity() comments to match the current upstream code
  sched: Transform resched_task() into resched_curr()
  sched/deadline: Kill task_struct->pi_top_task
  sched: Rework check_for_tasks()
  sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime()
  sched/fair: Disable runtime_enabled on dying rq
  sched/numa: Change scan period code to match intent
  sched/numa: Rework best node setting in task_numa_migrate()
  sched/numa: Examine a task move when examining a task swap
  sched/numa: Simplify task_numa_compare()
  sched/numa: Use effective_load() to balance NUMA loads
  ...
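The wait_on_bit() rework changes every call site: previously each caller had to
pass an action callback (usually a trivial wrapper around schedule() or
io_schedule()); now the common cases take only a task state, and
wait_on_bit_action() remains for callers that need a custom action. A minimal
before/after sketch, assuming the 3.17-era API; my_wait_action, MY_BIT and
flags are illustrative names, not identifiers from this series:

    /* Before: each subsystem carried its own action function. */
    static int my_wait_action(void *word)
    {
            io_schedule();          /* sleep until woken */
            return 0;               /* 0 = keep waiting, non-zero = abort */
    }
    err = wait_on_bit(&flags, MY_BIT, my_wait_action, TASK_UNINTERRUPTIBLE);

    /* After: the common cases need no callback at all. */
    err = wait_on_bit(&flags, MY_BIT, TASK_UNINTERRUPTIBLE);
    err = wait_on_bit_io(&flags, MY_BIT, TASK_UNINTERRUPTIBLE);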
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
 		return;
 
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+	if (delta < 0)
+		return;
 	rq->clock += delta;
 	update_rq_clock_task(rq, delta);
 }
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	char buf[64];
 	char *cmp;
 	int i;
+	struct inode *inode;
 
 	if (cnt > 63)
 		cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	buf[cnt] = 0;
 	cmp = strstrip(buf);
 
+	/* Ensure the static_key remains in a consistent state */
+	inode = file_inode(filp);
+	mutex_lock(&inode->i_mutex);
 	i = sched_feat_set(cmp);
+	mutex_unlock(&inode->i_mutex);
 	if (i == __SCHED_FEAT_NR)
 		return -EINVAL;
 
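The inode mutex taken above serializes writers of the sched_features debugfs
file, so the static_key enable/disable pair cannot race. That file is the
user-visible side of sched_feat_write(); a minimal sketch of toggling a
feature from userspace, assuming debugfs is mounted at /sys/kernel/debug and
the kernel was built with CONFIG_SCHED_DEBUG=y (GENTLE_FAIR_SLEEPERS is one of
the stock features):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/sched_features", "w");

            if (!f) {
                    perror("sched_features");
                    return 1;
            }
            /* Prefixing a feature name with NO_ clears it. */
            fputs("NO_GENTLE_FAIR_SLEEPERS", f);
            fclose(f);
            return 0;
    }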
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 
 /*
- * resched_task - mark a task 'to be rescheduled now'.
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-void resched_task(struct task_struct *p)
+void resched_curr(struct rq *rq)
 {
+	struct task_struct *curr = rq->curr;
 	int cpu;
 
-	lockdep_assert_held(&task_rq(p)->lock);
+	lockdep_assert_held(&rq->lock);
 
-	if (test_tsk_need_resched(p))
+	if (test_tsk_need_resched(curr))
 		return;
 
-	cpu = task_cpu(p);
+	cpu = cpu_of(rq);
 
 	if (cpu == smp_processor_id()) {
-		set_tsk_need_resched(p);
+		set_tsk_need_resched(curr);
 		set_preempt_need_resched();
 		return;
 	}
 
-	if (set_nr_and_not_polling(p))
+	if (set_nr_and_not_polling(curr))
 		smp_send_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
 
 	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
 		return;
-	resched_task(cpu_curr(cpu));
+	resched_curr(rq);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
 
 static bool wake_up_full_nohz_cpu(int cpu)
 {
+	/*
+	 * We just need the target to call irq_exit() and re-evaluate
+	 * the next tick. The nohz full kick at least implies that.
+	 * If needed we can still optimize that later with an
+	 * empty IRQ.
+	 */
 	if (tick_nohz_full_cpu(cpu)) {
 		if (cpu != smp_processor_id() ||
 		    tick_nohz_tick_stopped())
-			smp_send_reschedule(cpu);
+			tick_nohz_full_kick_cpu(cpu);
 		return true;
 	}
 
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
 #ifdef CONFIG_NO_HZ_FULL
 bool sched_can_stop_tick(void)
 {
-	struct rq *rq;
-
-	rq = this_rq();
-
-	/* Make sure rq->nr_running update is visible after the IPI */
-	smp_rmb();
-
-	/* More than one running task need preemption */
-	if (rq->nr_running > 1)
+	/*
+	 * More than one running task need preemption.
+	 * nr_running update is assumed to be visible
+	 * after IPI is sent from wakers.
+	 */
+	if (this_rq()->nr_running > 1)
 		return false;
 
 	return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
 
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		if (class == rq->curr->sched_class)
 			break;
 		if (class == p->sched_class) {
-			resched_task(rq->curr);
+			resched_curr(rq);
 			break;
 		}
 	}
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 
-	if (llist_empty(&this_rq()->wake_list)
-			&& !tick_nohz_full_cpu(smp_processor_id())
-			&& !got_nohz_idle_kick())
+	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
-	tick_nohz_full_check();
 	sched_ttwu_pending();
 
 	/*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 {
 	u64 ns = 0;
 
-	if (task_current(rq, p)) {
+	/*
+	 * Must be ->curr _and_ ->on_rq. If dequeued, we would
+	 * project cycles that may never be accounted to this
+	 * thread, breaking clock_gettime().
+	 */
+	if (task_current(rq, p) && p->on_rq) {
 		update_rq_clock(rq);
 		ns = rq_clock_task(rq) - p->se.exec_start;
 		if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
 	 * If we race with it entering cpu, unaccounted time is 0. This is
 	 * indistinguishable from the read occurring a few cycles earlier.
+	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+	 * been accounted, so we're correct here as well.
 	 */
-	if (!p->on_cpu)
+	if (!p->on_cpu || !p->on_rq)
 		return p->se.sum_exec_runtime;
 #endif
 
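Both hunks above tighten the ->on_rq checks that back the per-task CPU clocks,
i.e. the values reported by clock_gettime() for thread and process CPU-time
clock IDs. A runnable userspace probe of that path (standard POSIX API; link
with -lrt on older glibc):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;
            volatile unsigned long spin;

            /* Burn some CPU so there is runtime to report. */
            for (spin = 0; spin < 100000000UL; spin++)
                    ;

            if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
                    perror("clock_gettime");
                    return 1;
            }
            printf("thread CPU time: %ld.%09ld s\n", ts.tv_sec, ts.tv_nsec);
            return 0;
    }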
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	}
 
 	trace_sched_pi_setprio(p, prio);
-	p->pi_top_task = rt_mutex_get_top_task(p);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	 * running task
 	 */
 	if (dl_prio(prio)) {
-		if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
-			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+		struct task_struct *pi_task = rt_mutex_get_top_task(p);
+		if (!dl_prio(p->normal_prio) ||
+		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
 			p->dl.dl_throttled = 0;
 			enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		 * lowered its priority, then reschedule its CPU:
 		 */
 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
-			resched_task(rq->curr);
+			resched_curr(rq);
 	}
 out_unlock:
 	task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_yielded = 0;
 }
 
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY	-1
+
 static void __setscheduler_params(struct task_struct *p,
 		const struct sched_attr *attr)
 {
 	int policy = attr->sched_policy;
 
-	if (policy == -1) /* setparam */
+	if (policy == SETPARAM_POLICY)
 		policy = p->policy;
 
 	p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
 		.sched_nice	= PRIO_TO_NICE(p->static_prio),
 	};
 
-	/*
-	 * Fixup the legacy SCHED_RESET_ON_FORK hack
-	 */
-	if (policy & SCHED_RESET_ON_FORK) {
+	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 		policy &= ~SCHED_RESET_ON_FORK;
 		attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
 */
 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
-	return do_sched_setscheduler(pid, -1, param);
+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
 }
 
 /**
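SETPARAM_POLICY only names the existing convention: sched_setparam() changes a
task's priority while leaving its policy alone. A runnable userspace sketch of
that contract (needs privilege to set a realtime priority):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            /* First give ourselves a realtime policy... */
            if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
                    perror("sched_setscheduler");
                    return 1;
            }
            /* ...then change only the priority; SCHED_FIFO is preserved. */
            sp.sched_priority = 20;
            if (sched_setparam(0, &sp) != 0) {
                    perror("sched_setparam");
                    return 1;
            }
            printf("policy is still %d (SCHED_FIFO=%d)\n",
                   sched_getscheduler(0), SCHED_FIFO);
            return 0;
    }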
@@ -4285,7 +4304,7 @@ again:
 		 * fairness.
 		 */
 		if (preempt && rq != p_rq)
-			resched_task(p_rq->curr);
+			resched_curr(p_rq);
 	}
 
 out_unlock:
@@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
 		sd->child = child;
+
+		if (!cpumask_subset(sched_domain_span(child),
+				    sched_domain_span(sd))) {
+			pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+			pr_err("     the %s domain not a subset of the %s domain\n",
+					child->name, sd->name);
+#endif
+			/* Fixup, ensure @sd has at least @child cpus. */
+			cpumask_or(sched_domain_span(sd),
+				   sched_domain_span(sd),
+				   sched_domain_span(child));
+		}
+
 	}
 	set_domain_attribute(sd, attr);
 
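The fixup above widens a parent domain's span with a bitwise OR so that it
covers at least its child. A userspace illustration of the same
subset-check-then-OR logic on plain bitmasks (not kernel code; the kernel's
cpumask_* helpers operate on arbitrary-length masks, uint64_t stands in here):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t parent = 0x0f;  /* CPUs 0-3 */
            uint64_t child  = 0x18;  /* CPUs 3-4: CPU 4 not in the parent */

            if ((child & ~parent) != 0) {   /* !cpumask_subset(child, parent) */
                    fprintf(stderr, "BUG: arch topology borken\n");
                    parent |= child;        /* cpumask_or(parent, parent, child) */
            }
            printf("fixed-up parent span: 0x%" PRIx64 "\n", parent);
            return 0;
    }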
@@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		resched_task(rq->curr);
+		resched_curr(rq);
 	}
 
 	check_class_changed(rq, p, prev_class, old_prio);
@@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
+	/*
+	 * Prevent race between setting of cfs_rq->runtime_enabled and
+	 * unthrottle_offline_cfs_rqs().
+	 */
+	get_online_cpus();
 	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
@@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 
-	for_each_possible_cpu(i) {
+	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
 
@@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		cfs_bandwidth_usage_dec();
 out_unlock:
 	mutex_unlock(&cfs_constraints_mutex);
+	put_online_cpus();
 
 	return ret;
 }
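tg_set_cfs_bandwidth() is reached when CFS bandwidth limits are written
through the cpu cgroup controller. A hedged sketch of driving it from
userspace, assuming a cgroup v1 hierarchy mounted at /sys/fs/cgroup/cpu with
an existing group named "demo" (both the mount path and the group name are
assumptions): cap the group at half a CPU by granting 50ms of quota per 100ms
period.

    #include <stdio.h>

    static int write_val(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return -1;
            }
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            if (write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "100000"))
                    return 1;
            if (write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "50000"))
                    return 1;
            return 0;
    }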