Merge tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The changes in this cycle are:

   - Optimize the task wakeup CPU selection logic, to improve scalability
     and reduce wakeup latency spikes

   - PELT enhancements

   - CFS bandwidth handling fixes

   - Optimize the wakeup path by removing rq->wake_list and replacing it
     with ->ttwu_pending

   - Optimize IPI cross-calls by making flush_smp_call_function_queue()
     process sync callbacks first

   - Misc fixes and enhancements"

* tag 'sched-core-2020-06-02' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  irq_work: Define irq_work_single() on !CONFIG_IRQ_WORK too
  sched/headers: Split out open-coded prototypes into kernel/sched/smp.h
  sched: Replace rq::wake_list
  sched: Add rq::ttwu_pending
  irq_work, smp: Allow irq_work on call_single_queue
  smp: Optimize send_call_function_single_ipi()
  smp: Move irq_work_run() out of flush_smp_call_function_queue()
  smp: Optimize flush_smp_call_function_queue()
  sched: Fix smp_call_function_single_async() usage for ILB
  sched/core: Offload wakee task activation if the wakee is descheduling
  sched/core: Optimize ttwu() spinning on p->on_cpu
  sched: Defend cfs and rt bandwidth quota against overflow
  sched/cpuacct: Fix charge cpuacct.usage_sys
  sched/fair: Replace zero-length array with flexible-array
  sched/pelt: Sync util/runnable_sum with PELT window when propagating
  sched/cpuacct: Use __this_cpu_add() instead of this_cpu_ptr()
  sched/fair: Optimize enqueue_task_fair()
  sched: Make scheduler_ipi inline
  sched: Clean up scheduler_ipi()
  sched/core: Simplify sched_init()
  ...
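At the heart of the wake_list to ttwu_pending rework below is a lock-free singly linked list: wakers push a task's wake_entry with a release-ordered push, and the target CPU drains the whole list with a single atomic swap from its IPI handler. What follows is a minimal userspace sketch of that push/drain pattern using C11 atomics; the type and function names (wake_list_t, wake_list_push, wake_list_drain) are invented for illustration and this is not the kernel's llist implementation:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        struct node *next;
        int task_id;
};

typedef _Atomic(struct node *) wake_list_t;

/* Waker side: lock-free push. Returns nonzero if the list was empty,
 * i.e. this push is the one that must send the IPI. */
static int wake_list_push(wake_list_t *head, struct node *n)
{
        struct node *first = atomic_load_explicit(head, memory_order_relaxed);

        do {
                n->next = first;
        } while (!atomic_compare_exchange_weak_explicit(head, &first, n,
                        memory_order_release, memory_order_relaxed));
        return first == NULL;
}

/* Wakee side, from the IPI handler: take the whole list in one swap. */
static struct node *wake_list_drain(wake_list_t *head)
{
        return atomic_exchange_explicit(head, NULL, memory_order_acquire);
}

int main(void)
{
        static wake_list_t head;
        struct node a = { .task_id = 1 }, b = { .task_id = 2 };

        if (wake_list_push(&head, &a))
                printf("list was empty: would send IPI\n");
        wake_list_push(&head, &b);

        for (struct node *n = wake_list_drain(&head); n; n = n->next)
                printf("activating task %d\n", n->task_id);
        return 0;
}

The kernel's llist_add() similarly reports whether the list was previously empty, which is what lets a waker send at most one IPI per batch of queued wakeups.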
@@ -21,6 +21,7 @@
 #include "../smpboot.h"

 #include "pelt.h"
+#include "smp.h"

 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }

+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+	csd->flags = 0;
+	csd->func = func;
+	csd->info = rq;
+}

 #ifdef CONFIG_SCHED_HRTICK
 /*
@@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 		      HRTIMER_MODE_REL_PINNED_HARD);
 }

 #endif /* CONFIG_SMP */

 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
-	rq->hrtick_csd.flags = 0;
-	rq->hrtick_csd.func = __hrtick_start;
-	rq->hrtick_csd.info = rq;
+	rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
 #endif

 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	rq->hrtick_timer.function = hrtick;
 }
@@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
 		wake_up_idle_cpu(cpu);
 }

-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
 {
-	int cpu = smp_processor_id();
-
-	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
-		return false;
-
-	if (idle_cpu(cpu) && !need_resched())
-		return true;
+	struct rq *rq = info;
+	int cpu = cpu_of(rq);
+	unsigned int flags;

 	/*
-	 * We can't run Idle Load Balance on this CPU for this time so we
-	 * cancel it and clear NOHZ_BALANCE_KICK
+	 * Release the rq::nohz_csd.
 	 */
-	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
-	return false;
-}
+	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+	WARN_ON(!(flags & NOHZ_KICK_MASK));

-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
-	return false;
+	rq->idle_balance = idle_cpu(cpu);
+	if (rq->idle_balance && !need_resched()) {
+		rq->nohz_idle_balance = flags;
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+	}
 }

 #endif /* CONFIG_NO_HZ_COMMON */
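nohz_csd_func() above relies on atomic_fetch_andnot() to clear NOHZ_KICK_MASK and simultaneously observe which kick bits were set, so a concurrent kicker can never be lost between a separate read and clear. A rough C11 equivalent of that claim-and-read step (the flag values here are made up for the sketch; the kernel's NOHZ_* bit layout differs):

#include <stdatomic.h>
#include <stdio.h>

#define NOHZ_BALANCE_KICK  0x1  /* hypothetical bit values for this sketch */
#define NOHZ_STATS_KICK    0x2
#define NOHZ_KICK_MASK     (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

int main(void)
{
        atomic_uint nohz_flags = NOHZ_BALANCE_KICK | NOHZ_STATS_KICK;

        /* Equivalent of atomic_fetch_andnot(NOHZ_KICK_MASK, ...): clear
         * the kick bits and learn, in one atomic step, which bits were
         * set, so a concurrent kick cannot slip through unobserved. */
        unsigned int flags = atomic_fetch_and(&nohz_flags, ~NOHZ_KICK_MASK);

        if (flags & NOHZ_BALANCE_KICK)
                printf("was kicked for idle load balance\n");
        if (flags & NOHZ_STATS_KICK)
                printf("was kicked for blocked-load stats\n");
        return 0;
}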
@@ -1540,7 +1540,7 @@ static int migration_cpu_stop(void *data)
 	 * __migrate_task() such that we will not miss enforcing cpus_ptr
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
-	sched_ttwu_pending();
+	flush_smp_call_function_from_idle();

 	raw_spin_lock(&p->pi_lock);
 	rq_lock(rq, &rf);
@@ -2274,16 +2274,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }

 #ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
 {
+	struct llist_node *llist = arg;
 	struct rq *rq = this_rq();
-	struct llist_node *llist = llist_del_all(&rq->wake_list);
 	struct task_struct *p, *t;
 	struct rq_flags rf;

 	if (!llist)
 		return;

+	/*
+	 * rq::ttwu_pending is a racy indication of outstanding wakeups.
+	 * Races such that false-negatives are possible, since they
+	 * are shorter lived than false-positives would be.
+	 */
+	WRITE_ONCE(rq->ttwu_pending, 0);
+
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
@@ -2293,56 +2300,30 @@ void sched_ttwu_pending(void)
 	rq_unlock_irqrestore(rq, &rf);
 }

-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
 {
-	/*
-	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-	 * TIF_NEED_RESCHED remotely (for the first time) will also send
-	 * this IPI.
-	 */
-	preempt_fold_need_resched();
+	struct rq *rq = cpu_rq(cpu);

-	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-		return;
-
-	/*
-	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-	 * traditionally all their work was done from the interrupt return
-	 * path. Now that we actually do some work, we need to make sure
-	 * we do call them.
-	 *
-	 * Some archs already do call them, luckily irq_enter/exit nest
-	 * properly.
-	 *
-	 * Arguably we should visit all archs and update all handlers,
-	 * however a fair share of IPIs are still resched only so this would
-	 * somewhat pessimize the simple resched case.
-	 */
-	irq_enter();
-	sched_ttwu_pending();
-
-	/*
-	 * Check if someone kicked us for doing the nohz idle load balance.
-	 */
-	if (unlikely(got_nohz_idle_kick())) {
-		this_rq()->idle_balance = 1;
-		raise_softirq_irqoff(SCHED_SOFTIRQ);
-	}
-	irq_exit();
+	if (!set_nr_if_polling(rq->idle))
+		arch_send_call_function_single_ipi(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
 }

-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);

 	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

-	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-		if (!set_nr_if_polling(rq->idle))
-			smp_send_reschedule(cpu);
-		else
-			trace_sched_wake_idle_without_ipi(cpu);
-	}
+	WRITE_ONCE(rq->ttwu_pending, 1);
+	__smp_call_single_queue(cpu, &p->wake_entry);
 }

 void wake_up_if_idle(int cpu)
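Both send_call_function_single_ipi() and the old ttwu_queue_remote() gate the interrupt on set_nr_if_polling(): if the remote idle task is polling its thread flags, setting TIF_NEED_RESCHED is enough to wake it and the IPI can be elided. A hedged sketch of that check with C11 atomics (the bit values are invented and the real helper operates on the remote task's thread_info flags):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TIF_NEED_RESCHED   0x1  /* bit layout invented for this sketch */
#define TIF_POLLING_NRFLAG 0x2

/* If the remote idle loop is polling its flags word, setting
 * NEED_RESCHED is enough to wake it; otherwise an IPI is required. */
static bool set_nr_if_polling(atomic_uint *ti_flags)
{
        unsigned int val = atomic_load_explicit(ti_flags, memory_order_relaxed);

        for (;;) {
                if (!(val & TIF_POLLING_NRFLAG))
                        return false;           /* not polling: IPI needed */
                if (val & TIF_NEED_RESCHED)
                        return true;            /* already set: no IPI needed */
                if (atomic_compare_exchange_weak(ti_flags, &val,
                                                 val | TIF_NEED_RESCHED))
                        return true;
        }
}

int main(void)
{
        atomic_uint flags = TIF_POLLING_NRFLAG;

        if (set_nr_if_polling(&flags))
                printf("idle CPU is polling: skipped the IPI\n");
        else
                printf("sending IPI\n");
        return 0;
}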
@@ -2373,6 +2354,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+	/*
+	 * If the CPU does not share cache, then queue the task on the
+	 * remote rq's wakelist to avoid accessing remote data.
+	 */
+	if (!cpus_share_cache(smp_processor_id(), cpu))
+		return true;
+
+	/*
+	 * If the task is descheduling and the only running task on the
+	 * CPU then use the wakelist to offload the task activation to
+	 * the soon-to-be-idle CPU as the current CPU is likely busy.
+	 * nr_running is checked to avoid unnecessary task stacking.
+	 */
+	if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+		return true;
+
+	return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+		__ttwu_queue_wakelist(p, cpu, wake_flags);
+		return true;
+	}
+
+	return false;
+}
 #endif /* CONFIG_SMP */

 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2381,11 +2394,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 	struct rq_flags rf;

 #if defined(CONFIG_SMP)
-	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-		ttwu_queue_remote(p, cpu, wake_flags);
+	if (ttwu_queue_wakelist(p, cpu, wake_flags))
 		return;
-	}
 #endif

 	rq_lock(rq, &rf);
@@ -2569,7 +2579,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto unlock;

+	if (p->in_iowait) {
+		delayacct_blkio_end(p);
+		atomic_dec(&task_rq(p)->nr_iowait);
+	}
+
 #ifdef CONFIG_SMP
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
+
 	/*
 	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
 	 * possible to, falsely, observe p->on_cpu == 0.
@@ -2591,6 +2609,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();

+	/*
+	 * If the owning (remote) CPU is still in the middle of schedule() with
+	 * this task as prev, consider queueing p on the remote CPU's wake_list
+	 * which potentially sends an IPI instead of spinning on p->on_cpu to
+	 * let the waker make forward progress. This is safe because IRQs are
+	 * disabled and the IPI will deliver after on_cpu is cleared.
+	 */
+	if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+		goto unlock;
+
 	/*
 	 * If the owning (remote) CPU is still in the middle of schedule() with
 	 * this task as prev, wait until it's done referencing the task.
@@ -2602,28 +2630,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_cond_load_acquire(&p->on_cpu, !VAL);

-	p->sched_contributes_to_load = !!task_contributes_to_load(p);
-	p->state = TASK_WAKING;
-
-	if (p->in_iowait) {
-		delayacct_blkio_end(p);
-		atomic_dec(&task_rq(p)->nr_iowait);
-	}
-
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
 		psi_ttwu_dequeue(p);
 		set_task_cpu(p, cpu);
 	}

 #else /* CONFIG_SMP */

-	if (p->in_iowait) {
-		delayacct_blkio_end(p);
-		atomic_dec(&task_rq(p)->nr_iowait);
-	}
-
 #endif /* CONFIG_SMP */

 	ttwu_queue(p, cpu, wake_flags);
@@ -2751,6 +2763,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->capture_control = NULL;
 #endif
 	init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+	p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
 }

 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -3951,6 +3966,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
 	schedstat_inc(this_rq()->sched_count);
 }

+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+				  struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+	const struct sched_class *class;
+	/*
+	 * We must do the balancing pass before put_prev_task(), such
+	 * that when we release the rq->lock the task is in the same
+	 * state as before we took rq->lock.
+	 *
+	 * We can terminate the balance pass as soon as we know there is
+	 * a runnable task of @class priority or higher.
+	 */
+	for_class_range(class, prev->sched_class, &idle_sched_class) {
+		if (class->balance(rq, prev, rf))
+			break;
+	}
+#endif
+
+	put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
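put_prev_task_balance() folds the balance callbacks into a single walk over the scheduling classes in priority order, stopping at the first class that reports a runnable task; pick_next_task() then walks the same ordering to pick the task. A toy model of that for_class_range()/for_each_class() iteration (the two classes and their callbacks are invented for this sketch):

#include <stddef.h>
#include <stdio.h>

struct sched_class {
        const char *name;
        int (*has_runnable)(void);      /* NULL: class always has a task */
};

static int rt_has_runnable(void)  { return 0; }
static int cfs_has_runnable(void) { return 1; }

/* Highest priority first, idle last, mirroring the kernel's ordering. */
static const struct sched_class classes[] = {
        { "rt",   rt_has_runnable },
        { "fair", cfs_has_runnable },
        { "idle", NULL },
};

int main(void)
{
        /* Rough equivalent of for_each_class(class) in pick_next_task():
         * the first class with a runnable task wins. */
        for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                if (!classes[i].has_runnable || classes[i].has_runnable()) {
                        printf("picked a task from the %s class\n",
                               classes[i].name);
                        break;
                }
        }
        return 0;
}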
@@ -3984,22 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	}

 restart:
-#ifdef CONFIG_SMP
-	/*
-	 * We must do the balancing pass before put_next_task(), such
-	 * that when we release the rq->lock the task is in the same
-	 * state as before we took rq->lock.
-	 *
-	 * We can terminate the balance pass as soon as we know there is
-	 * a runnable task of @class priority or higher.
-	 */
-	for_class_range(class, prev->sched_class, &idle_sched_class) {
-		if (class->balance(rq, prev, rf))
-			break;
-	}
-#endif
-
-	put_prev_task(rq, prev);
+	put_prev_task_balance(rq, prev, rf);

 	for_each_class(class) {
 		p = class->pick_next_task(rq);
@@ -4689,7 +4711,7 @@ int idle_cpu(int cpu)
 		return 0;

 #ifdef CONFIG_SMP
-	if (!llist_empty(&rq->wake_list))
+	if (rq->ttwu_pending)
 		return 0;
 #endif

@@ -6243,13 +6265,14 @@ void idle_task_exit(void)
 	struct mm_struct *mm = current->active_mm;

 	BUG_ON(cpu_online(smp_processor_id()));
+	BUG_ON(current != this_rq()->idle);

 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
+		current->active_mm = &init_mm;
 		finish_arch_post_lock_switch();
 	}
-	mmdrop(mm);
+
+	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }

 /*
@@ -6539,7 +6562,6 @@ int sched_cpu_dying(unsigned int cpu)
 	struct rq_flags rf;

 	/* Handle pending wakeups and then migrate everything off */
-	sched_ttwu_pending();
 	sched_tick_stop(cpu);

 	rq_lock_irqsave(rq, &rf);
@@ -6642,6 +6664,8 @@ void __init sched_init(void)
 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);

+		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6694,7 +6718,6 @@ void __init sched_init(void)
 		init_rt_rq(&rq->rt);
 		init_dl_rq(&rq->dl);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
 		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 		/*
@@ -6716,7 +6739,6 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
-		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */

@@ -6744,6 +6766,8 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_blocked_load_update_tick = jiffies;
 		atomic_set(&rq->nohz_flags, 0);
+
+		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
@@ -7438,6 +7462,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);

 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

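The 203-day figure in the comment follows from the bandwidth-shift bound. Assuming the definitions that accompany this change in kernel/sched/sched.h (BW_SHIFT = 20 and MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1, expressed in microseconds; these values are recalled here, not shown in this diff), the arithmetic checks out:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned BW_SHIFT = 20;
        const uint64_t MAX_BW = (1ULL << (64 - BW_SHIFT)) - 1;  /* usec */
        const uint64_t NSEC_PER_USEC = 1000;
        uint64_t max_cfs_runtime = MAX_BW * NSEC_PER_USEC;      /* nsec */

        /* 2^44 - 1 usec ~= 1.76e13 usec ~= 1.76e7 sec ~= 203.7 days */
        printf("max runtime: %llu ns (~%.1f days)\n",
               (unsigned long long)max_cfs_runtime,
               max_cfs_runtime / 1e9 / 86400.0);
        return 0;
}

Capping quota at max_cfs_runtime keeps the later quota << BW_SHIFT computation from overflowing a u64.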
@@ -7465,6 +7491,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;

+	/*
+	 * Bound quota to defend quota against overflow during bandwidth shift.
+	 */
+	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+		return -EINVAL;
+
 	/*
 	 * Prevent race between setting of cfs_rq->runtime_enabled and
 	 * unthrottle_offline_cfs_rqs().
 	 */