Merge tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Improve uclamp performance by using a static key for the fast path

 - Add the "sched_util_clamp_min_rt_default" sysctl, to optimize for
   better power efficiency of RT tasks on battery powered devices.
   (The default is to maximize performance & reduce RT latencies.)

 - Improve utime and stime tracking accuracy, which had a fixed boundary
   of error that created larger and larger relative errors as the values
   became larger. This is now replaced with more precise arithmetic,
   using the new mul_u64_u64_div_u64() helper in math64.h.

 - Improve the deadline scheduler, such as making it capacity aware

 - Improve frequency-invariant scheduling

 - Misc cleanups in energy/power aware scheduling

 - Add sched_update_nr_running tracepoint to track changes to nr_running

 - Documentation additions and updates

 - Misc cleanups and smaller fixes

* tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/doc: Factorize bits between sched-energy.rst & sched-capacity.rst
  sched/doc: Document capacity aware scheduling
  sched: Document arch_scale_*_capacity()
  arm, arm64: Fix selection of CONFIG_SCHED_THERMAL_PRESSURE
  Documentation/sysctl: Document uclamp sysctl knobs
  sched/uclamp: Add a new sysctl to control RT default boost value
  sched/uclamp: Fix a deadlock when enabling uclamp static key
  sched: Remove duplicated tick_nohz_full_enabled() check
  sched: Fix a typo in a comment
  sched/uclamp: Remove unnecessary mutex_init()
  arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE
  sched: Cleanup SCHED_THERMAL_PRESSURE kconfig entry
  arch_topology, sched/core: Cleanup thermal pressure definition
  trace/events/sched.h: fix duplicated word
  linux/sched/mm.h: drop duplicated words in comments
  smp: Fix a potential usage of stale nr_cpus
  sched/fair: update_pick_idlest() Select group with lowest group_util when idle_cpus are equal
  sched: nohz: stop passing around unused "ticks" parameter.
  sched: Better document ttwu()
  sched: Add a tracepoint to track rq->nr_running
  ...
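
As a rough illustration of the utime/stime point above: scaling one component by rtime / (utime + stime) needs a 128-bit intermediate to stay exact, which is what a mul-then-divide helper provides. A minimal userspace sketch follows; the local helper stands in for the kernel's mul_u64_u64_div_u64() and is not the kernel implementation, and the tick counts are made up.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel helper: a * b / c with a 128-bit intermediate
 * (GCC/Clang __int128) so the product cannot overflow before dividing. */
static uint64_t mul_u64_u64_div_u64(uint64_t a, uint64_t b, uint64_t c)
{
        return (uint64_t)(((unsigned __int128)a * b) / c);
}

int main(void)
{
        /* Hypothetical per-task tick counts and measured runtime. */
        uint64_t utime = 123456789ULL, stime = 987654321ULL;
        uint64_t rtime = 1000000000ULL;

        /* stime scaled to the rtime total; utime takes the remainder,
         * so the two components always add up to rtime exactly. */
        uint64_t scaled_stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
        uint64_t scaled_utime = rtime - scaled_stime;

        printf("stime=%llu utime=%llu\n",
               (unsigned long long)scaled_stime,
               (unsigned long long)scaled_utime);
        return 0;
}
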
@@ -6,6 +6,10 @@
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 */
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS

#include "sched.h"

#include <linux/nospec.h>

@@ -23,9 +27,6 @@
#include "pelt.h"
#include "smp.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
@@ -36,6 +37,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

@@ -75,6 +79,100 @@ __read_mostly int scheduler_running;
 */
int sysctl_sched_rt_runtime = 950000;


/*
 * Serialization rules:
 *
 * Lock order:
 *
 *   p->pi_lock
 *     rq->lock
 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 *
 *  rq1->lock
 *    rq2->lock  where: rq1 < rq2
 *
 * Regular state:
 *
 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
 * local CPU's rq->lock, it optionally removes the task from the runqueue and
 * always looks at the local rq data structures to find the most eligible task
 * to run next.
 *
 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
 * the local CPU to avoid bouncing the runqueue state around [ see
 * ttwu_queue_wakelist() ]
 *
 * Task wakeup, specifically wakeups that involve migration, are horribly
 * complicated to avoid having to take two rq->locks.
 *
 * Special state:
 *
 * System-calls and anything external will use task_rq_lock() which acquires
 * both p->pi_lock and rq->lock. As a consequence the state they change is
 * stable while holding either lock:
 *
 *  - sched_setaffinity()/
 *    set_cpus_allowed_ptr():   p->cpus_ptr, p->nr_cpus_allowed
 *  - set_user_nice():          p->se.load, p->*prio
 *  - __sched_setscheduler():   p->sched_class, p->policy, p->*prio,
 *                              p->se.load, p->rt_priority,
 *                              p->dl.dl_{runtime, deadline, period, flags, bw, density}
 *  - sched_setnuma():          p->numa_preferred_nid
 *  - sched_move_task()/
 *    cpu_cgroup_fork():        p->sched_task_group
 *  - uclamp_update_active()    p->uclamp*
 *
 * p->state <- TASK_*:
 *
 *   is changed locklessly using set_current_state(), __set_current_state() or
 *   set_special_state(), see their respective comments, or by
 *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
 *   concurrent self.
 *
 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 *
 *   is set by activate_task() and cleared by deactivate_task(), under
 *   rq->lock. Non-zero indicates the task is runnable, the special
 *   ON_RQ_MIGRATING state is used for migration without holding both
 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 *
 * p->on_cpu <- { 0, 1 }:
 *
 *   is set by prepare_task() and cleared by finish_task() such that it will be
 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 *   under rq->lock. Non-zero indicates the task is running on its CPU.
 *
 *   [ The astute reader will observe that it is possible for two tasks on one
 *     CPU to have ->on_cpu = 1 at the same time. ]
 *
 * task_cpu(p): is changed by set_task_cpu(), the rules are:
 *
 *  - Don't call set_task_cpu() on a blocked task:
 *
 *    We don't care what CPU we're not running on, this simplifies hotplug,
 *    the CPU assignment of blocked tasks isn't required to be valid.
 *
 *  - for try_to_wake_up(), called under p->pi_lock:
 *
 *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
 *
 *  - for migration called under rq->lock:
 *    [ see task_on_rq_migrating() in task_rq_lock() ]
 *
 *    o move_queued_task()
 *    o detach_task()
 *
 *  - for migration called under double_rq_lock():
 *
 *    o __migrate_swap_task()
 *    o push_rt_task() / pull_rt_task()
 *    o push_dl_task() / pull_dl_task()
 *    o dl_task_offline_migration()
 *
 */
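
The "rq1->lock then rq2->lock where rq1 < rq2" rule above is the classic address-ordered way of taking two locks without risking an ABBA deadlock. A small userspace illustration of the same idea, with hypothetical names and pthread mutexes standing in for rq->lock (this is not the kernel's double_rq_lock() body):

#include <pthread.h>
#include <stdint.h>

struct rq_like {
        pthread_mutex_t lock;
        /* ... per-queue state ... */
};

/* Always lock the lower-addressed queue first, mirroring the rule above,
 * so two CPUs locking the same pair can never deadlock on each other. */
static void double_lock(struct rq_like *a, struct rq_like *b)
{
        if (a == b) {
                pthread_mutex_lock(&a->lock);
                return;
        }
        if ((uintptr_t)a > (uintptr_t)b) {
                struct rq_like *tmp = a;
                a = b;
                b = tmp;
        }
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct rq_like *a, struct rq_like *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}
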

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
@@ -791,9 +889,46 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * By default RT tasks run at the maximum performance point/capacity of the
 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
 * SCHED_CAPACITY_SCALE.
 *
 * This knob allows admins to change the default behavior when uclamp is being
 * used. In battery powered devices, particularly, running at the maximum
 * capacity and frequency will increase energy consumption and shorten the
 * battery life.
 *
 * This knob only affects RT tasks whose uclamp_se->user_defined == false.
 *
 * This knob will not override the system default sched_util_clamp_min defined
 * above.
 */
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
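
For illustration, an admin who wants RT tasks without a user-defined clamp to stop defaulting to the maximum boost on a battery powered device could write 0 to the new knob. A hedged sketch; the proc path below follows the usual kernel.* sysctl naming and should be checked against the Documentation/sysctl update in this series:

#include <stdio.h>

/* Assumed path for the knob declared above; 0 = no default boost,
 * 1024 (SCHED_CAPACITY_SCALE) = keep the historical max-boost behaviour. */
#define RT_DEFAULT_SYSCTL "/proc/sys/kernel/sched_util_clamp_min_rt_default"

int main(void)
{
        FILE *f = fopen(RT_DEFAULT_SYSCTL, "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "0\n");
        fclose(f);
        return 0;
}
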

/* All clamps are required to be less than or equal to these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * This static key is used to reduce the uclamp overhead in the fast path. It
 * primarily disables the call to uclamp_rq_{inc, dec}() in
 * enqueue/dequeue_task().
 *
 * This allows users to continue to enable uclamp in their kernel config with
 * minimum uclamp overhead in the fast path.
 *
 * As soon as userspace modifies any of the uclamp knobs, the static key is
 * enabled, since we have actual users that make use of uclamp
 * functionality.
 *
 * The knobs that would enable this static key are:
 *
 *   * A task modifying its uclamp value with sched_setattr().
 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
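
UCLAMP_BUCKET_DELTA slices the [0..SCHED_CAPACITY_SCALE] range into UCLAMP_BUCKETS buckets; a clamp value maps to a bucket roughly as in the simplified sketch below (the kernel's min_t() spelled out, not a copy of the uclamp code):

/* With SCHED_CAPACITY_SCALE = 1024 and UCLAMP_BUCKETS = 5 (the default
 * CONFIG_UCLAMP_BUCKETS_COUNT), UCLAMP_BUCKET_DELTA is 205 and clamp
 * values map to buckets 0..4, the last bucket absorbing the remainder. */
static unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
        unsigned int id = clamp_value / UCLAMP_BUCKET_DELTA;

        return id < UCLAMP_BUCKETS - 1 ? id : UCLAMP_BUCKETS - 1;
}
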

@@ -873,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
        return uclamp_idle_value(rq, clamp_id, clamp_value);
}

static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
        unsigned int default_util_min;
        struct uclamp_se *uc_se;

        lockdep_assert_held(&p->pi_lock);

        uc_se = &p->uclamp_req[UCLAMP_MIN];

        /* Only sync if user didn't override the default */
        if (uc_se->user_defined)
                return;

        default_util_min = sysctl_sched_uclamp_util_min_rt_default;
        uclamp_se_set(uc_se, default_util_min, false);
}

static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
        struct rq_flags rf;
        struct rq *rq;

        if (!rt_task(p))
                return;

        /* Protect updates to p->uclamp_* */
        rq = task_rq_lock(p, &rf);
        __uclamp_update_util_min_rt_default(p);
        task_rq_unlock(rq, p, &rf);
}

static void uclamp_sync_util_min_rt_default(void)
{
        struct task_struct *g, *p;

        /*
         * copy_process()                       sysctl_uclamp
         *                                        uclamp_min_rt = X;
         *   write_lock(&tasklist_lock)            read_lock(&tasklist_lock)
         *   // link thread                        smp_mb__after_spinlock()
         *   write_unlock(&tasklist_lock)          read_unlock(&tasklist_lock);
         *   sched_post_fork()                     for_each_process_thread()
         *     __uclamp_sync_rt()                    __uclamp_sync_rt()
         *
         * Ensures that either sched_post_fork() will observe the new
         * uclamp_min_rt or for_each_process_thread() will observe the new
         * task.
         */
        read_lock(&tasklist_lock);
        smp_mb__after_spinlock();
        read_unlock(&tasklist_lock);

        rcu_read_lock();
        for_each_process_thread(g, p)
                uclamp_update_util_min_rt_default(p);
        rcu_read_unlock();
}

static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -990,10 +1183,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,

        lockdep_assert_held(&rq->lock);

        /*
         * If sched_uclamp_used was enabled after task @p was enqueued,
         * we could end up with unbalanced call to uclamp_rq_dec_id().
         *
         * In this case the uc_se->active flag should be false since no uclamp
         * accounting was performed at enqueue time and we can just return
         * here.
         *
         * Need to be careful of the following enqueue/dequeue ordering
         * problem too
         *
         *      enqueue(taskA)
         *      // sched_uclamp_used gets enabled
         *      enqueue(taskB)
         *      dequeue(taskA)
         *      // Must not decrement bucket->tasks here
         *      dequeue(taskB)
         *
         * where we could end up with stale data in uc_se and
         * bucket[uc_se->bucket_id].
         *
         * The following check here eliminates the possibility of such race.
         */
        if (unlikely(!uc_se->active))
                return;

        bucket = &uc_rq->bucket[uc_se->bucket_id];

        SCHED_WARN_ON(!bucket->tasks);
        if (likely(bucket->tasks))
                bucket->tasks--;

        uc_se->active = false;

        /*
@@ -1021,6 +1242,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
        enum uclamp_id clamp_id;

        /*
         * Avoid any overhead until uclamp is actually used by the userspace.
         *
         * The condition is constructed such that a NOP is generated when
         * sched_uclamp_used is disabled.
         */
        if (!static_branch_unlikely(&sched_uclamp_used))
                return;

        if (unlikely(!p->sched_class->uclamp_enabled))
                return;

@@ -1036,6 +1266,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
        enum uclamp_id clamp_id;

        /*
         * Avoid any overhead until uclamp is actually used by the userspace.
         *
         * The condition is constructed such that a NOP is generated when
         * sched_uclamp_used is disabled.
         */
        if (!static_branch_unlikely(&sched_uclamp_used))
                return;

        if (unlikely(!p->sched_class->uclamp_enabled))
                return;

@@ -1114,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        bool update_root_tg = false;
        int old_min, old_max;
        int old_min, old_max, old_min_rt;
        int result;

        mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;
        old_min_rt = sysctl_sched_uclamp_util_min_rt_default;

        result = proc_dointvec(table, write, buffer, lenp, ppos);
        if (result)
@@ -1128,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                goto done;

        if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
            sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
            sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
            sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {

                result = -EINVAL;
                goto undo;
        }
@@ -1144,8 +1386,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                update_root_tg = true;
        }

        if (update_root_tg)
        if (update_root_tg) {
                static_branch_enable(&sched_uclamp_used);
                uclamp_update_root_tg();
        }

        if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
                static_branch_enable(&sched_uclamp_used);
                uclamp_sync_util_min_rt_default();
        }

        /*
         * We update all RUNNABLE tasks only when task groups are in use.
@@ -1158,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
        sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
        mutex_unlock(&uclamp_mutex);

@@ -1180,6 +1430,15 @@ static int uclamp_validate(struct task_struct *p,
        if (upper_bound > SCHED_CAPACITY_SCALE)
                return -EINVAL;

        /*
         * We have valid uclamp attributes; make sure uclamp is enabled.
         *
         * We need to do that here, because enabling static branches is a
         * blocking operation which obviously cannot be done while holding
         * scheduler locks.
         */
        static_branch_enable(&sched_uclamp_used);

        return 0;
}

@@ -1194,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
         */
        for_each_clamp_id(clamp_id) {
                struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
                unsigned int clamp_value = uclamp_none(clamp_id);

                /* Keep using defined clamps across class changes */
                if (uc_se->user_defined)
                        continue;

                /* By default, RT tasks always get 100% boost */
                /*
                 * RT by default have a 100% boost value that could be modified
                 * at runtime.
                 */
                if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
                        clamp_value = uclamp_none(UCLAMP_MAX);
                        __uclamp_update_util_min_rt_default(p);
                else
                        uclamp_se_set(uc_se, uclamp_none(clamp_id), false);

                uclamp_se_set(uc_se, clamp_value, false);
        }

        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1225,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
{
        enum uclamp_id clamp_id;

        /*
         * We don't need to hold task_rq_lock() when updating p->uclamp_* here
         * as the task is still at its early fork stages.
         */
        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;

@@ -1237,19 +1503,33 @@ static void uclamp_fork(struct task_struct *p)
        }
}

static void uclamp_post_fork(struct task_struct *p)
{
        uclamp_update_util_min_rt_default(p);
}

static void __init init_uclamp_rq(struct rq *rq)
{
        enum uclamp_id clamp_id;
        struct uclamp_rq *uc_rq = rq->uclamp;

        for_each_clamp_id(clamp_id) {
                uc_rq[clamp_id] = (struct uclamp_rq) {
                        .value = uclamp_none(clamp_id)
                };
        }

        rq->uclamp_flags = 0;
}

static void __init init_uclamp(void)
{
        struct uclamp_se uc_max = {};
        enum uclamp_id clamp_id;
        int cpu;

        mutex_init(&uclamp_mutex);

        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0,
                                sizeof(struct uclamp_rq)*UCLAMP_CNT);
                cpu_rq(cpu)->uclamp_flags = 0;
        }
        for_each_possible_cpu(cpu)
                init_uclamp_rq(cpu_rq(cpu));

        for_each_clamp_id(clamp_id) {
                uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1278,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */

@@ -1404,20 +1685,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
        const struct sched_class *class;

        if (p->sched_class == rq->curr->sched_class) {
        if (p->sched_class == rq->curr->sched_class)
                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
        } else {
                for_each_class(class) {
                        if (class == rq->curr->sched_class)
                                break;
                        if (class == p->sched_class) {
                                resched_curr(rq);
                                break;
                        }
                }
        }
        else if (p->sched_class > rq->curr->sched_class)
                resched_curr(rq);

        /*
         * A queue event has occurred, and we're going to schedule. In
@@ -1468,8 +1739,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
        lockdep_assert_held(&rq->lock);

        WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
        dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        deactivate_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
        rq_unlock(rq, rf);

@@ -1477,8 +1747,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,

        rq_lock(rq, rf);
        BUG_ON(task_cpu(p) != new_cpu);
        enqueue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(rq, p, 0);
        check_preempt_curr(rq, p, 0);

        return rq;
@@ -2243,12 +2512,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 * Consider @p being inside a wait loop:
 *
 *   for (;;) {
 *      set_current_state(TASK_UNINTERRUPTIBLE);
 *
 *      if (CONDITION)
 *         break;
 *
 *      schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * between set_current_state() and schedule(). In this case @p is still
 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
 * an atomic manner.
 *
 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
 * then schedule() must still happen and p->state can be changed to
 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
 * need to do a full wakeup with enqueue.
 *
 * Returns: %true when the wakeup is done,
 *          %false otherwise.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
        struct rq_flags rf;
        struct rq *rq;
@@ -2389,6 +2677,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)

        return false;
}

#else /* !CONFIG_SMP */

static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
        return false;
}

#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2396,10 +2692,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        struct rq *rq = cpu_rq(cpu);
        struct rq_flags rf;

#if defined(CONFIG_SMP)
        if (ttwu_queue_wakelist(p, cpu, wake_flags))
                return;
#endif

        rq_lock(rq, &rf);
        update_rq_clock(rq);
@@ -2455,8 +2749,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 * migration. However the means are completely different as there is no lock
 * chain to provide order. Instead we do:
 *
 *   1) smp_store_release(X->on_cpu, 0)
 *   2) smp_cond_load_acquire(!X->on_cpu)
 *   1) smp_store_release(X->on_cpu, 0) -- finish_task()
 *   2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
 *
 * Example:
 *
@@ -2496,15 +2790,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * If (@state & @p->state) @p->state = TASK_RUNNING.
 * Conceptually does:
 *
 *   If (@state & @p->state) @p->state = TASK_RUNNING.
 *
 * If the task was not queued/runnable, also place it back on a runqueue.
 *
 * Atomic against schedule() which would dequeue a task, also see
 * set_current_state().
 * This function is atomic against schedule() which would dequeue the task.
 *
 * This function executes a full memory barrier before accessing the task
 * state; see set_current_state().
 * It issues a full memory barrier before accessing @p->state, see the comment
 * with set_current_state().
 *
 * Uses p->pi_lock to serialize against concurrent wake-ups.
 *
 * Relies on p->pi_lock stabilizing:
 *  - p->sched_class
 *  - p->cpus_ptr
 *  - p->sched_task_group
 * in order to do migration, see its use of select_task_rq()/set_task_cpu().
 *
 * Tries really hard to only take one task_rq(p)->lock for performance.
 * Takes rq->lock in:
 *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
 *  - ttwu_queue()       -- new rq, for enqueue of the task;
 *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
 *
 * As a consequence we race really badly with just about everything. See the
 * many memory barriers and their comments for details.
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *         %false otherwise.
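
The wait/wake pairing this comment (and the ttwu_runnable() one above) keeps referring to is the following; a hedged kernel-style sketch rather than real kernel code, where CONDITION is the same placeholder the comments use and wake_up_process() is the usual thin wrapper around try_to_wake_up(p, TASK_NORMAL, 0):

/* Waiter side: the store to ->state and the later CONDITION test are
 * ordered by the barrier in set_current_state(), which pairs with the
 * smp_mb__after_spinlock() in try_to_wake_up(). */
static void waiter(void)
{
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (CONDITION)
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}

/* Waker side: publish the condition first, then wake the sleeper. */
static void waker(struct task_struct *p)
{
        CONDITION = 1;
        wake_up_process(p);
}
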

@@ -2520,7 +2832,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                /*
                 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
                 * == smp_processor_id()'. Together this means we can special
                 * case the whole 'p->on_rq && ttwu_remote()' case below
                 * case the whole 'p->on_rq && ttwu_runnable()' case below
                 * without taking any locks.
                 *
                 * In particular:
@@ -2541,8 +2853,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        /*
         * If we are going to wake up a thread waiting for CONDITION we
         * need to ensure that CONDITION=1 done by the caller can not be
         * reordered with p->state check below. This pairs with mb() in
         * set_current_state() the waiting thread does.
         * reordered with p->state check below. This pairs with smp_store_mb()
         * in set_current_state() that the waiting thread does.
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        smp_mb__after_spinlock();
@@ -2577,7 +2889,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
         */
        smp_rmb();
        if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
        if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
                goto unlock;

        if (p->in_iowait) {
@@ -2990,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
        return 0;
}

void sched_post_fork(struct task_struct *p)
{
        uclamp_post_fork(p);
}

unsigned long to_ratio(u64 period, u64 runtime)
{
        if (runtime == RUNTIME_INF)
@@ -3147,8 +3464,10 @@ static inline void prepare_task(struct task_struct *next)
        /*
         * Claim the task as running, we do this before switching to it
         * such that any running task will have this set.
         *
         * See the ttwu() WF_ON_CPU case and its ordering comment.
         */
        next->on_cpu = 1;
        WRITE_ONCE(next->on_cpu, 1);
#endif
}

@@ -3156,8 +3475,9 @@ static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->on_cpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * This must be the very last reference to @prev from this CPU. After
         * p->on_cpu is cleared, the task can be moved to a different CPU. We
         * must ensure this doesn't happen until the switch is completely
         * finished.
         *
         * In particular, the load of prev->state in finish_task_switch() must
@@ -3656,17 +3976,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
        return ns;
}

DEFINE_PER_CPU(unsigned long, thermal_pressure);

void arch_set_thermal_pressure(struct cpumask *cpus,
                               unsigned long th_pressure)
{
        int cpu;

        for_each_cpu(cpu, cpus)
                WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
}

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
@@ -4029,8 +4338,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
         * higher scheduling class, because otherwise those lose the
         * opportunity to pull in more work from other CPUs.
         */
        if (likely((prev->sched_class == &idle_sched_class ||
                    prev->sched_class == &fair_sched_class) &&
        if (likely(prev->sched_class <= &fair_sched_class &&
                   rq->nr_running == rq->cfs.h_nr_running)) {

                p = pick_next_task_fair(rq, prev, rf);
@@ -5519,6 +5827,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
        kattr.sched_nice = task_nice(p);

#ifdef CONFIG_UCLAMP_TASK
        /*
         * This could race with another potential updater, but this is fine
         * because it'll correctly read the old or the new value. We don't need
         * to guarantee who wins the race as long as it doesn't return garbage.
         */
        kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
        kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
@@ -5876,7 +6189,7 @@ again:
        if (task_running(p_rq, p) || p->state)
                goto out_unlock;

        yielded = curr->sched_class->yield_to_task(rq, p, preempt);
        yielded = curr->sched_class->yield_to_task(rq, p);
        if (yielded) {
                schedstat_inc(rq->yld_count);
                /*
@@ -6710,6 +7023,14 @@ void __init sched_init(void)
        unsigned long ptr = 0;
        int i;

        /* Make sure the linker didn't screw up */
        BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
               &fair_sched_class + 1  != &rt_sched_class ||
               &rt_sched_class + 1    != &dl_sched_class);
#ifdef CONFIG_SMP
        BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
#endif
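
These BUG_ON()s exist because pick_next_task() and check_preempt_curr() now compare struct sched_class pointers directly (see the 'prev->sched_class <= &fair_sched_class' and 'p->sched_class > rq->curr->sched_class' hunks above), which is only meaningful if the linker lays the classes out in ascending priority order. A tiny userspace illustration of the idea, with made-up names:

#include <stdio.h>

struct sched_class_like { const char *name; };

/* Placed in one array so the compiler guarantees ascending addresses,
 * the way the kernel relies on section ordering done by the linker. */
static const struct sched_class_like classes[] = {
        { "idle" }, { "fair" }, { "rt" }, { "dl" }, { "stop" },
};

int main(void)
{
        const struct sched_class_like *fair = &classes[1];
        const struct sched_class_like *rt   = &classes[2];

        /* Priority comparison collapses to a pointer comparison. */
        printf("rt preempts fair: %s\n", rt > fair ? "yes" : "no");
        return 0;
}
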

        wait_bit_init();

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7431,6 +7752,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
        if (req.ret)
                return req.ret;

        static_branch_enable(&sched_uclamp_used);

        mutex_lock(&uclamp_mutex);
        rcu_read_lock();
@@ -8118,4 +8441,7 @@ const u32 sched_prio_to_wmult[40] = {
 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

#undef CREATE_TRACE_POINTS
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
        trace_sched_update_nr_running_tp(rq, count);
}