Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Remove the unused per rq load array and all its infrastructure, by
   Dietmar Eggemann.

 - Add utilization clamping support by Patrick Bellasi. This is a
   refinement of the energy aware scheduling framework with support for
   boosting of interactive and capping of background workloads: to make
   sure critical GUI threads get maximum frequency ASAP, and to make
   sure background processing doesn't unnecessarily push the cpufreq
   governor to higher frequencies and less energy-efficient CPU modes.

 - Add the bare minimum of tracepoints required for LISA EAS regression
   testing, by Qais Yousef - which allows automated testing of various
   power management features, including energy aware scheduling.

 - Restructure the former tsk_nr_cpus_allowed() facility that the -rt
   kernel used to modify the scheduler's CPU affinity logic such as
   migrate_disable() - introduce the task->cpus_ptr value instead of
   taking the address of &task->cpus_allowed directly - by Sebastian
   Andrzej Siewior.

 - Misc optimizations, fixes, cleanups and small enhancements - see the
   Git log for details.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  sched/uclamp: Add uclamp support to energy_compute()
  sched/uclamp: Add uclamp_util_with()
  sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks
  sched/uclamp: Set default clamps for RT tasks
  sched/uclamp: Reset uclamp values on RESET_ON_FORK
  sched/uclamp: Extend sched_setattr() to support utilization clamping
  sched/core: Allow sched_setattr() to use the current policy
  sched/uclamp: Add system default clamps
  sched/uclamp: Enforce last task's UCLAMP_MAX
  sched/uclamp: Add bucket local max tracking
  sched/uclamp: Add CPU's clamp buckets refcounting
  sched/fair: Rename weighted_cpuload() to cpu_runnable_load()
  sched/debug: Export the newly added tracepoints
  sched/debug: Add sched_overutilized tracepoint
  sched/debug: Add new tracepoint to track PELT at se level
  sched/debug: Add new tracepoints to track PELT at rq level
  sched/debug: Add a new sched_trace_*() helper functions
  sched/autogroup: Make autogroup_path() always available
  sched/wait: Deduplicate code with do-while
  sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()
  ...
This commit is contained in:
Linus Torvalds
2019-07-08 16:39:53 -07:00
49 changed files with 1216 additions and 618 deletions

View File

@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
if (task_css_is_root(task, cpuset_cgrp_id))
return;
set_cpus_allowed_ptr(task, &current->cpus_allowed);
set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}

View File

@@ -898,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually

View File

@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
* All CPUs of a domain must have the same micro-architecture
* since they all share the same table.
*/
cap = arch_scale_cpu_capacity(NULL, cpu);
cap = arch_scale_cpu_capacity(cpu);
if (prev_cap && prev_cap != cap) {
pr_err("CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(span));

View File

@@ -259,7 +259,6 @@ out:
}
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SCHED_DEBUG
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif

View File

@@ -23,6 +23,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
*/
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -761,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
#ifdef CONFIG_UCLAMP_TASK
/*
 * Max allowed minimum utilization.
 * sysctl_sched_uclamp_handler() snapshots this value before calling
 * proc_dointvec() and restores it if the update is rejected; an accepted
 * change is mirrored into uclamp_default[UCLAMP_MIN].
 */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/*
 * Max allowed maximum utilization.
 * Handled like sysctl_sched_uclamp_util_min above; an accepted change is
 * mirrored into uclamp_default[UCLAMP_MAX].
 */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/*
 * All clamps are required to be less than or equal to these values:
 * uclamp_eff_get() returns the entry stored here whenever a task requests
 * a larger value for the same clamp index.
 */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

/*
 * Map a clamp value to the index of the bucket that tracks it.
 *
 * UCLAMP_BUCKET_DELTA is rounded to the closest integer, so it can end up
 * slightly smaller than the exact bucket width. In that case the largest
 * clamp values divide to UCLAMP_BUCKETS, one past the last valid index
 * (e.g. a value of 1024 with 20 buckets: delta is 51 and 1024 / 51 == 20).
 * Bucket arrays are walked from UCLAMP_BUCKETS - 1 downwards, so fold any
 * such overflow into the top bucket instead of indexing out of bounds.
 */
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	unsigned int bucket_id = clamp_value / UCLAMP_BUCKET_DELTA;

	if (bucket_id >= UCLAMP_BUCKETS)
		bucket_id = UCLAMP_BUCKETS - 1;

	return bucket_id;
}
/*
 * Return the lowest clamp value belonging to the bucket that @clamp_value
 * falls in, i.e. the bucket index scaled back by the bucket width.
 */
static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
{
	unsigned int idx = uclamp_bucket_id(clamp_value);

	return idx * UCLAMP_BUCKET_DELTA;
}
/*
 * Return the "no clamping" value for @clamp_id: 0 for UCLAMP_MIN and
 * SCHED_CAPACITY_SCALE for every other clamp index.
 */
static inline unsigned int uclamp_none(int clamp_id)
{
	return (clamp_id == UCLAMP_MIN) ? 0 : SCHED_CAPACITY_SCALE;
}
/*
 * Store a clamp request in @uc_se.
 *
 * @uc_se:        clamp descriptor to update
 * @value:        clamp value to record
 * @user_defined: whether the value is a task-specific request
 *
 * The bucket index is derived from @value so both always stay in sync.
 * No other field of @uc_se is written.
 */
static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->user_defined = user_defined;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->value = value;
}
/*
 * Pick the rq clamp value to use for @clamp_id when no bucket has tasks.
 *
 * For every clamp other than UCLAMP_MAX this is simply the "no clamp"
 * minimum. For UCLAMP_MAX the caller supplied @clamp_value is retained and
 * the rq is flagged with UCLAMP_FLAG_IDLE, so that blocked utilization
 * cannot push the frequency up once the max-clamp would otherwise drop.
 */
static inline unsigned int
uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
		  unsigned int clamp_value)
{
	if (clamp_id != UCLAMP_MAX)
		return uclamp_none(UCLAMP_MIN);

	/* Remember that the last known max-clamp is being held */
	rq->uclamp_flags |= UCLAMP_FLAG_IDLE;

	return clamp_value;
}
/*
 * Overwrite the rq clamp value for @clamp_id with @clamp_value, but only
 * while the rq is flagged UCLAMP_FLAG_IDLE (i.e. on idle exit). Without
 * the flag this is a no-op.
 */
static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
				     unsigned int clamp_value)
{
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
}
/*
 * Compute the rq-wide value for @clamp_id.
 *
 * Min and max clamps are both max aggregated, so the answer is the value
 * of the highest-indexed bucket that still has tasks. When every bucket is
 * empty, defer to uclamp_idle_value(), passing @clamp_value along.
 */
static inline
unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
				 unsigned int clamp_value)
{
	struct uclamp_bucket *buckets = rq->uclamp[clamp_id].bucket;
	int i;

	/* Scan from the top bucket down to the first populated one */
	for (i = UCLAMP_BUCKETS - 1; i >= 0; i--) {
		if (buckets[i].tasks)
			return buckets[i].value;
	}

	/* No tasks -- default clamp values */
	return uclamp_idle_value(rq, clamp_id, clamp_value);
}
/*
* The effective clamp bucket index of a task depends on, by increasing
* priority:
* - the task specific clamp value, when explicitly requested from userspace
* - the system default clamp value, defined by the sysadmin
*/
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
{
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */
if (unlikely(uc_req.value > uc_max.value))
return uc_max;
return uc_req;
}
unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
return p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
return uc_eff.value;
}
/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space, track
 * within each bucket the maximum value for tasks refcounted in it.
 * This "local max aggregation" allows to track the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
 *
 * @rq:       runqueue the task is being accounted on; rq->lock must be held
 * @p:        task whose clamp is being refcounted
 * @clamp_id: clamp index to account
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
unsigned int clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
lockdep_assert_held(&rq->lock);
/*
 * Update task effective clamp. uc_se points at this same slot, so every
 * uc_se read below sees the freshly computed effective value, and the
 * whole-struct copy must happen before 'active' is set further down.
 */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
bucket = &uc_rq->bucket[uc_se->bucket_id];
bucket->tasks++;
uc_se->active = true;
/* On idle exit, replace the retained rq clamp with this task's value */
uclamp_idle_reset(rq, clamp_id, uc_se->value);
/*
 * Local max aggregation: rq buckets always track the max
 * "requested" clamp value of its RUNNABLE tasks.
 * The first task entering an empty bucket always overwrites whatever
 * value the bucket held before.
 */
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
/* Max aggregation at rq level: only ever raise the rq value here */
if (uc_se->value > READ_ONCE(uc_rq->value))
WRITE_ONCE(uc_rq->value, uc_se->value);
}
/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 *
 * @rq:       runqueue the task was accounted on; rq->lock must be held
 * @p:        task whose clamp refcount is being dropped
 * @clamp_id: clamp index to release
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
unsigned int clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
unsigned int bkt_clamp;
unsigned int rq_clamp;
lockdep_assert_held(&rq->lock);
bucket = &uc_rq->bucket[uc_se->bucket_id];
/* Warn on an unbalanced dequeue, and never let the counter underflow */
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
uc_se->active = false;
/*
 * Keep "local max aggregation" simple and accept to (possibly)
 * overboost some RUNNABLE tasks in the same bucket.
 * The bucket value is not modified here: a stale value is simply
 * overwritten by the next enqueue that finds the bucket empty (see the
 * 'bucket->tasks == 1' test in uclamp_rq_inc_id()).
 */
if (likely(bucket->tasks))
return;
rq_clamp = READ_ONCE(uc_rq->value);
/*
 * Defensive programming: this should never happen. If it happens,
 * e.g. due to future modification, warn and fixup the expected value.
 */
SCHED_WARN_ON(bucket->value > rq_clamp);
/*
 * The emptied bucket was defining the rq clamp (or, unexpectedly, was
 * above it): recompute the rq value from the remaining buckets.
 */
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
WRITE_ONCE(uc_rq->value, bkt_clamp);
}
}
/*
 * Account all clamps of @p on @rq at enqueue time.
 *
 * Nothing is done for tasks whose scheduling class does not have
 * uclamp_enabled set. Once every clamp index has been accounted, the
 * UCLAMP_FLAG_IDLE marker on the rq is dropped if it was set.
 */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
	unsigned int id;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(id)
		uclamp_rq_inc_id(rq, p, id);

	/* Reset clamp idle holding when there is one RUNNABLE task */
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
/*
 * Release all clamps of @p from @rq at dequeue time. Tasks whose
 * scheduling class does not have uclamp_enabled set are ignored.
 */
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
	unsigned int id;

	if (likely(p->sched_class->uclamp_enabled)) {
		for_each_clamp_id(id)
			uclamp_rq_dec_id(rq, p, id);
	}
}
/*
 * sysctl handler for the system default utilization clamps.
 *
 * The request is forwarded to proc_dointvec(). For writes, the resulting
 * pair is then validated: min must not exceed max, and max must not exceed
 * SCHED_CAPACITY_SCALE. On any failure both sysctl values are put back to
 * what they were on entry; on a successful write, each value that changed
 * is copied into the matching uclamp_default[] entry.
 *
 * All of the above runs under a function-local mutex.
 *
 * Returns 0 on success, the proc_dointvec() error, or -EINVAL when the
 * written values fail validation.
 */
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos)
{
	static DEFINE_MUTEX(mutex);
	int saved_min, saved_max;
	int result;

	mutex_lock(&mutex);

	saved_min = sysctl_sched_uclamp_util_min;
	saved_max = sysctl_sched_uclamp_util_max;

	result = proc_dointvec(table, write, buffer, lenp, ppos);

	/* Only a successful write needs its new values sanity checked */
	if (!result && write &&
	    (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
	     sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE))
		result = -EINVAL;

	if (result) {
		/* Roll back whatever proc_dointvec() may have stored */
		sysctl_sched_uclamp_util_min = saved_min;
		sysctl_sched_uclamp_util_max = saved_max;
	} else if (write) {
		/*
		 * Updating all the RUNNABLE task is expensive, keep it simple
		 * and do just a lazy update at each next enqueue time.
		 */
		if (saved_min != sysctl_sched_uclamp_util_min) {
			uclamp_se_set(&uclamp_default[UCLAMP_MIN],
				      sysctl_sched_uclamp_util_min, false);
		}
		if (saved_max != sysctl_sched_uclamp_util_max) {
			uclamp_se_set(&uclamp_default[UCLAMP_MAX],
				      sysctl_sched_uclamp_util_max, false);
		}
	}

	mutex_unlock(&mutex);

	return result;
}
/*
 * Check the clamp values carried by @attr against the ones @p already
 * requests.
 *
 * Each bound is taken from @attr when the matching SCHED_FLAG_UTIL_CLAMP_*
 * flag is set, and from the task's current request otherwise. The
 * resulting pair is valid when min <= max <= SCHED_CAPACITY_SCALE.
 *
 * Returns 0 when valid, -EINVAL otherwise.
 */
static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	unsigned int util_min, util_max;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
		util_min = attr->sched_util_min;
	else
		util_min = p->uclamp_req[UCLAMP_MIN].value;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
		util_max = attr->sched_util_max;
	else
		util_max = p->uclamp_req[UCLAMP_MAX].value;

	if (util_min > util_max || util_max > SCHED_CAPACITY_SCALE)
		return -EINVAL;

	return 0;
}
/*
 * Update the clamp requests of @p as part of a scheduler parameter change.
 *
 * First, every request that is not user defined is reset to its default:
 * the "no clamp" value of its index, except that RT tasks get the maximum
 * value as their UCLAMP_MIN. Then, when @attr carries clamp flags, the
 * values it provides are stored as user defined requests.
 */
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	unsigned int id;

	/*
	 * On scheduling class change, reset to default clamps for tasks
	 * without a task-specific value.
	 */
	for_each_clamp_id(id) {
		struct uclamp_se *req = &p->uclamp_req[id];
		unsigned int dflt;

		/* Keep using defined clamps across class changes */
		if (req->user_defined)
			continue;

		/* By default, RT tasks always get 100% boost */
		if (unlikely(rt_task(p) && id == UCLAMP_MIN))
			dflt = uclamp_none(UCLAMP_MAX);
		else
			dflt = uclamp_none(id);

		uclamp_se_set(req, dflt, false);
	}

	/* Nothing more to do unless the caller supplied clamp values */
	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
		return;

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
			      attr->sched_util_min, true);

	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
			      attr->sched_util_max, true);
}
/*
 * Set up the utilization clamps of a freshly forked task @p.
 *
 * The child is not accounted on any rq yet, so every clamp is marked
 * inactive. When the task has sched_reset_on_fork set, its clamp requests
 * are additionally reset to the "no clamp" defaults and marked as not user
 * defined.
 */
static void uclamp_fork(struct task_struct *p)
{
	unsigned int clamp_id;

	for_each_clamp_id(clamp_id)
		p->uclamp[clamp_id].active = false;

	if (likely(!p->sched_reset_on_fork))
		return;

	/*
	 * No RT special-casing here: sched_fork() calls us before it reverts
	 * the child to the default priority/policy, so an rt_task() check at
	 * this point would look at the priority inherited from the parent
	 * and leave a child forked from an RT task with a 100% min clamp
	 * that no longer matches its (reset) policy.
	 */
	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&p->uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
	}
}
/*
 * Boot-time initialization of utilization clamping:
 * - clear the clamp state and flags of every possible CPU's runqueue,
 * - give init_task the "no clamp" request for each clamp index,
 * - set the system defaults of both indexes to the maximum clamp value.
 */
static void __init init_uclamp(void)
{
	struct uclamp_se uc_max = {};
	unsigned int clamp_id;
	int cpu;

	for_each_possible_cpu(cpu) {
		/*
		 * rq->uclamp holds one struct uclamp_rq per clamp index, so
		 * clear all UCLAMP_CNT of them rather than just the first.
		 */
		memset(&cpu_rq(cpu)->uclamp, 0,
		       sizeof(struct uclamp_rq) * UCLAMP_CNT);
		cpu_rq(cpu)->uclamp_flags = 0;
	}

	for_each_clamp_id(clamp_id) {
		uclamp_se_set(&init_task.uclamp_req[clamp_id],
			      uclamp_none(clamp_id), false);
	}

	/* System defaults allow max clamp values for both indexes */
	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
	for_each_clamp_id(clamp_id)
		uclamp_default[clamp_id] = uc_max;
}
#else /* CONFIG_UCLAMP_TASK */
/*
 * Utilization clamping compiled out: rq accounting, fork handling and
 * initialization become no-ops, and any attempt to validate clamp values
 * is rejected with -EOPNOTSUPP.
 */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
				  const struct sched_attr *attr)
{
	return -EOPNOTSUPP;
}
/* Marked inline like the other stubs in this group */
static inline void __setscheduler_uclamp(struct task_struct *p,
					 const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
@@ -771,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -784,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -930,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p))
@@ -1025,7 +1433,7 @@ static int migration_cpu_stop(void *data)
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_allowed
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
sched_ttwu_pending();
@@ -1056,7 +1464,7 @@ static int migration_cpu_stop(void *data)
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
cpumask_copy(&p->cpus_allowed, new_mask);
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
@@ -1126,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
if (cpumask_equal(&p->cpus_allowed, new_mask))
if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1286,10 +1694,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1331,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1479,7 +1887,7 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
* ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
@@ -1519,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, &p->cpus_allowed) {
for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
@@ -1570,7 +1978,7 @@ out:
}
/*
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1580,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
cpu = cpumask_any(&p->cpus_allowed);
cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
* to rely on ttwu() to place the task on a valid ->cpus_allowed
* to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
@@ -1991,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
* case the whole 'p->on_rq && ttwu_remote()' case below
* without taking any locks.
*
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
if (!(p->state & state))
return false;
success = 1;
cpu = task_cpu(p);
trace_sched_waking(p);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
goto out;
}
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -2000,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
goto out;
goto unlock;
trace_sched_waking(p);
@@ -2030,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
goto unlock;
#ifdef CONFIG_SMP
/*
@@ -2090,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
stat:
ttwu_stat(p, cpu, wake_flags);
out:
unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
if (success)
ttwu_stat(p, cpu, wake_flags);
return success;
}
@@ -2300,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
p->prio = current->normal_prio;
uclamp_fork(p);
/*
* Revert to default priority/policy on fork if requested.
*/
@@ -2395,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3033,7 +3467,6 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);
@@ -4071,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p,
static void __setscheduler(struct rq *rq, struct task_struct *p,
const struct sched_attr *attr, bool keep_boost)
{
/*
* If params can't change scheduling class changes aren't allowed
* either.
*/
if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
return;
__setscheduler_params(p, attr);
/*
@@ -4208,6 +4648,13 @@ recheck:
return retval;
}
/* Update task specific "requested" clamps */
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
retval = uclamp_validate(p, attr);
if (retval)
return retval;
}
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4237,6 +4684,8 @@ recheck:
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
goto change;
p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &rf);
@@ -4267,7 +4716,7 @@ change:
* the entire root_domain to become SCHED_DEADLINE. We
* will also fail if there's no bandwidth available.
*/
if (!cpumask_subset(span, &p->cpus_allowed) ||
if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -4317,7 +4766,9 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
__setscheduler(rq, p, attr, pi);
__setscheduler_uclamp(p, attr);
if (queued) {
/*
@@ -4493,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
if (ret)
return -EFAULT;
if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -4556,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if ((int)attr.sched_policy < 0)
return -EINVAL;
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
if (p != NULL)
retval = sched_setattr(p, &attr);
if (likely(p))
get_task_struct(p);
rcu_read_unlock();
if (likely(p)) {
retval = sched_setattr(p, &attr);
put_task_struct(p);
}
return retval;
}
@@ -4714,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
else
attr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
retval = sched_read_attr(uattr, &attr, size);
@@ -4866,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5123,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout)
}
EXPORT_SYMBOL(io_schedule_timeout);
void io_schedule(void)
void __sched io_schedule(void)
{
int token;
@@ -5443,7 +5910,7 @@ int task_can_attach(struct task_struct *p,
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_allowed may be changed.
* before cpus_mask may be changed.
*/
if (p->flags & PF_NO_SETAFFINITY) {
ret = -EINVAL;
@@ -5470,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5608,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
* Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
@@ -5902,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
int i;
wait_bit_init();
@@ -6005,10 +6472,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -6063,6 +6526,8 @@ void __init sched_init(void)
psi_init();
init_uclamp();
scheduler_running = 1;
}

View File

@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);

View File

@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type)
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p)
{
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*
* CFS and RT utilization can be boosted or capped, depending on
* utilization clamp constraints requested by currently RUNNABLE
* tasks.
* When there are no CFS RUNNABLE tasks, clamps are released and
* frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs;
util += cpu_util_rt(rq);
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
util = uclamp_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util = cpu_util_cfs(rq);
unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}
/**

View File

@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit

View File

@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
&curr->dl);
} else {
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
scaled_delta_exec = cap_scale(delta_exec, scale_freq);
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
!cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {

View File

@@ -233,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
umode_t mode, proc_handler *proc_handler,
bool load_idx)
umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
if (load_idx) {
entry->extra1 = &min_load_idx;
entry->extra2 = &max_load_idx;
}
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
struct ctl_table *table = sd_alloc_ctl_entry(9);
if (table == NULL)
return NULL;
set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
return table;
}
@@ -653,8 +639,6 @@ do { \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
SEQ_printf(m, " .%-30s: %lu\n", "load",
rq->load.weight);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
@@ -662,11 +646,6 @@ do { \
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
PN(clock_task);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
P(cpu_load[3]);
P(cpu_load[4]);
#undef P
#undef PN

File diff suppressed because it is too large Load Diff

View File

@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks

View File

@@ -28,6 +28,8 @@
#include "sched.h"
#include "pelt.h"
#include <trace/events/sched.h>
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
trace_pelt_se_tp(se);
return 1;
}
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
trace_pelt_se_tp(se);
return 1;
}
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
trace_pelt_cfs_tp(cfs_rq);
return 1;
}
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_rt, 1, 1);
trace_pelt_rt_tp(rq);
return 1;
}
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_dl, 1, 1);
trace_pelt_dl_tp(rq);
return 1;
}
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
* reflect the real amount of computation
*/
running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
/*
* We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
1,
1);
if (ret)
if (ret) {
___update_load_avg(&rq->avg_irq, 1, 1);
trace_pelt_irq_tp(rq);
}
return ret;
}

View File

@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
* Scale the elapsed time to reflect the real amount of
* computation
*/
delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
rq->clock_pelt += delta;

View File

@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
};
#ifdef CONFIG_RT_GROUP_SCHED

View File

@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
static const u32 runnable_avg_yN_inv[] = {
static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,

View File

@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
#else
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
u64 runtime_expires;
int expires_seq;
short idle;
short period_active;
u8 idle;
u8 period_active;
u8 distribute_running;
u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
bool distribute_running;
#endif
};
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
/*
* struct uclamp_bucket - Utilization clamp bucket
* @value: utilization clamp value for tasks on this clamp bucket
* @tasks: number of RUNNABLE tasks on this clamp bucket
*
* Keep track of how many tasks are RUNNABLE for a given utilization
* clamp value.
*/
struct uclamp_bucket {
/* Clamp value of this bucket; bitfield width is bits_per(SCHED_CAPACITY_SCALE). */
unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
/*
 * RUNNABLE task count for this bucket; takes the remaining bits, so the
 * two bitfield widths add up to BITS_PER_LONG.
 */
unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};
/*
* struct uclamp_rq - rq's utilization clamp
* @value: currently active clamp values for a rq
* @bucket: utilization clamp buckets affecting a rq
*
* Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
* A clamp value is affecting a rq when there is at least one task RUNNABLE
* (or actually running) with that value.
*
* There are up to UCLAMP_CNT possible different clamp values, currently there
* are only two: minimum utilization and maximum utilization.
*
* All utilization clamping values are MAX aggregated, since:
* - for util_min: we want to run the CPU at least at the max of the minimum
* utilization required by its currently RUNNABLE tasks.
* - for util_max: we want to allow the CPU to run up to the max of the
* maximum utilization allowed by its currently RUNNABLE tasks.
*
* Since on each system we expect only a limited number of different
* utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
* the metrics required to compute all the per-rq utilization clamp values.
*/
struct uclamp_rq {
/* Currently active clamp value for the rq; uclamp_util_with() reads it via READ_ONCE(). */
unsigned int value;
/* Per-clamp-value tracking buckets, UCLAMP_BUCKETS entries in total. */
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
#endif /* CONFIG_UCLAMP_TASK */
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -818,8 +854,6 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
/* capture load from *all* tasks on this CPU: */
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
unsigned int uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endif
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
struct sched_class {
const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
{
unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
}
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
* RUNNABLE tasks with _different_ clamps, we can end up with an
* inversion. Fix it now when the clamps are applied.
*/
if (unlikely(min_util >= max_util))
return min_util;
return clamp(util, min_util, max_util);
}
/*
 * Clamp @util against @rq's clamp values only: forwards to
 * uclamp_util_with() with a NULL task, so no per-task clamps are considered.
 */
static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
{
return uclamp_util_with(rq, util, NULL);
}
#else /* CONFIG_UCLAMP_TASK */
/*
 * !CONFIG_UCLAMP_TASK stub: no clamping is applied; @rq and @p are ignored
 * and @util is returned unchanged.
 */
static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
{
return util;
}
/*
 * !CONFIG_UCLAMP_TASK stub: no clamping is applied; @rq is ignored and
 * @util is returned unchanged.
 */
static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
{
return util;
}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
}
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2337,11 @@ enum schedutil_type {
ENERGY_UTIL,
};
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type);
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
{
unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
}
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p)
{
return cfs;
return 0;
}
#endif
#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)

View File

@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
.imbalance_pct = 125,
.cache_nice_tries = 0,
.busy_idx = 0,
.idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
/*
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
unsigned long cap;
/* Is there any asymmetry? */
cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
for_each_cpu(i, cpu_map) {
if (arch_scale_cpu_capacity(NULL, i) != cap) {
if (arch_scale_cpu_capacity(i) != cap) {
asym = true;
break;
}
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
* to everyone.
*/
for_each_cpu(i, cpu_map) {
unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
unsigned long max_capacity = arch_scale_cpu_capacity(i);
int tl_id = 0;
for_each_sd_topology(tl) {
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
for_each_cpu_and(j, tl->mask(i), cpu_map) {
unsigned long capacity;
capacity = arch_scale_cpu_capacity(NULL, j);
capacity = arch_scale_cpu_capacity(j);
if (capacity <= max_capacity)
continue;

View File

@@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
while (bookmark.flags & WQ_FLAG_BOOKMARK) {
do {
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
/**

View File

@@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
#ifdef CONFIG_UCLAMP_TASK
{
.procname = "sched_util_clamp_min",
.data = &sysctl_sched_uclamp_util_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_max",
.data = &sysctl_sched_uclamp_util_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",

View File

@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
cpu_load_update_nohz_stop();
/*
* Clear the timer idle flag, so we avoid IPIs on remote queueing and

View File

@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
* of this thread, then stop migrating for the duration
* of the current test.
*/
if (!cpumask_equal(current_mask, &current->cpus_allowed))
if (!cpumask_equal(current_mask, current->cpus_ptr))
goto disable;
get_online_cpus();