Merge tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Improve uclamp performance by using a static key for the fast path

 - Add the "sched_util_clamp_min_rt_default" sysctl, to optimize for
   better power efficiency of RT tasks on battery powered devices.
   (The default is to maximize performance & reduce RT latencies.)

 - Improve utime and stime tracking accuracy, which had a fixed boundary
   of error, which created larger and larger relative errors as the
   values become larger. This is now replaced with more precise
   arithmetics, using the new mul_u64_u64_div_u64() helper in math64.h.

 - Improve the deadline scheduler, such as making it capacity aware

 - Improve frequency-invariant scheduling

 - Misc cleanups in energy/power aware scheduling

 - Add sched_update_nr_running tracepoint to track changes to nr_running

 - Documentation additions and updates

 - Misc cleanups and smaller fixes

* tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/doc: Factorize bits between sched-energy.rst & sched-capacity.rst
  sched/doc: Document capacity aware scheduling
  sched: Document arch_scale_*_capacity()
  arm, arm64: Fix selection of CONFIG_SCHED_THERMAL_PRESSURE
  Documentation/sysctl: Document uclamp sysctl knobs
  sched/uclamp: Add a new sysctl to control RT default boost value
  sched/uclamp: Fix a deadlock when enabling uclamp static key
  sched: Remove duplicated tick_nohz_full_enabled() check
  sched: Fix a typo in a comment
  sched/uclamp: Remove unnecessary mutex_init()
  arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE
  sched: Cleanup SCHED_THERMAL_PRESSURE kconfig entry
  arch_topology, sched/core: Cleanup thermal pressure definition
  trace/events/sched.h: fix duplicated word
  linux/sched/mm.h: drop duplicated words in comments
  smp: Fix a potential usage of stale nr_cpus
  sched/fair: update_pick_idlest() Select group with lowest group_util when idle_cpus are equal
  sched: nohz: stop passing around unused "ticks" parameter.
  sched: Better document ttwu()
  sched: Add a tracepoint to track rq->nr_running
  ...
这个提交包含在:
Linus Torvalds
2020-08-03 14:58:38 -07:00
当前提交 e4cbce4d13
修改 48 个文件,包含 1521 行新增332 行删除

查看文件

@@ -2302,6 +2302,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);

查看文件

@@ -27,6 +27,7 @@
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(task, cpu_all_mask);
set_cpus_allowed_ptr(task,
housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
@@ -608,7 +610,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;

查看文件

@@ -6,6 +6,10 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
#include <linux/nospec.h>
@@ -23,9 +27,6 @@
#include "pelt.h"
#include "smp.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
@@ -36,6 +37,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -75,6 +79,100 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
/*
* Serialization rules:
*
* Lock order:
*
* p->pi_lock
* rq->lock
* hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
*
* rq1->lock
* rq2->lock where: rq1 < rq2
*
* Regular state:
*
* Normal scheduling state is serialized by rq->lock. __schedule() takes the
* local CPU's rq->lock, it optionally removes the task from the runqueue and
* always looks at the local rq data structures to find the most elegible task
* to run next.
*
* Task enqueue is also under rq->lock, possibly taken from another CPU.
* Wakeups from another LLC domain might use an IPI to transfer the enqueue to
* the local CPU to avoid bouncing the runqueue state around [ see
* ttwu_queue_wakelist() ]
*
* Task wakeup, specifically wakeups that involve migration, are horribly
* complicated to avoid having to take two rq->locks.
*
* Special state:
*
* System-calls and anything external will use task_rq_lock() which acquires
* both p->pi_lock and rq->lock. As a consequence the state they change is
* stable while holding either lock:
*
* - sched_setaffinity()/
* set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
* - set_user_nice(): p->se.load, p->*prio
* - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
* p->se.load, p->rt_priority,
* p->dl.dl_{runtime, deadline, period, flags, bw, density}
* - sched_setnuma(): p->numa_preferred_nid
* - sched_move_task()/
* cpu_cgroup_fork(): p->sched_task_group
* - uclamp_update_active() p->uclamp*
*
* p->state <- TASK_*:
*
* is changed locklessly using set_current_state(), __set_current_state() or
* set_special_state(), see their respective comments, or by
* try_to_wake_up(). This latter uses p->pi_lock to serialize against
* concurrent self.
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
* is set by activate_task() and cleared by deactivate_task(), under
* rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
* p->on_cpu <- { 0, 1 }:
*
* is set by prepare_task() and cleared by finish_task() such that it will be
* set before p is scheduled-in and cleared after p is scheduled-out, both
* under rq->lock. Non-zero indicates the task is running on its CPU.
*
* [ The astute reader will observe that it is possible for two tasks on one
* CPU to have ->on_cpu = 1 at the same time. ]
*
* task_cpu(p): is changed by set_task_cpu(), the rules are:
*
* - Don't call set_task_cpu() on a blocked task:
*
* We don't care what CPU we're not running on, this simplifies hotplug,
* the CPU assignment of blocked tasks isn't required to be valid.
*
* - for try_to_wake_up(), called under p->pi_lock:
*
* This allows try_to_wake_up() to only take one rq->lock, see its comment.
*
* - for migration called under rq->lock:
* [ see task_on_rq_migrating() in task_rq_lock() ]
*
* o move_queued_task()
* o detach_task()
*
* - for migration called under double_rq_lock():
*
* o __migrate_swap_task()
* o push_rt_task() / pull_rt_task()
* o push_dl_task() / pull_dl_task()
* o dl_task_offline_migration()
*
*/
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -791,9 +889,46 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/*
* By default RT tasks run at the maximum performance point/capacity of the
* system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
* SCHED_CAPACITY_SCALE.
*
* This knob allows admins to change the default behavior when uclamp is being
* used. In battery powered devices, particularly, running at the maximum
* capacity and frequency will increase energy consumption and shorten the
* battery life.
*
* This knob only affects RT tasks that their uclamp_se->user_defined == false.
*
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
/*
* This static key is used to reduce the uclamp overhead in the fast path. It
* primarily disables the call to uclamp_rq_{inc, dec}() in
* enqueue/dequeue_task().
*
* This allows users to continue to enable uclamp in their kernel config with
* minimum uclamp overhead in the fast path.
*
* As soon as userspace modifies any of the uclamp knobs, the static key is
* enabled, since we have an actual users that make use of uclamp
* functionality.
*
* The knobs that would enable this static key are:
*
* * A task modifying its uclamp value with sched_setattr().
* * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
* * An admin modifying the cgroup cpu.uclamp.{min, max}
*/
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -873,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
unsigned int default_util_min;
struct uclamp_se *uc_se;
lockdep_assert_held(&p->pi_lock);
uc_se = &p->uclamp_req[UCLAMP_MIN];
/* Only sync if user didn't override the default */
if (uc_se->user_defined)
return;
default_util_min = sysctl_sched_uclamp_util_min_rt_default;
uclamp_se_set(uc_se, default_util_min, false);
}
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
if (!rt_task(p))
return;
/* Protect updates to p->uclamp_* */
rq = task_rq_lock(p, &rf);
__uclamp_update_util_min_rt_default(p);
task_rq_unlock(rq, p, &rf);
}
static void uclamp_sync_util_min_rt_default(void)
{
struct task_struct *g, *p;
/*
* copy_process() sysctl_uclamp
* uclamp_min_rt = X;
* write_lock(&tasklist_lock) read_lock(&tasklist_lock)
* // link thread smp_mb__after_spinlock()
* write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
* sched_post_fork() for_each_process_thread()
* __uclamp_sync_rt() __uclamp_sync_rt()
*
* Ensures that either sched_post_fork() will observe the new
* uclamp_min_rt or for_each_process_thread() will observe the new
* task.
*/
read_lock(&tasklist_lock);
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
rcu_read_lock();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
rcu_read_unlock();
}
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -990,10 +1183,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
lockdep_assert_held(&rq->lock);
/*
* If sched_uclamp_used was enabled after task @p was enqueued,
* we could end up with unbalanced call to uclamp_rq_dec_id().
*
* In this case the uc_se->active flag should be false since no uclamp
* accounting was performed at enqueue time and we can just return
* here.
*
* Need to be careful of the following enqeueue/dequeue ordering
* problem too
*
* enqueue(taskA)
* // sched_uclamp_used gets enabled
* enqueue(taskB)
* dequeue(taskA)
* // Must not decrement bukcet->tasks here
* dequeue(taskB)
*
* where we could end up with stale data in uc_se and
* bucket[uc_se->bucket_id].
*
* The following check here eliminates the possibility of such race.
*/
if (unlikely(!uc_se->active))
return;
bucket = &uc_rq->bucket[uc_se->bucket_id];
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
uc_se->active = false;
/*
@@ -1021,6 +1242,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
* Avoid any overhead until uclamp is actually used by the userspace.
*
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1036,6 +1266,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
* Avoid any overhead until uclamp is actually used by the userspace.
*
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1114,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max;
int old_min, old_max, old_min_rt;
int result;
mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
@@ -1128,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
goto done;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
result = -EINVAL;
goto undo;
}
@@ -1144,8 +1386,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
update_root_tg = true;
}
if (update_root_tg)
if (update_root_tg) {
static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
static_branch_enable(&sched_uclamp_used);
uclamp_sync_util_min_rt_default();
}
/*
* We update all RUNNABLE tasks only when task groups are in use.
@@ -1158,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
mutex_unlock(&uclamp_mutex);
@@ -1180,6 +1430,15 @@ static int uclamp_validate(struct task_struct *p,
if (upper_bound > SCHED_CAPACITY_SCALE)
return -EINVAL;
/*
* We have valid uclamp attributes; make sure uclamp is enabled.
*
* We need to do that here, because enabling static branches is a
* blocking operation which obviously cannot be done while holding
* scheduler locks.
*/
static_branch_enable(&sched_uclamp_used);
return 0;
}
@@ -1194,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
*/
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
unsigned int clamp_value = uclamp_none(clamp_id);
/* Keep using defined clamps across class changes */
if (uc_se->user_defined)
continue;
/* By default, RT tasks always get 100% boost */
/*
* RT by default have a 100% boost value that could be modified
* at runtime.
*/
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
clamp_value = uclamp_none(UCLAMP_MAX);
__uclamp_update_util_min_rt_default(p);
else
uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
uclamp_se_set(uc_se, clamp_value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1225,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
* We don't need to hold task_rq_lock() when updating p->uclamp_* here
* as the task is still at its early fork stages.
*/
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1237,19 +1503,33 @@ static void uclamp_fork(struct task_struct *p)
}
}
static void uclamp_post_fork(struct task_struct *p)
{
uclamp_update_util_min_rt_default(p);
}
static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
struct uclamp_rq *uc_rq = rq->uclamp;
for_each_clamp_id(clamp_id) {
uc_rq[clamp_id] = (struct uclamp_rq) {
.value = uclamp_none(clamp_id)
};
}
rq->uclamp_flags = 0;
}
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
enum uclamp_id clamp_id;
int cpu;
mutex_init(&uclamp_mutex);
for_each_possible_cpu(cpu) {
memset(&cpu_rq(cpu)->uclamp, 0,
sizeof(struct uclamp_rq)*UCLAMP_CNT);
cpu_rq(cpu)->uclamp_flags = 0;
}
for_each_possible_cpu(cpu)
init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1278,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
@@ -1404,20 +1685,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
const struct sched_class *class;
if (p->sched_class == rq->curr->sched_class) {
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
} else {
for_each_class(class) {
if (class == rq->curr->sched_class)
break;
if (class == p->sched_class) {
resched_curr(rq);
break;
}
}
}
else if (p->sched_class > rq->curr->sched_class)
resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
@@ -1468,8 +1739,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -1477,8 +1747,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
return rq;
@@ -2243,12 +2512,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
}
/*
* Called in case the task @p isn't fully descheduled from its runqueue,
* in this case we must do a remote wakeup. Its a 'light' wakeup though,
* since all we need to do is flip p->state to TASK_RUNNING, since
* the task is still ->on_rq.
* Consider @p being inside a wait loop:
*
* for (;;) {
* set_current_state(TASK_UNINTERRUPTIBLE);
*
* if (CONDITION)
* break;
*
* schedule();
* }
* __set_current_state(TASK_RUNNING);
*
* between set_current_state() and schedule(). In this case @p is still
* runnable, so all that needs doing is change p->state back to TASK_RUNNING in
* an atomic manner.
*
* By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
* then schedule() must still happen and p->state can be changed to
* TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
* need to do a full wakeup with enqueue.
*
* Returns: %true when the wakeup is done,
* %false otherwise.
*/
static int ttwu_remote(struct task_struct *p, int wake_flags)
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
@@ -2389,6 +2677,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
#else /* !CONFIG_SMP */
static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
return false;
}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2396,10 +2692,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
#if defined(CONFIG_SMP)
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
#endif
rq_lock(rq, &rf);
update_rq_clock(rq);
@@ -2455,8 +2749,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
* 2) smp_cond_load_acquire(!X->on_cpu)
* 1) smp_store_release(X->on_cpu, 0) -- finish_task()
* 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
*
* Example:
*
@@ -2496,15 +2790,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
* If (@state & @p->state) @p->state = TASK_RUNNING.
* Conceptually does:
*
* If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
* Atomic against schedule() which would dequeue a task, also see
* set_current_state().
* This function is atomic against schedule() which would dequeue the task.
*
* This function executes a full memory barrier before accessing the task
* state; see set_current_state().
* It issues a full memory barrier before accessing @p->state, see the comment
* with set_current_state().
*
* Uses p->pi_lock to serialize against concurrent wake-ups.
*
* Relies on p->pi_lock stabilizing:
* - p->sched_class
* - p->cpus_ptr
* - p->sched_task_group
* in order to do migration, see its use of select_task_rq()/set_task_cpu().
*
* Tries really hard to only take one task_rq(p)->lock for performance.
* Takes rq->lock in:
* - ttwu_runnable() -- old rq, unavoidable, see comment there;
* - ttwu_queue() -- new rq, for enqueue of the task;
* - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
*
* As a consequence we race really badly with just about everything. See the
* many memory barriers and their comments for details.
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
@@ -2520,7 +2832,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
* case the whole 'p->on_rq && ttwu_remote()' case below
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
* In particular:
@@ -2541,8 +2853,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
@@ -2577,7 +2889,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* A similar smb_rmb() lives in try_invoke_on_locked_down_task().
*/
smp_rmb();
if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
if (p->in_iowait) {
@@ -2990,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
void sched_post_fork(struct task_struct *p)
{
uclamp_post_fork(p);
}
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -3147,8 +3464,10 @@ static inline void prepare_task(struct task_struct *next)
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
* See the ttwu() WF_ON_CPU case and its ordering comment.
*/
next->on_cpu = 1;
WRITE_ONCE(next->on_cpu, 1);
#endif
}
@@ -3156,8 +3475,9 @@ static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
* After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* This must be the very last reference to @prev from this CPU. After
* p->on_cpu is cleared, the task can be moved to a different CPU. We
* must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
@@ -3656,17 +3976,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
DEFINE_PER_CPU(unsigned long, thermal_pressure);
void arch_set_thermal_pressure(struct cpumask *cpus,
unsigned long th_pressure)
{
int cpu;
for_each_cpu(cpu, cpus)
WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
}
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -4029,8 +4338,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those loose the
* opportunity to pull in more work from other CPUs.
*/
if (likely((prev->sched_class == &idle_sched_class ||
prev->sched_class == &fair_sched_class) &&
if (likely(prev->sched_class <= &fair_sched_class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5519,6 +5827,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
/*
* This could race with another potential updater, but this is fine
* because it'll correctly read the old or the new value. We don't need
* to guarantee who wins the race as long as it doesn't return garbage.
*/
kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
@@ -5876,7 +6189,7 @@ again:
if (task_running(p_rq, p) || p->state)
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
/*
@@ -6710,6 +7023,14 @@ void __init sched_init(void)
unsigned long ptr = 0;
int i;
/* Make sure the linker didn't screw up */
BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
&fair_sched_class + 1 != &rt_sched_class ||
&rt_sched_class + 1 != &dl_sched_class);
#ifdef CONFIG_SMP
BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
#endif
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7431,6 +7752,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
static_branch_enable(&sched_uclamp_used);
mutex_lock(&uclamp_mutex);
rcu_read_lock();
@@ -8118,4 +8441,7 @@ const u32 sched_prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
#undef CREATE_TRACE_POINTS
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}

查看文件

@@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
unsigned long cap, max_cap = 0;
int cpu, max_cpu = -1;
if (!static_branch_unlikely(&sched_asym_cpucapacity))
return 1;
/* Ensure the capacity of the CPUs fits the task. */
for_each_cpu(cpu, later_mask) {
if (!dl_task_fits_capacity(p, cpu)) {
cpumask_clear_cpu(cpu, later_mask);
cap = capacity_orig_of(cpu);
if (cap > max_cap ||
(cpu == task_cpu(p) && cap == max_cap)) {
max_cap = cap;
max_cpu = cpu;
}
}
}
if (cpumask_empty(later_mask))
cpumask_set_cpu(max_cpu, later_mask);
return 1;
} else {
int best_cpu = cpudl_maximum(cp);

查看文件

@@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}

查看文件

@@ -519,50 +519,6 @@ void account_idle_ticks(unsigned long ticks)
account_idle_time(cputime);
}
/*
* Perform (stime * rtime) / total, but avoid multiplication overflow by
* losing precision when the numbers are big.
*/
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
u64 scaled;
for (;;) {
/* Make sure "rtime" is the bigger of stime/rtime */
if (stime > rtime)
swap(rtime, stime);
/* Make sure 'total' fits in 32 bits */
if (total >> 32)
goto drop_precision;
/* Does rtime (and thus stime) fit in 32 bits? */
if (!(rtime >> 32))
break;
/* Can we just balance rtime/stime rather than dropping bits? */
if (stime >> 31)
goto drop_precision;
/* We can grow stime and shrink rtime and try to make them both fit */
stime <<= 1;
rtime >>= 1;
continue;
drop_precision:
/* We drop from rtime, it has more bits than stime */
rtime >>= 1;
total >>= 1;
}
/*
* Make sure gcc understands that this is a 32x32->64 multiply,
* followed by a 64/32->64 divide.
*/
scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
return scaled;
}
/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
goto update;
}
stime = scale_stime(stime, rtime, stime + utime);
stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
update:
/*

查看文件

@@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
if (cpumask_subset(rd->span, cpu_active_mask))
return cpumask_weight(rd->span);
cpus = 0;
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
static inline unsigned long __dl_bw_capacity(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
unsigned long cap = 0;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cap += capacity_orig_of(i);
return cap;
}
/*
* XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
* of the CPU the task is running on rather rd's \Sum CPU capacity.
*/
static inline unsigned long dl_bw_capacity(int i)
{
if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
} else {
return __dl_bw_capacity(i);
}
}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i)
{
return 1;
}
static inline unsigned long dl_bw_capacity(int i)
{
return SCHED_CAPACITY_SCALE;
}
#endif
static inline
@@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
* constrained deadline (deadine < period) might be awakened after the
* constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1604,6 +1643,7 @@ static int
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
bool select_rq;
struct rq *rq;
if (sd_flag != SD_BALANCE_WAKE)
@@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
if (unlikely(dl_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &curr->dl)) &&
(p->nr_cpus_allowed > 1)) {
select_rq = unlikely(dl_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &curr->dl)) &&
p->nr_cpus_allowed > 1;
/*
* Take the capacity of the CPU into account to
* ensure it fits the requirement of the task.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity))
select_rq |= !dl_task_fits_capacity(p, cpu);
if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -2430,8 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
}
const struct sched_class dl_sched_class = {
.next = &rt_sched_class,
const struct sched_class dl_sched_class
__attribute__((section("__dl_sched_class"))) = {
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -2551,11 +2600,12 @@ void sched_dl_do_global(void)
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
int cpus, err = -1, cpu = task_cpu(p);
struct dl_bw *dl_b = dl_bw_of(cpu);
unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
cpus = dl_bw_cpus(task_cpu(p));
cpus = dl_bw_cpus(cpu);
cap = dl_bw_capacity(cpu);
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
!__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
!__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2634,6 +2686,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_flags = dl_se->flags;
}
/*
* Default limits for DL period; on the top end we guard against small util
* tasks still getting rediculous long effective runtimes, on the bottom end we
* guard against timer DoS.
*/
unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
@@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
u64 period, max, min;
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr)
attr->sched_period & (1ULL << 63))
return false;
period = attr->sched_period;
if (!period)
period = attr->sched_deadline;
/* runtime <= deadline <= period (if period != 0) */
if ((attr->sched_period != 0 &&
attr->sched_period < attr->sched_deadline) ||
if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
if (period < min || period > max)
return false;
return true;
}
@@ -2715,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
unsigned long flags, cap;
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
int cpus, ret;
unsigned long flags;
int ret;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(dest_cpu);
overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
cap = dl_bw_capacity(dest_cpu);
overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
if (overflow) {
ret = -EBUSY;
} else {
@@ -2737,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
* We will free resources in the source root_domain
* later on (see set_cpus_allowed_dl()).
*/
int cpus = dl_bw_cpus(dest_cpu);
__dl_add(dl_b, p->dl.dl_bw, cpus);
ret = 0;
}
@@ -2769,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
bool dl_cpu_busy(unsigned int cpu)
{
unsigned long flags;
unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow;
int cpus;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(cpu);
overflow = __dl_overflow(dl_b, cpus, 0, 0);
cap = dl_bw_capacity(cpu);
overflow = __dl_overflow(dl_b, cap, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();

查看文件

@@ -22,8 +22,6 @@
*/
#include "sched.h"
#include <trace/events/sched.h>
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -3094,7 +3092,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#ifdef CONFIG_SMP
do {
u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
@@ -3440,16 +3438,18 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
u32 divider;
/* Nothing to update */
if (!delta)
return;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * divider;
@@ -3463,16 +3463,18 @@ static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
u32 divider;
/* Nothing to update */
if (!delta)
return;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
@@ -3500,7 +3502,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
@@ -3646,7 +3648,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq->removed.nr) {
unsigned long r;
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
@@ -3701,7 +3703,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3922,6 +3924,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
/*
@@ -3952,6 +3956,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4017,6 +4023,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
WRITE_ONCE(p->se.avg.util_est, ue);
trace_sched_util_est_se_tp(&p->se);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
@@ -5618,14 +5626,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
dequeue_throttle:
if (!se)
sub_nr_running(rq, 1);
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, 1);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
dequeue_throttle:
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -7161,7 +7169,7 @@ static void yield_task_fair(struct rq *rq)
set_skip_buddy(se);
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -8049,7 +8057,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -8081,7 +8089,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = scale_rt_capacity(sd, cpu);
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
@@ -8703,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_has_spare:
/* Select group with most idle CPUs */
if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
if (idlest_sgs->idle_cpus > sgs->idle_cpus)
return false;
/* Select group with lowest group_util */
if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
idlest_sgs->group_util <= sgs->group_util)
return false;
break;
}
@@ -10027,7 +10041,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
nohz.next_balance++;
/*
* Increase nohz.next_balance only when if full ilb is triggered but
* not if we only update stats.
*/
if (flags & NOHZ_BALANCE_KICK)
nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -10348,6 +10367,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -10368,14 +10395,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
return ret;
}
@@ -11118,8 +11137,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
const struct sched_class fair_sched_class
__attribute__((section("__fair_sched_class"))) = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -11292,3 +11311,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rd_span);
int sched_trace_rq_nr_running(struct rq *rq)
{
return rq ? rq->nr_running : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);

查看文件

@@ -453,11 +453,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_idle(struct rq *rq)
{
}
@@ -465,8 +460,8 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
const struct sched_class idle_sched_class
__attribute__((section("__idle_sched_class"))) = {
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
@@ -486,8 +481,6 @@ const struct sched_class idle_sched_class = {
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,

查看文件

@@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
HK_FLAG_MISC | HK_FLAG_KTHREAD;
return housekeeping_setup(str, flags);
}

查看文件

@@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { }
*
* Called from the global timer code.
*/
void calc_global_load(unsigned long ticks)
void calc_global_load(void)
{
unsigned long sample_window;
long active, delta;

查看文件

@@ -28,8 +28,6 @@
#include "sched.h"
#include "pelt.h"
#include <trace/events/sched.h>
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -83,8 +81,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -264,7 +260,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.

查看文件

@@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
static inline u32 get_pelt_divider(struct sched_avg *avg)
{
return LOAD_AVG_MAX - 1024 + avg->period_contrib;
}
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.

查看文件

@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
rcu_assign_pointer(group->poll_kworker, NULL);
rcu_assign_pointer(group->poll_task, NULL);
}
void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
/*
* Schedule polling if it's not already scheduled. It's safe to call even from
* hotpath because even though kthread_queue_delayed_work takes worker->lock
* spinlock that spinlock is never contended due to poll_scheduled atomic
* preventing such competition.
*/
/* Schedule polling if it's not already scheduled. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
struct kthread_worker *kworker;
struct task_struct *task;
/* Do not reschedule if already scheduled */
if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
/*
* Do not reschedule if already scheduled.
* Possible race with a timer scheduled after this check but before
* mod_timer below can be tolerated because group->polling_next_update
* will keep updates on schedule.
*/
if (timer_pending(&group->poll_timer))
return;
rcu_read_lock();
kworker = rcu_dereference(group->poll_kworker);
task = rcu_dereference(group->poll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
if (likely(kworker))
kthread_queue_delayed_work(kworker, &group->poll_work, delay);
else
atomic_set(&group->poll_scheduled, 0);
if (likely(task))
mod_timer(&group->poll_timer, jiffies + delay);
rcu_read_unlock();
}
static void psi_poll_work(struct kthread_work *work)
static void psi_poll_work(struct psi_group *group)
{
struct kthread_delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
u64 now;
dwork = container_of(work, struct kthread_delayed_work, work);
group = container_of(dwork, struct psi_group, poll_work);
atomic_set(&group->poll_scheduled, 0);
mutex_lock(&group->trigger_lock);
now = sched_clock();
@@ -623,6 +613,35 @@ out:
mutex_unlock(&group->trigger_lock);
}
static int psi_poll_worker(void *data)
{
struct psi_group *group = (struct psi_group *)data;
struct sched_param param = {
.sched_priority = 1,
};
sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
while (true) {
wait_event_interruptible(group->poll_wait,
atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
kthread_should_stop());
if (kthread_should_stop())
break;
psi_poll_work(group);
}
return 0;
}
static void poll_timer_fn(struct timer_list *t)
{
struct psi_group *group = from_timer(group, t, poll_timer);
atomic_set(&group->poll_wakeup, 1);
wake_up_interruptible(&group->poll_wait);
}
static void record_times(struct psi_group_cpu *groupc, int cpu,
bool memstall_tick)
{
@@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_lock(&group->trigger_lock);
if (!rcu_access_pointer(group->poll_kworker)) {
struct sched_param param = {
.sched_priority = 1,
};
struct kthread_worker *kworker;
if (!rcu_access_pointer(group->poll_task)) {
struct task_struct *task;
kworker = kthread_create_worker(0, "psimon");
if (IS_ERR(kworker)) {
task = kthread_create(psi_poll_worker, group, "psimon");
if (IS_ERR(task)) {
kfree(t);
mutex_unlock(&group->trigger_lock);
return ERR_CAST(kworker);
return ERR_CAST(task);
}
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
kthread_init_delayed_work(&group->poll_work,
psi_poll_work);
rcu_assign_pointer(group->poll_kworker, kworker);
atomic_set(&group->poll_wakeup, 0);
init_waitqueue_head(&group->poll_wait);
wake_up_process(task);
timer_setup(&group->poll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->poll_task, task);
}
list_add(&t->node, &group->triggers);
@@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref)
{
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
struct psi_group *group = t->group;
struct kthread_worker *kworker_to_destroy = NULL;
struct task_struct *task_to_destroy = NULL;
if (static_branch_likely(&psi_disabled))
return;
@@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref)
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->poll_min_period = period;
/* Destroy poll_kworker when the last trigger is destroyed */
/* Destroy poll_task when the last trigger is destroyed */
if (group->poll_states == 0) {
group->polling_until = 0;
kworker_to_destroy = rcu_dereference_protected(
group->poll_kworker,
task_to_destroy = rcu_dereference_protected(
group->poll_task,
lockdep_is_held(&group->trigger_lock));
rcu_assign_pointer(group->poll_kworker, NULL);
rcu_assign_pointer(group->poll_task, NULL);
}
}
@@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref)
/*
* Wait for both *trigger_ptr from psi_trigger_replace and
* poll_kworker RCUs to complete their read-side critical sections
* before destroying the trigger and optionally the poll_kworker
* poll_task RCUs to complete their read-side critical sections
* before destroying the trigger and optionally the poll_task
*/
synchronize_rcu();
/*
* Destroy the kworker after releasing trigger_lock to prevent a
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
* can no longer be found through group->poll_kworker.
* can no longer be found through group->poll_task.
* But it might have been already scheduled before
* that - deschedule it cleanly before destroying it.
*/
kthread_cancel_delayed_work_sync(&group->poll_work);
atomic_set(&group->poll_scheduled, 0);
kthread_destroy_worker(kworker_to_destroy);
del_timer_sync(&group->poll_timer);
kthread_stop(task_to_destroy);
}
kfree(t);
}

查看文件

@@ -2429,8 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
const struct sched_class rt_sched_class
__attribute__((section("__rt_sched_class"))) = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,

查看文件

@@ -67,6 +67,7 @@
#include <linux/tsacct_kern.h>
#include <asm/tlb.h>
#include <asm-generic/vmlinux.lds.h>
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
@@ -75,6 +76,8 @@
#include "cpupri.h"
#include "cpudeadline.h"
#include <trace/events/sched.h>
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
@@ -96,6 +99,7 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -310,11 +314,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
__dl_update(dl_b, -((s32)tsk_bw / cpus));
}
static inline
bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
u64 old_bw, u64 new_bw)
{
return dl_b->bw != -1 &&
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
}
/*
* Verify the fitness of task @p to run on @cpu taking into account the
* CPU original capacity and the runtime/deadline ratio of the task.
*
* The function will return true if the CPU original capacity of the
* @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
* task and false otherwise.
*/
static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
{
unsigned long cap = arch_scale_cpu_capacity(cpu);
return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
}
extern void init_dl_bw(struct dl_bw *dl_b);
@@ -862,6 +881,8 @@ struct uclamp_rq {
unsigned int value;
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
/*
@@ -1182,6 +1203,16 @@ struct rq_flags {
#endif
};
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
*
* This avoids code that has access to 'struct rq *rq' (basically everything in
* the scheduler) from accidentally unlocking the rq if they do not also have a
* copy of the (on-stack) 'struct rq_flags rf'.
*
* Also see Documentation/locking/lockdep-design.rst.
*/
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(&rq->lock);
@@ -1739,7 +1770,6 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
struct sched_class {
const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
@@ -1748,7 +1778,7 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
@@ -1796,7 +1826,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
};
} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
@@ -1810,17 +1840,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif
/* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[];
extern struct sched_class __end_sched_classes[];
#define sched_class_highest (__end_sched_classes - 1)
#define sched_class_lowest (__begin_sched_classes - 1)
#define for_class_range(class, _from, _to) \
for (class = (_from); class != (_to); class = class->next)
for (class = (_from); class != (_to); class--)
#define for_each_class(class) \
for_class_range(class, sched_class_highest, NULL)
for_class_range(class, sched_class_highest, sched_class_lowest)
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1930,12 +1961,7 @@ extern int __init sched_tick_offload_init(void);
*/
static inline void sched_update_tick_dependency(struct rq *rq)
{
int cpu;
if (!tick_nohz_full_enabled())
return;
cpu = cpu_of(rq);
int cpu = cpu_of(rq);
if (!tick_nohz_full_cpu(cpu))
return;
@@ -1955,6 +1981,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
if (trace_sched_update_nr_running_tp_enabled()) {
call_trace_sched_update_nr_running(rq, count);
}
#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1969,6 +1998,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
if (trace_sched_update_nr_running_tp_enabled()) {
call_trace_sched_update_nr_running(rq, count);
}
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
@@ -2016,6 +2049,16 @@ void arch_scale_freq_tick(void)
#endif
#ifndef arch_scale_freq_capacity
/**
* arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
* @cpu: the CPU in question.
*
* Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
*
* f_curr
* ------ * SCHED_CAPACITY_SCALE
* f_max
*/
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
@@ -2349,12 +2392,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
/**
* uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
* @rq: The rq to clamp against. Must not be NULL.
* @util: The util value to clamp.
* @p: The task to clamp against. Can be NULL if you want to clamp
* against @rq only.
*
* Clamps the passed @util to the max(@rq, @p) effective uclamp values.
*
* If sched_uclamp_used static key is disabled, then just return the util
* without any clamping since uclamp aggregation at the rq level in the fast
* path is disabled, rendering this operation a NOP.
*
* Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
* will return the correct effective uclamp value of the task even if the
* static key is disabled.
*/
static __always_inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
{
unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
unsigned long min_util;
unsigned long max_util;
if (!static_branch_likely(&sched_uclamp_used))
return util;
min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2371,6 +2437,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
return clamp(util, min_util, max_util);
}
/*
* When uclamp is compiled in, the aggregation at rq level is 'turned off'
* by default in the fast path and only gets turned on once userspace performs
* an operation that requires it.
*
* Returns true if userspace opted-in to use uclamp and aggregation at rq level
* hence is active.
*/
static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
#else /* CONFIG_UCLAMP_TASK */
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
@@ -2378,6 +2457,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
{
return util;
}
static inline bool uclamp_is_used(void)
{
return false;
}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef arch_scale_freq_capacity

查看文件

@@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
BUG(); /* how!?, what priority? */
}
static unsigned int
get_rr_interval_stop(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void update_curr_stop(struct rq *rq)
{
}
@@ -115,8 +109,8 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
.next = &dl_sched_class,
const struct sched_class stop_sched_class
__attribute__((section("__stop_sched_class"))) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
@@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = {
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,

查看文件

@@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;

查看文件

@@ -634,8 +634,7 @@ static int __init nrcpus(char *str)
{
int nr_cpus;
get_option(&str, &nr_cpus);
if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
nr_cpu_ids = nr_cpus;
return 0;

查看文件

@@ -1779,6 +1779,20 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
{
.procname = "sched_deadline_period_max_us",
.data = &sysctl_sched_dl_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_deadline_period_min_us",
.data = &sysctl_sched_dl_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_rr_timeslice_ms",
.data = &sysctl_sched_rr_timeslice,
@@ -1801,6 +1815,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
{
.procname = "sched_util_clamp_min_rt_default",
.data = &sysctl_sched_uclamp_util_min_rt_default,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{

查看文件

@@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64);
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
calc_global_load(ticks);
calc_global_load();
}
/**