Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

   - Various NUMA scheduling updates: harmonize the load-balancer and
     NUMA placement logic to not work against each other. The intended
     result is better locality, better utilization and fewer migrations.

   - Introduce Thermal Pressure tracking and optimizations, to improve
     task placement on thermally overloaded systems.

   - Implement frequency invariant scheduler accounting on (some) x86
     CPUs. This is done by observing and sampling the 'recent' CPU
     frequency average at ~tick boundaries. The CPU provides this data
     via the APERF/MPERF MSRs. This hopefully makes our capacity
     estimates more precise and keeps tasks on the same CPU better even
     if it might seem overloaded at a lower momentary frequency. (As
     usual, turbo mode is a complication that we resolve by observing
     the maximum frequency and renormalizing to it.)

   - Add asymmetric CPU capacity wakeup scan to improve capacity
     utilization on asymmetric topologies. (big.LITTLE systems)

   - PSI fixes and optimizations.

   - RT scheduling capacity awareness fixes & improvements.

   - Optimize the CONFIG_RT_GROUP_SCHED constraints code.

   - Misc fixes, cleanups and optimizations - see the changelog for
     details"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits)
  threads: Update PID limit comment according to futex UAPI change
  sched/fair: Fix condition of avg_load calculation
  sched/rt: cpupri_find: Trigger a full search as fallback
  kthread: Do not preempt current task if it is going to call schedule()
  sched/fair: Improve spreading of utilization
  sched: Avoid scale real weight down to zero
  psi: Move PF_MEMSTALL out of task->flags
  MAINTAINERS: Add maintenance information for psi
  psi: Optimize switching tasks inside shared cgroups
  psi: Fix cpu.pressure for cpu.max and competing cgroups
  sched/core: Distribute tasks within affinity masks
  sched/fair: Fix enqueue_task_fair warning
  thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code
  sched/rt: Remove unnecessary push for unfit tasks
  sched/rt: Allow pulling unfitting task
  sched/rt: Optimize cpupri_find() on non-heterogenous systems
  sched/rt: Re-instate old behavior in select_task_rq_rt()
  sched/rt: cpupri_find: Implement fallback mechanism for !fit case
  sched/fair: Fix reordering of enqueue/dequeue_task_fair()
  sched/fair: Fix runnable_avg for throttled cfs
  ...
This commit is contained in:
Linus Torvalds
2020-03-30 17:01:51 -07:00
37 changed files with 1554 additions and 515 deletions

View File

@@ -356,28 +356,30 @@ struct util_est {
} __attribute__((__aligned__(sizeof(u64))));
/*
* The load_avg/util_avg accumulates an infinite geometric series
* (see __update_load_avg() in kernel/sched/fair.c).
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
*
* [load_avg definition]
*
* load_avg = runnable% * scale_load_down(load)
*
* where runnable% is the time ratio that a sched_entity is runnable.
* For cfs_rq, it is the aggregated load_avg of all runnable and
* blocked sched_entities.
* [runnable_avg definition]
*
* runnable_avg = runnable% * SCHED_CAPACITY_SCALE
*
* [util_avg definition]
*
* util_avg = running% * SCHED_CAPACITY_SCALE
*
* where running% is the time ratio that a sched_entity is running on
* a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
* and blocked sched_entities.
* where runnable% is the time ratio that a sched_entity is runnable and
* running% the time ratio that a sched_entity is running.
*
* load_avg and util_avg don't direcly factor frequency scaling and CPU
* capacity scaling. The scaling is done through the rq_clock_pelt that
* is used for computing those signals (see update_rq_clock_pelt())
* For cfs_rq, they are the aggregated values of all runnable and blocked
* sched_entities.
*
* The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU
* capacity scaling. The scaling is done through the rq_clock_pelt that is used
* for computing those signals (see update_rq_clock_pelt())
*
* N.B., the above ratios (runnable% and running%) themselves are in the
* range of [0, 1]. To do fixed point arithmetics, we therefore scale them
@@ -401,11 +403,11 @@ struct util_est {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
u64 runnable_load_sum;
u64 runnable_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
struct util_est util_est;
} ____cacheline_aligned;
@@ -449,7 +451,6 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
@@ -470,6 +471,8 @@ struct sched_entity {
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
/* cached value of my_q->h_nr_running */
unsigned long runnable_weight;
#endif
#ifdef CONFIG_SMP
@@ -782,9 +785,12 @@ struct task_struct {
unsigned frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
/* to be used once the psi infrastructure lands upstream. */
unsigned use_memdelay:1;
#endif
#ifdef CONFIG_PSI
/* Stalled due to lack of memory */
unsigned in_memstall:1;
#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
@@ -1479,7 +1485,6 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */