Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

   - Various NUMA scheduling updates: harmonize the load-balancer and
     NUMA placement logic to not work against each other. The intended
     result is better locality, better utilization and fewer migrations.

   - Introduce Thermal Pressure tracking and optimizations, to improve
     task placement on thermally overloaded systems.

   - Implement frequency invariant scheduler accounting on (some) x86
     CPUs. This is done by observing and sampling the 'recent' CPU
     frequency average at ~tick boundaries. The CPU provides this data
     via the APERF/MPERF MSRs. This hopefully makes our capacity
     estimates more precise and keeps tasks on the same CPU better even
     if it might seem overloaded at a lower momentary frequency. (As
     usual, turbo mode is a complication that we resolve by observing
     the maximum frequency and renormalizing to it.)

   - Add asymmetric CPU capacity wakeup scan to improve capacity
     utilization on asymmetric topologies. (big.LITTLE systems)

   - PSI fixes and optimizations.

   - RT scheduling capacity awareness fixes & improvements.

   - Optimize the CONFIG_RT_GROUP_SCHED constraints code.

   - Misc fixes, cleanups and optimizations - see the changelog for
     details"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits)
  threads: Update PID limit comment according to futex UAPI change
  sched/fair: Fix condition of avg_load calculation
  sched/rt: cpupri_find: Trigger a full search as fallback
  kthread: Do not preempt current task if it is going to call schedule()
  sched/fair: Improve spreading of utilization
  sched: Avoid scale real weight down to zero
  psi: Move PF_MEMSTALL out of task->flags
  MAINTAINERS: Add maintenance information for psi
  psi: Optimize switching tasks inside shared cgroups
  psi: Fix cpu.pressure for cpu.max and competing cgroups
  sched/core: Distribute tasks within affinity masks
  sched/fair: Fix enqueue_task_fair warning
  thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code
  sched/rt: Remove unnecessary push for unfit tasks
  sched/rt: Allow pulling unfitting task
  sched/rt: Optimize cpupri_find() on non-heterogenous systems
  sched/rt: Re-instate old behavior in select_task_rq_rt()
  sched/rt: cpupri_find: Implement fallback mechanism for !fit case
  sched/fair: Fix reordering of enqueue/dequeue_task_fair()
  sched/fair: Fix runnable_avg for throttled cfs
  ...
This commit is contained in:
Linus Torvalds
2020-03-30 17:01:51 -07:00
37 changed files with 1554 additions and 515 deletions

View File

@@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
static void init_freq_invariance(void);
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -183,6 +185,8 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
init_freq_invariance();
/*
* Get our bogomips.
* Update loops_per_jiffy in cpu_data. Previous call to
@@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
init_freq_invariance();
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1764,3 +1768,287 @@ void native_play_dead(void)
}
#endif
/*
* APERF/MPERF frequency ratio computation.
*
* The scheduler wants to do frequency invariant accounting and needs a <1
* ratio to account for the 'current' frequency, corresponding to
* freq_curr / freq_max.
*
* Since the frequency freq_curr on x86 is controlled by a micro-controller and
* our P-state setting is little more than a request/hint, we need to observe
* the effective frequency 'BusyMHz', i.e. the average frequency over a time
* interval after discarding idle time. This is given by:
*
* BusyMHz = delta_APERF / delta_MPERF * freq_base
*
* where freq_base is the max non-turbo P-state.
*
* The freq_max term has to be set to a somewhat arbitrary value, because we
* can't know which turbo states will be available at a given point in time:
* it all depends on the thermal headroom of the entire package. We set it to
* the turbo level with 4 cores active.
*
* Benchmarks show that's a good compromise between the 1C turbo ratio
* (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
* which would ignore the entire turbo range (a conspicuous part, making
* freq_curr/freq_max always maxed out).
*
* An exception to the heuristic above is the Atom uarch, where we choose the
* highest turbo level for freq_max since Atoms are generally oriented towards
* power efficiency.
*
* Setting freq_max to anything less than the 1C turbo ratio allows the ratio
* freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
/*
 * Static key that starts out false; init_freq_invariance() enables it once a
 * max frequency ratio has been determined.
 */
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
/*
 * Per-CPU APERF/MPERF readings taken at the previous sample. Seeded by
 * init_counter_refs() and refreshed by arch_scale_freq_tick().
 */
static DEFINE_PER_CPU(u64, arch_prev_aperf);
static DEFINE_PER_CPU(u64, arch_prev_mperf);
/*
 * turbo_freq / base_freq scaled by SCHED_CAPACITY_SCALE, as computed by
 * intel_set_max_freq_ratio(). Defaults to SCHED_CAPACITY_SCALE (ratio of 1).
 */
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
/*
 * Ratio actually used by arch_scale_freq_tick(): either SCHED_CAPACITY_SCALE
 * or arch_turbo_freq_ratio, as selected by arch_set_max_freq_ratio().
 */
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
/*
 * Select the max frequency ratio used to normalize the current frequency.
 *
 * @turbo_disabled: when true the ratio is SCHED_CAPACITY_SCALE (i.e. freq_max
 *                  equals freq_base); otherwise the previously computed
 *                  arch_turbo_freq_ratio is used.
 */
void arch_set_max_freq_ratio(bool turbo_disabled)
{
	if (turbo_disabled)
		arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
	else
		arch_max_freq_ratio = arch_turbo_freq_ratio;
}
/*
 * Report whether the turbo-disable bit is set in MSR_IA32_MISC_ENABLE.
 *
 * Returns false when the MSR cannot be read, i.e. an unreadable MSR is
 * treated as "turbo not disabled".
 */
static bool turbo_disabled(void)
{
	u64 misc_enable;

	if (rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_enable))
		return false;

	return (misc_enable & MSR_IA32_MISC_ENABLE_TURBO_DISABLE) != 0;
}
/*
 * Read base and turbo ratios from the Atom core ratio MSRs.
 *
 * @base_freq:  receives the max P-state ratio (bits 21:16 of
 *              MSR_ATOM_CORE_RATIOS).
 * @turbo_freq: receives the 1C turbo ratio (bits 5:0 of
 *              MSR_ATOM_CORE_TURBO_RATIOS).
 *
 * Returns true on success, false if either MSR read fails. The MSRs are read
 * straight into the output parameters, so on failure their contents are
 * whatever the failed/partial reads left behind.
 */
static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	/* Short-circuit keeps the original order: base first, then turbo. */
	if (rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq) ||
	    rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq))
		return false;

	*base_freq >>= 16;
	*base_freq &= 0x3F;		/* max P state */
	*turbo_freq &= 0x3F;		/* 1C turbo */

	return true;
}
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
/*
 * Build one x86_cpu_id table entry from positional initializers: Intel
 * vendor, the literal 6, the given model, the APERF/MPERF feature bit and a
 * trailing 0.
 * NOTE(review): presumably 6 is the CPU family and the trailing 0 is the
 * driver data -- this relies on the field order of struct x86_cpu_id, which
 * is declared elsewhere; confirm before touching that struct.
 */
#define ICPU(model) \
{X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0}
/* Models for which knl_set_max_freq_ratio() decodes the turbo ratio limits. */
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
ICPU(INTEL_FAM6_XEON_PHI_KNL),
ICPU(INTEL_FAM6_XEON_PHI_KNM),
{}
};
/*
 * Models for which intel_set_max_freq_ratio() uses skx_set_max_freq_ratio()
 * with a group size of 4.
 */
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
ICPU(INTEL_FAM6_SKYLAKE_X),
{}
};
/*
 * Models for which intel_set_max_freq_ratio() uses skx_set_max_freq_ratio()
 * with a group size of 1.
 */
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
ICPU(INTEL_FAM6_ATOM_GOLDMONT),
ICPU(INTEL_FAM6_ATOM_GOLDMONT_D),
ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS),
{}
};
/*
 * Derive base and turbo ratios on CPUs listed in has_knl_turbo_ratio_limits.
 *
 * MSR_TURBO_RATIO_LIMIT is decoded as: bits 15:8 hold the ratio of the first
 * group, and each following byte (starting at bit 16) carries a 3-bit ratio
 * decrement in its top three bits. The turbo ratio reported is the first
 * group's ratio lowered by the first @num_delta_fratio non-zero decrements.
 *
 * @base_freq:        receives the max P-state ratio (bits 15:8 of
 *                    MSR_PLATFORM_INFO).
 * @turbo_freq:       receives the selected turbo ratio.
 * @num_delta_fratio: number of non-zero ratio decrements to apply.
 *
 * Returns false if the CPU model does not match or an MSR read fails, true
 * otherwise. On every true return *turbo_freq has been written.
 */
static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
				int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	if (!x86_match_cpu(has_knl_turbo_ratio_limits))
		return false;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	    /* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	found = 0;

	/* Walk the per-group bytes until enough decrements have been applied. */
	for (i = 16; i < 64 && found < num_delta_fratio; i += 8) {
		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}
	}

	/*
	 * Always publish the result. Previously, when the requested decrement
	 * sat in the last group or the MSR held fewer non-zero decrements than
	 * requested, the function returned true without writing *turbo_freq
	 * and the caller went on to use an uninitialized value.
	 */
	*turbo_freq = fratio;
	return true;
}
/*
 * Derive base and turbo ratios from the grouped turbo ratio limit MSRs.
 *
 * MSR_TURBO_RATIO_LIMIT holds one ratio per byte and MSR_TURBO_RATIO_LIMIT1
 * holds the matching per-byte group size. The turbo ratio chosen is that of
 * the first group whose size is at least @size.
 *
 * @base_freq:  receives the max P-state ratio (bits 15:8 of
 *              MSR_PLATFORM_INFO).
 * @turbo_freq: receives the selected turbo ratio; left untouched when no
 *              group qualifies.
 * @size:       minimum group size to look for.
 *
 * Returns true when a qualifying group is found, false on MSR read failure
 * or when no group is large enough.
 */
static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int shift;

	if (rdmsrl_safe(MSR_PLATFORM_INFO, base_freq))
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	    /* max P state */

	if (rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios) ||
	    rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts))
		return false;

	shift = 0;
	while (shift < 64) {
		group_size = (counts >> shift) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> shift) & 0xFF;
			return true;
		}
		shift += 8;
	}

	return false;
}
/*
 * Generic fallback: read the base ratio from MSR_PLATFORM_INFO and the 4C
 * turbo ratio from MSR_TURBO_RATIO_LIMIT.
 *
 * @base_freq:  receives the max P-state ratio (bits 15:8).
 * @turbo_freq: receives the 4C turbo ratio (bits 31:24).
 *
 * Returns true on success, false if either MSR read fails. Both MSRs are
 * read directly into the output parameters before any masking is applied.
 */
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	if (rdmsrl_safe(MSR_PLATFORM_INFO, base_freq) ||
	    rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq))
		return false;

	*base_freq >>= 8;
	*base_freq &= 0xFF;		/* max P state */
	*turbo_freq >>= 24;
	*turbo_freq &= 0xFF;		/* 4C turbo */

	return true;
}
/*
 * Determine the turbo/base frequency ratio on Intel CPUs.
 *
 * The model-specific readers are tried in order; the first one that succeeds
 * supplies base_freq and turbo_freq. On success arch_turbo_freq_ratio is set
 * to turbo_freq / base_freq scaled by SCHED_CAPACITY_SCALE and
 * arch_max_freq_ratio is refreshed according to the current turbo-disable
 * state.
 *
 * Returns true when a usable ratio was computed, false otherwise.
 */
static bool intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * A successful MSR read does not guarantee meaningful contents: the
	 * ratio fields may read back as zero (NOTE(review): reportedly the
	 * case under some hypervisors -- confirm). A zero base_freq would be a
	 * division by zero right here; a zero turbo_freq would produce a zero
	 * arch_max_freq_ratio and thus a zero divisor in the tick path.
	 */
	if (!base_freq) {
		pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	if (!turbo_freq) {
		pr_debug("Couldn't determine cpu turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
					base_freq);
	arch_set_max_freq_ratio(turbo_disabled());
	return true;
}
/*
 * Record the calling CPU's current APERF and MPERF values as the reference
 * point for the next delta computation. Invoked on every CPU through
 * on_each_cpu(); @arg is unused.
 */
static void init_counter_refs(void *arg)
{
	u64 aperf_now, mperf_now;

	/* Sample both counters back to back before storing either. */
	rdmsrl(MSR_IA32_APERF, aperf_now);
	rdmsrl(MSR_IA32_MPERF, mperf_now);

	this_cpu_write(arch_prev_aperf, aperf_now);
	this_cpu_write(arch_prev_mperf, mperf_now);
}
/*
 * Set up frequency-invariant accounting.
 *
 * Only does work when called on CPU 0 and when the boot CPU advertises
 * APERF/MPERF. If the max frequency ratio can be determined (Intel only),
 * the per-CPU counter references are initialized on all CPUs and the
 * arch_scale_freq_key static branch is enabled; otherwise a debug message
 * is logged and the feature stays off.
 */
static void init_freq_invariance(void)
{
	if (smp_processor_id() != 0)
		return;

	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    !intel_set_max_freq_ratio()) {
		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
		return;
	}

	on_each_cpu(init_counter_refs, NULL, 1);
	static_branch_enable(&arch_scale_freq_key);
}
/*
 * Per-CPU frequency scale factor written by arch_scale_freq_tick(); starts
 * at SCHED_CAPACITY_SCALE and is clipped to that value on every update.
 */
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
u64 freq_scale;
u64 aperf, mperf;
u64 acnt, mcnt;
if (!arch_scale_freq_invariant())
return;
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
acnt = aperf - this_cpu_read(arch_prev_aperf);
mcnt = mperf - this_cpu_read(arch_prev_mperf);
if (!mcnt)
return;
this_cpu_write(arch_prev_aperf, aperf);
this_cpu_write(arch_prev_mperf, mperf);
acnt <<= 2*SCHED_CAPACITY_SHIFT;
mcnt *= arch_max_freq_ratio;
freq_scale = div64_u64(acnt, mcnt);
if (freq_scale > SCHED_CAPACITY_SCALE)
freq_scale = SCHED_CAPACITY_SCALE;
this_cpu_write(arch_freq_scale, freq_scale);
}