Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Introduce "Energy Aware Scheduling" - by Quentin Perret.

     This is a coherent topology description of CPUs in cooperation with
     the PM subsystem, with the goal to schedule more energy-efficiently
     on asymmetric SMP platforms - such as waking up tasks to the more
     energy-efficient CPUs first, as long as the system isn't
     oversubscribed.

     For details of the design, see:

       https://lore.kernel.org/lkml/20180724122521.22109-1-quentin.perret@arm.com/

   - Misc cleanups and smaller enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  sched/fair: Select an energy-efficient CPU on task wake-up
  sched/fair: Introduce an energy estimation helper function
  sched/fair: Add over-utilization/tipping point indicator
  sched/fair: Clean-up update_sg_lb_stats parameters
  sched/toplogy: Introduce the 'sched_energy_present' static key
  sched/topology: Make Energy Aware Scheduling depend on schedutil
  sched/topology: Disable EAS on inappropriate platforms
  sched/topology: Add lowest CPU asymmetry sched_domain level pointer
  sched/topology: Reference the Energy Model of CPUs when available
  PM: Introduce an Energy Model management framework
  sched/cpufreq: Prepare schedutil for Energy Aware Scheduling
  sched/topology: Relocate arch_scale_cpu_capacity() to the internal header
  sched/core: Remove unnecessary unlikely() in push_*_task()
  sched/topology: Remove the ::smt_gain field from 'struct sched_domain'
  sched: Fix various typos in comments
  sched/core: Clean up the #ifdef block in add_nr_running()
  sched/fair: Make some variables static
  sched/core: Create task_has_idle_policy() helper
  sched/fair: Add lsub_positive() and use it consistently
  sched/fair: Mask UTIL_AVG_UNCHANGED usages
  ...
kernel/power/Kconfig
@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF

config CPU_PM
        bool

config ENERGY_MODEL
        bool "Energy Model for CPUs"
        depends on SMP
        depends on CPU_FREQ
        default n
        help
          Several subsystems (thermal and/or the task scheduler for example)
          can leverage information about the energy consumed by CPUs to make
          smarter decisions. This config option enables the framework from
          which subsystems can access the energy models.

          The exact usage of the energy model is subsystem-dependent.

          If in doubt, say N.
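The help text above says other subsystems access the energy models through this framework. Purely as an illustration (this block is not part of the commit), here is a minimal consumer-side sketch, assuming the em_cpu_get() accessor and the em_perf_domain/em_cap_state layout introduced later in this series; the print_cpu_energy_model() helper name is invented, and the kHz/mW units follow the conventions this series uses for capacity states:

/* Illustration only - not part of this commit. */
#include <linux/energy_model.h>
#include <linux/printk.h>

/* Hypothetical helper: dump the capacity states registered for one CPU. */
static void print_cpu_energy_model(int cpu)
{
        /* em_cpu_get() pairs with the smp_store_release() done at registration. */
        struct em_perf_domain *pd = em_cpu_get(cpu);
        int i;

        if (!pd) {
                pr_info("CPU%d: no energy model registered\n", cpu);
                return;
        }

        for (i = 0; i < pd->nr_cap_states; i++)
                pr_info("CPU%d: freq=%lu kHz power=%lu mW cost=%lu\n", cpu,
                        pd->table[i].frequency, pd->table[i].power,
                        pd->table[i].cost);
}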
kernel/power/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS)  += wakelock.o

obj-$(CONFIG_MAGIC_SYSRQ)   += poweroff.o

obj-$(CONFIG_ENERGY_MODEL)  += energy_model.o
kernel/power/energy_model.c (new file, 201 lines)
@@ -0,0 +1,201 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Energy Model of CPUs
 *
 * Copyright (c) 2018, Arm ltd.
 * Written by: Quentin Perret, Arm ltd.
 */

#define pr_fmt(fmt) "energy_model: " fmt

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>

/* Mapping of each CPU to the performance domain to which it belongs. */
static DEFINE_PER_CPU(struct em_perf_domain *, em_data);

/*
 * Mutex serializing the registrations of performance domains and letting
 * callbacks defined by drivers sleep.
 */
static DEFINE_MUTEX(em_pd_mutex);
static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
                                                struct em_data_callback *cb)
{
        unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
        unsigned long power, freq, prev_freq = 0;
        int i, ret, cpu = cpumask_first(span);
        struct em_cap_state *table;
        struct em_perf_domain *pd;
        u64 fmax;

        if (!cb->active_power)
                return NULL;

        pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
        if (!pd)
                return NULL;

        table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
        if (!table)
                goto free_pd;

        /* Build the list of capacity states for this performance domain */
        for (i = 0, freq = 0; i < nr_states; i++, freq++) {
                /*
                 * active_power() is a driver callback which ceils 'freq' to
                 * lowest capacity state of 'cpu' above 'freq' and updates
                 * 'power' and 'freq' accordingly.
                 */
                ret = cb->active_power(&power, &freq, cpu);
                if (ret) {
                        pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
                        goto free_cs_table;
                }

                /*
                 * We expect the driver callback to increase the frequency for
                 * higher capacity states.
                 */
                if (freq <= prev_freq) {
                        pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
                        goto free_cs_table;
                }

                /*
                 * The power returned by active_state() is expected to be
                 * positive, in milli-watts and to fit into 16 bits.
                 */
                if (!power || power > EM_CPU_MAX_POWER) {
                        pr_err("pd%d: invalid power: %lu\n", cpu, power);
                        goto free_cs_table;
                }

                table[i].power = power;
                table[i].frequency = prev_freq = freq;

                /*
                 * The hertz/watts efficiency ratio should decrease as the
                 * frequency grows on sane platforms. But this isn't always
                 * true in practice so warn the user if a higher OPP is more
                 * power efficient than a lower one.
                 */
                opp_eff = freq / power;
                if (opp_eff >= prev_opp_eff)
                        pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
                                        cpu, i, i - 1);
                prev_opp_eff = opp_eff;
        }

        /* Compute the cost of each capacity_state. */
        fmax = (u64) table[nr_states - 1].frequency;
        for (i = 0; i < nr_states; i++) {
                table[i].cost = div64_u64(fmax * table[i].power,
                                          table[i].frequency);
        }

        pd->table = table;
        pd->nr_cap_states = nr_states;
        cpumask_copy(to_cpumask(pd->cpus), span);

        return pd;

free_cs_table:
        kfree(table);
free_pd:
        kfree(pd);

        return NULL;
}
/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
        return READ_ONCE(per_cpu(em_data, cpu));
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
 * em_register_perf_domain() - Register the Energy Model of a performance domain
 * @span : Mask of CPUs in the performance domain
 * @nr_states : Number of capacity states to register
 * @cb : Callback functions providing the data of the Energy Model
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
                                                struct em_data_callback *cb)
{
        unsigned long cap, prev_cap = 0;
        struct em_perf_domain *pd;
        int cpu, ret = 0;

        if (!span || !nr_states || !cb)
                return -EINVAL;

        /*
         * Use a mutex to serialize the registration of performance domains and
         * let the driver-defined callback functions sleep.
         */
        mutex_lock(&em_pd_mutex);

        for_each_cpu(cpu, span) {
                /* Make sure we don't register again an existing domain. */
                if (READ_ONCE(per_cpu(em_data, cpu))) {
                        ret = -EEXIST;
                        goto unlock;
                }

                /*
                 * All CPUs of a domain must have the same micro-architecture
                 * since they all share the same table.
                 */
                cap = arch_scale_cpu_capacity(NULL, cpu);
                if (prev_cap && prev_cap != cap) {
                        pr_err("CPUs of %*pbl must have the same capacity\n",
                                        cpumask_pr_args(span));
                        ret = -EINVAL;
                        goto unlock;
                }
                prev_cap = cap;
        }

        /* Create the performance domain and add it to the Energy Model. */
        pd = em_create_pd(span, nr_states, cb);
        if (!pd) {
                ret = -EINVAL;
                goto unlock;
        }

        for_each_cpu(cpu, span) {
                /*
                 * The per-cpu array can be read concurrently from em_cpu_get().
                 * The barrier enforces the ordering needed to make sure readers
                 * can only access well formed em_perf_domain structs.
                 */
                smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
        }

        pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
unlock:
        mutex_unlock(&em_pd_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(em_register_perf_domain);
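For illustration only (this block is not part of the commit), here is roughly how a hypothetical cpufreq-style driver could feed a fixed OPP table into em_register_perf_domain(). The my_opps[] table, my_active_power() and my_driver_register_em() names and values are invented; the callback follows the protocol described in em_create_pd() above (ceil *freq to the next supported state and report its power in milliwatts), with frequencies written in kHz on the assumption that the model follows cpufreq conventions:

/* Illustration only - not part of this commit. */
#include <linux/cpumask.h>
#include <linux/energy_model.h>
#include <linux/errno.h>
#include <linux/kernel.h>

/* Invented OPP table for a hypothetical platform. */
static const struct {
        unsigned long freq_khz;
        unsigned long power_mw;
} my_opps[] = {
        {  500000,  80 },
        { 1000000, 300 },
        { 1500000, 700 },
};

/* Ceil *freq to the next supported OPP and report that OPP's power. */
static int my_active_power(unsigned long *power, unsigned long *freq, int cpu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(my_opps); i++) {
                if (my_opps[i].freq_khz >= *freq) {
                        *freq = my_opps[i].freq_khz;
                        *power = my_opps[i].power_mw;
                        return 0;
                }
        }
        return -EINVAL;
}

static struct em_data_callback my_em_cb = {
        .active_power = my_active_power,
};

static int my_driver_register_em(cpumask_t *cpus)
{
        /* One capacity state per OPP; the callback may sleep (mutex held). */
        return em_register_perf_domain(cpus, ARRAY_SIZE(my_opps), &my_em_cb);
}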
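As a quick numeric check of the cost computation in em_create_pd() above (cost = fmax * power / freq), applied to the invented OPPs from the previous sketch:

/*
 * Worked example with the invented OPPs above (fmax = 1500000 kHz):
 *
 *   cost[0] = 1500000 *  80 /  500000 = 240
 *   cost[1] = 1500000 * 300 / 1000000 = 450
 *   cost[2] = 1500000 * 700 / 1500000 = 700
 *
 * The cost column increases with frequency exactly when the freq/power
 * efficiency ratio decreases, which is what the pr_warn() in
 * em_create_pd() checks for.
 */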