Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next

Pull scheduler updates from Ingo Molnar:
 "The main scheduling related changes in this cycle were:

   - various sched/numa updates, for better performance

   - tree wide cleanup of open coded nice levels

   - nohz fix related to rq->nr_running use

   - cpuidle changes and continued consolidation to improve the
     kernel/sched/idle.c high level idle scheduling logic.  As part of
     this effort I pulled cpuidle driver changes from Rafael as well.

   - standardized idle polling amongst architectures

   - continued work on preparing better power/energy aware scheduling

   - sched/rt updates

   - misc fixlets and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits)
  sched/numa: Decay ->wakee_flips instead of zeroing
  sched/numa: Update migrate_improves/degrades_locality()
  sched/numa: Allow task switch if load imbalance improves
  sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code
  sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice()
  sched: Initialize rq->age_stamp on processor start
  sched, nohz: Change rq->nr_running to always use wrappers
  sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance()
  sched: Use clamp() and clamp_val() to make sys_nice() more readable
  sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups()
  sched/numa: Fix initialization of sched_domain_topology for NUMA
  sched: Call select_idle_sibling() when not affine_sd
  sched: Simplify return logic in sched_read_attr()
  sched: Simplify return logic in sched_copy_attr()
  sched: Fix exec_start/task_hot on migrated tasks
  arm64: Remove TIF_POLLING_NRFLAG
  metag: Remove TIF_POLLING_NRFLAG
  sched/idle: Make cpuidle_idle_call() void
  sched/idle: Reflow cpuidle_idle_call()
  sched/idle: Delay clearing the polling bit
  ...
This commit is contained in:
Linus Torvalds
2014-06-03 14:00:15 -07:00
48 changed files with 761 additions and 664 deletions

View File

@@ -521,6 +521,39 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */
/*
* cmpxchg based fetch_or, macro so it works for different integer types
*/
/*
 * fetch_or(ptr, val) - OR @val into *@ptr using a cmpxchg() retry loop.
 *
 * Evaluates to the value *@ptr held when the successful cmpxchg() was
 * applied, i.e. the contents just before @val was OR-ed in.  Written as a
 * macro so that typeof() adapts it to whatever integer type @ptr points at.
 * Note that @ptr and @val are expanded more than once.
 */
#define fetch_or(ptr, val)						\
({	typeof(*(ptr)) __prev, __expect = *(ptr);			\
	while ((__prev = cmpxchg((ptr), __expect,			\
				 __expect | (val))) != __expect)	\
		__expect = __prev;	/* lost the race: retry */	\
	__prev;								\
})
#ifdef TIF_POLLING_NRFLAG
/*
 * Set TIF_NEED_RESCHED on @p and, in the same atomic fetch_or() update,
 * sample TIF_POLLING_NRFLAG.  Doing both in one step avoids races against
 * polling state changes and thereby avoids spurious IPIs.
 *
 * Returns true if the polling flag was clear in the flags word observed
 * by that update.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) seen = fetch_or(&ti->flags, _TIF_NEED_RESCHED);

	return (seen & _TIF_POLLING_NRFLAG) == 0;
}
#else
/*
 * No TIF_POLLING_NRFLAG is defined: just mark @p as needing a reschedule
 * and always report "not polling".
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);

	return true;
}
#endif
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
if (test_tsk_need_resched(p))
return;
set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(p);
set_preempt_need_resched();
return;
}
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(p))
if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
}
@@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
if (increment < -40)
increment = -40;
if (increment > 40)
increment = 40;
increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
if (nice < MIN_NICE)
nice = MIN_NICE;
if (nice > MAX_NICE)
nice = MAX_NICE;
nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
out:
return ret;
return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
ret = -E2BIG;
goto out;
return -E2BIG;
}
/**
@@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
for (; addr < end; addr++) {
if (*addr)
goto err_size;
return -EFBIG;
}
attr->size = usize;
@@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
if (ret)
return -EFAULT;
out:
return ret;
err_size:
ret = -E2BIG;
goto out;
return 0;
}
/**
@@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
/*
 * Record the current CPU's sched_clock_cpu() reading in the age_stamp of
 * that CPU's runqueue.
 */
static void __cpuinit set_cpu_rq_start_time(void)
{
	int this_cpu = smp_processor_id();

	cpu_rq(this_cpu)->age_stamp = sched_clock_cpu(this_cpu);
}
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
set_cpu_rq_start_time();
return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES)) {
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING);
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
struct sched_group_power **__percpu sgp;
};
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -5633,21 +5651,6 @@ enum s_alloc {
sa_none,
};
struct sched_domain_topology_level;
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
#define SDTL_OVERLAP 0x01
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
int flags;
int numa_level;
struct sd_data data;
};
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
continue;
group = get_group(i, sdd, &sg);
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
@@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
}
int __weak arch_sd_sibling_asym_packing(void)
{
return 0*SD_ASYM_PACKING;
}
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type) sd->name = #type
#else
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
#define SD_INIT_FUNC(type) \
static noinline struct sched_domain * \
sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
{ \
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
*sd = SD_##type##_INIT; \
SD_INIT_NAME(sd, type); \
sd->private = &tl->data; \
return sd; \
}
SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_SMT
SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
SD_INIT_FUNC(MC)
#endif
#ifdef CONFIG_SCHED_BOOK
SD_INIT_FUNC(BOOK)
#endif
static int default_relax_domain_level = -1;
int sched_domain_level_max;
@@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
#ifdef CONFIG_SCHED_SMT
static const struct cpumask *cpu_smt_mask(int cpu)
{
return topology_thread_cpumask(cpu);
}
#endif
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
{ sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
{ sd_init_BOOK, cpu_book_mask, },
#endif
{ sd_init_CPU, cpu_cpu_mask, },
{ NULL, },
};
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->init; tl++)
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif
static inline int sd_local_flags(int level)
{
if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
return 0;
return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
}
/*
* SD_flags allowed in topology descriptions.
*
* SD_SHARE_CPUPOWER - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUPOWER | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
SD_SHARE_POWERDOMAIN)
/*
 * Fill in the per-cpu sched_domain of topology level @tl for @cpu: start
 * from generic defaults, OR in the flags supplied by tl->sd_flags(), then
 * tune the balancing parameters according to the resulting flags.
 */
static struct sched_domain *
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
int level = tl->numa_level;
int sd_weight = cpumask_weight(
sched_domains_numa_masks[level][cpu_to_node(cpu)]);
int sd_weight, sd_flags = 0;
#ifdef CONFIG_NUMA
/*
* Ugly hack to pass state to sd_numa_mask()...
*/
sched_domains_curr_level = tl->numa_level;
#endif
sd_weight = cpumask_weight(tl->mask(cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
/*
* Warn about any flag a topology description is not allowed to set and
* drop it, keeping only the bits permitted by TOPOLOGY_SD_FLAGS.
*/
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
sd_flags &= TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
.cache_nice_tries = 2,
.busy_idx = 3,
.idle_idx = 2,
.cache_nice_tries = 0,
.busy_idx = 0,
.idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
| 0*SD_BALANCE_EXEC
| 0*SD_BALANCE_FORK
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 0*SD_WAKE_AFFINE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 1*SD_NUMA
| sd_local_flags(level)
| 0*SD_NUMA
| sd_flags
,
.last_balance = jiffies,
.balance_interval = sd_weight,
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
};
SD_INIT_NAME(sd, NUMA);
sd->private = &tl->data;
/*
* Ugly hack to pass state to sd_numa_mask()...
* Convert topological properties into behaviour.
*/
sched_domains_curr_level = tl->numa_level;
if (sd->flags & SD_SHARE_CPUPOWER) {
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
}
#endif
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
sd->private = &tl->data;
return sd;
}
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
struct sched_domain_topology_level *sched_domain_topology = default_topology;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
/*
 * set_sched_topology - install @tl as the scheduler's topology level table.
 * @tl: array of topology levels; for_each_sd_topology() walks it until it
 *      reaches an entry whose ->mask is NULL.
 *
 * Only the sched_domain_topology pointer is updated here.
 */
void set_sched_topology(struct sched_domain_topology_level *tl)
{
sched_domain_topology = tl;
}
#ifdef CONFIG_NUMA
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
}
}
tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
tl = kzalloc((i + level + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
/*
* Copy the default topology bits..
*/
for (i = 0; default_topology[i].init; i++)
tl[i] = default_topology[i];
for (i = 0; sched_domain_topology[i].mask; i++)
tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.init = sd_numa_init,
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
SD_INIT_NAME(NUMA)
};
}
@@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
struct sched_domain *sd = tl->init(tl, cpu);
struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
@@ -6974,6 +7001,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time();
#endif
init_sched_fair_class();

View File

@@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* We need to take care of possible races here. In fact, the
* task might have changed its scheduling policy to something
* different from SCHED_DEADLINE or changed its reservation
* parameters (through sched_setscheduler()).
* parameters (through sched_setattr()).
*/
if (!dl_task(p) || dl_se->dl_new)
goto unlock;
@@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
inc_nr_running(rq_of_dl_rq(dl_rq));
add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
dec_nr_running(rq_of_dl_rq(dl_rq));
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);

View File

@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
}
/*
 * load_too_imbalanced - would a NUMA move/swap leave the loads too uneven?
 * @orig_src_load: load on the source before the move
 * @orig_dst_load: load on the destination before the move
 * @src_load: load on the source after the move
 * @dst_load: load on the destination after the move
 * @env: supplies imbalance_pct, the tolerated ratio in percent
 *
 * Returns false when the post-move loads are within the imbalance_pct
 * threshold, or when they exceed it by no more than the pre-move loads
 * already did.  Returns true only when the move leaves the loads further
 * past the threshold than they were before, so the caller should reject it.
 */
static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
				long src_load, long dst_load,
				struct task_numa_env *env)
{
	long imb, old_imb;

	/* We care about the slope of the imbalance, not the direction. */
	if (dst_load < src_load)
		swap(dst_load, src_load);

	/* Is the difference below the threshold? */
	imb = dst_load * 100 - src_load * env->imbalance_pct;
	if (imb <= 0)
		return false;

	/*
	 * The imbalance is above the allowed threshold.
	 * Compare it with the old imbalance.
	 */
	if (orig_dst_load < orig_src_load)
		swap(orig_dst_load, orig_src_load);

	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;

	/* Would this change make things worse? */
	return imb > old_imb;
}
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source task was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
long dst_load, src_load;
long orig_src_load, src_load;
long orig_dst_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
* In the overloaded case, try and keep the load balanced.
*/
balance:
dst_load = env->dst_stats.load;
src_load = env->src_stats.load;
orig_dst_load = env->dst_stats.load;
orig_src_load = env->src_stats.load;
/* XXX missing power terms */
load = task_h_load(env->p);
dst_load += load;
src_load -= load;
dst_load = orig_dst_load + load;
src_load = orig_src_load - load;
if (cur) {
load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
src_load += load;
}
/* make src_load the smaller */
if (dst_load < src_load)
swap(dst_load, src_load);
if (src_load * env->imbalance_pct < dst_load * 100)
if (load_too_imbalanced(orig_src_load, orig_dst_load,
src_load, dst_load, env))
goto unlock;
assign:
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
sched_setnuma(p, env.dst_nid);
/*
* If the task is part of a workload that spans multiple NUMA nodes,
* and is migrating into one of the workload's active nodes, remember
* this node as the task's preferred numa node, so the workload can
* settle down.
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
sched_setnuma(p, env.dst_nid);
/*
* Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;
/* Periodically retry migrating the task to the preferred node */
p->numa_migrate_retry = jiffies + HZ;
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
int priv;
if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
/*
* If a workload spans multiple NUMA nodes, a shared fault that
* occurs wholly within the set of nodes that the workload is
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
if (!priv && !local && p->numa_group &&
node_isset(cpu_node, p->numa_group->active_nodes) &&
node_isset(mem_node, p->numa_group->active_nodes))
local = 1;
task_numa_placement(p);
/*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
p->numa_faults_locality[local] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
rq->nr_running -= task_delta;
sub_nr_running(rq, task_delta);
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
rq->nr_running += task_delta;
add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
update_rq_runnable_avg(rq, rq->nr_running);
inc_nr_running(rq);
add_nr_running(rq, 1);
}
hrtick_update(rq);
}
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
if (!se) {
dec_nr_running(rq);
sub_nr_running(rq, 1);
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
* about the loss.
*/
if (jiffies > current->wakee_flip_decay_ts + HZ) {
current->wakee_flips = 0;
current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
sd = tmp;
}
if (affine_sd) {
if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
prev_cpu = cpu;
if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
prev_cpu = cpu;
if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
atomic_long_add(se->avg.load_avg_contrib,
&cfs_rq->removed_load);
}
/* We have migrated, no longer consider this task hot */
se->exec_start = 0;
}
#endif /* CONFIG_SMP */
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
/* Returns true if the destination node has incurred more faults */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
/* Always encourage migration to the preferred node. */
if (numa_group) {
/* Task is already in the group's interleave set. */
if (node_isset(src_nid, numa_group->active_nodes))
return false;
/* Task is moving into the group's interleave set. */
if (node_isset(dst_nid, numa_group->active_nodes))
return true;
return group_faults(p, dst_nid) > group_faults(p, src_nid);
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
return true;
/* If both task and group weight improve, this move is a winner. */
if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
group_weight(p, dst_nid) > group_weight(p, src_nid))
return true;
return false;
return task_faults(p, dst_nid) > task_faults(p, src_nid);
}
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
if (numa_group) {
/* Task is moving within/into the group's interleave set. */
if (node_isset(dst_nid, numa_group->active_nodes))
return false;
/* Task is moving out of the group's interleave set. */
if (node_isset(src_nid, numa_group->active_nodes))
return true;
return group_faults(p, dst_nid) < group_faults(p, src_nid);
}
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid)
return true;
/* If either task or group weight get worse, don't do it. */
if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
group_weight(p, dst_nid) < group_weight(p, src_nid))
return true;
return false;
return task_faults(p, dst_nid) < task_faults(p, src_nid);
}
#else
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
s64 delta;
/*
* Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
total = sched_avg_period() + (rq_clock(rq) - age_stamp);
delta = rq_clock(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
total = sched_avg_period() + delta;
if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
@@ -6640,17 +6714,44 @@ out:
return ld_moved;
}
/*
 * Compute the balance interval of @sd in jiffies.
 *
 * sd->balance_interval (in ms) is multiplied by sd->busy_factor when
 * @cpu_busy is non-zero, converted to jiffies, and then clamped to the
 * range [1, max_load_balance_interval].
 */
static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
	unsigned long ms = sd->balance_interval;
	unsigned long ticks;

	if (cpu_busy)
		ms *= sd->busy_factor;

	/* scale ms to jiffies */
	ticks = msecs_to_jiffies(ms);

	return clamp(ticks, 1UL, max_load_balance_interval);
}
/*
 * Pull *@next_balance earlier if @sd is due for balancing before it.
 *
 * The due time of @sd is sd->last_balance plus its balance interval (see
 * get_sd_balance_interval()); *@next_balance is only ever moved backwards.
 */
static inline void
update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
{
	unsigned long due = sd->last_balance +
			    get_sd_balance_interval(sd, cpu_busy);

	if (time_after(*next_balance, due))
		*next_balance = due;
}
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
static int idle_balance(struct rq *this_rq)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
u64 curr_cost = 0;
int this_cpu = this_rq->cpu;
idle_enter_fair(this_rq);
@@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
if (this_rq->avg_idle < sysctl_sched_migration_cost)
if (this_rq->avg_idle < sysctl_sched_migration_cost) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, 0, &next_balance);
rcu_read_unlock();
goto out;
}
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int continue_balancing = 1;
u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, 0, &next_balance);
break;
}
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
curr_cost += domain_cost;
}
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
if (pulled_task)
update_next_balance(sd, 0, &next_balance);
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
if (pulled_task || this_rq->nr_running > 0)
break;
}
rcu_read_unlock();
@@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
* We are going idle. next_balance may be set based on
* a busy processor. So reset next_balance.
*/
this_rq->next_balance = next_balance;
}
out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
((this_rq->stop && this_rq->stop->on_rq) ||
this_rq->dl.dl_nr_running ||
(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
interval = sd->balance_interval;
if (idle != CPU_IDLE)
interval *= sd->busy_factor;
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);

View File

@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
* return non-zero on failure
*/
static int cpuidle_idle_call(void)
static void cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state, ret;
int next_state, entered_state;
bool broadcast;
/*
* Check if the idle task must be rescheduled. If it is the
* case, exit the function after re-enabling the local irq and
* set again the polling flag
* case, exit the function after re-enabling the local irq.
*/
if (current_clr_polling_and_test()) {
if (need_resched()) {
local_irq_enable();
__current_set_polling();
return 0;
return;
}
/*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
rcu_idle_enter();
/*
* Check if the cpuidle framework is ready, otherwise fallback
* to the default arch specific idle method
* Ask the cpuidle framework to choose a convenient idle state.
* Fall back to the default arch idle method on errors.
*/
ret = cpuidle_enabled(drv, dev);
if (!ret) {
next_state = cpuidle_select(drv, dev);
if (next_state < 0) {
use_default:
/*
* Ask the governor to choose an idle state it thinks
* it is convenient to go to. There is *always* a
* convenient idle state
* We can't use the cpuidle framework, let's use the default
* idle routine.
*/
next_state = cpuidle_select(drv, dev);
/*
* The idle task must be scheduled, it is pointless to
* go to idle, just update no idle residency and get
* out of this function
*/
if (current_clr_polling_and_test()) {
dev->last_residency = 0;
entered_state = next_state;
if (current_clr_polling_and_test())
local_irq_enable();
} else {
broadcast = !!(drv->states[next_state].flags &
CPUIDLE_FLAG_TIMER_STOP);
else
arch_cpu_idle();
if (broadcast)
/*
* Tell the time framework to switch
* to a broadcast timer because our
* local timer will be shutdown. If a
* local timer is used from another
* cpu as a broadcast timer, this call
* may fail if it is not available
*/
ret = clockevents_notify(
CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
&dev->cpu);
if (!ret) {
trace_cpu_idle_rcuidle(next_state, dev->cpu);
/*
* Enter the idle state previously
* returned by the governor
* decision. This function will block
* until an interrupt occurs and will
* take care of re-enabling the local
* interrupts
*/
entered_state = cpuidle_enter(drv, dev,
next_state);
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
dev->cpu);
if (broadcast)
clockevents_notify(
CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
&dev->cpu);
/*
* Give the governor an opportunity to reflect on the
* outcome
*/
cpuidle_reflect(dev, entered_state);
}
}
goto exit_idle;
}
/*
* We can't use the cpuidle framework, let's use the default
* idle routine
*/
if (ret)
arch_cpu_idle();
/*
* The idle task must be scheduled, it is pointless to
* go to idle, just update no idle residency and get
* out of this function
*/
if (current_clr_polling_and_test()) {
dev->last_residency = 0;
entered_state = next_state;
local_irq_enable();
goto exit_idle;
}
broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
/*
* Tell the time framework to switch to a broadcast timer
* because our local timer will be shutdown. If a local timer
* is used from another cpu as a broadcast timer, this call may
* fail if it is not available
*/
if (broadcast &&
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
goto use_default;
trace_cpu_idle_rcuidle(next_state, dev->cpu);
/*
* Enter the idle state previously returned by the governor decision.
* This function will block until an interrupt occurs and will take
* care of re-enabling the local interrupts
*/
entered_state = cpuidle_enter(drv, dev, next_state);
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
if (broadcast)
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
/*
* Give the governor an opportunity to reflect on the outcome
*/
cpuidle_reflect(dev, entered_state);
exit_idle:
__current_set_polling();
/*
* It is up to the idle functions to enable back the local
* interrupt
* It is up to the idle functions to reenable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
rcu_idle_exit();
start_critical_timings();
return 0;
}
/*

View File

@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif
/* We start in the dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
/*
 * Return the runqueue reached through the entity's rt_rq pointer,
 * i.e. rt_se->rt_rq->rq.
 */
static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq->rq;
}
void free_rt_sched_group(struct task_group *tg)
{
int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
struct rq *rq = task_rq(p);
return task_rq(p);
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
struct rq *rq = rq_of_rt_se(rt_se);
return &rq->rt;
}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
}
#endif /* CONFIG_SMP */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_se && on_rt_rq(rt_se))
if (!rt_se)
dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
}
/*
 * Report whether this rt_rq counts as throttled.
 *
 * Returns 1 only when rt_throttled is set and rt_nr_boosted is zero;
 * returns 0 otherwise.
 */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	if (!rt_rq->rt_throttled)
		return 0;

	/* A non-zero boosted count overrides the throttled flag. */
	return !rt_rq->rt_nr_boosted;
}
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
if (rt_rq->rt_nr_running)
resched_task(rq_of_rt_rq(rt_rq)->curr);
struct rq *rq = rq_of_rt_rq(rt_rq);
if (!rt_rq->rt_nr_running)
return;
enqueue_top_rt_rq(rt_rq);
resched_task(rq->curr);
}
/*
 * Dequeue hook for an rt_rq in this configuration: it only forwards to
 * dequeue_top_rt_rq() with the same rt_rq.
 */
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
dequeue_top_rt_rq(rt_rq);
}
/*
 * Report whether this rt_rq is throttled. This variant returns the
 * rt_throttled field as-is, with no further condition applied.
 */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return rt_rq->rt_throttled;
}
static inline const struct cpumask *sched_rt_period_mask(void)
@@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
}
}
/*
 * Take the top-level rt_rq's task count out of its runqueue's nr_running.
 *
 * Nothing happens unless rt_rq->rt_queued is set. When it is, the current
 * rt_nr_running is subtracted via sub_nr_running() and rt_queued is
 * cleared, so a second call in a row is a no-op.
 */
static void
dequeue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	/* The argument must be the rt_rq that lives inside that runqueue. */
	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued) {
		/* Being marked queued with an empty runqueue count is a bug. */
		BUG_ON(!rq->nr_running);

		sub_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 0;
	}
}
/*
 * Add the top-level rt_rq's task count to its runqueue's nr_running.
 *
 * The count is added, and rt_queued set, only when the rt_rq is not
 * already marked queued, is not throttled according to rt_rq_throttled(),
 * and has a non-zero rt_nr_running. In every other case the call leaves
 * the state untouched.
 */
static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	/* The argument must be the rt_rq that lives inside that runqueue. */
	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued &&
	    !rt_rq_throttled(rt_rq) && rt_rq->rt_nr_running) {
		add_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 1;
	}
}
#if defined CONFIG_SMP
static void
@@ -1044,13 +1115,24 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
/*
 * Number of tasks that this scheduling entity stands for.
 *
 * When group_rt_rq() returns an rt_rq for the entity, that rt_rq's
 * rt_nr_running is returned; when it returns NULL the entity counts as 1.
 */
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *child_rq = group_rt_rq(rt_se);

	return child_rq ? child_rq->rt_nr_running : 1;
}
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
rt_rq->rt_nr_running++;
rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--;
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
back = rt_se;
}
dequeue_top_rt_rq(rt_rq_of_se(back));
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
/*
 * Enqueue rt_se. The sequence is: dequeue_rt_stack() on the entity, then
 * __enqueue_rt_entity() for every entity visited by
 * for_each_sched_rt_entity(), then enqueue_top_rt_rq() on the rt_rq of the
 * runqueue looked up at entry.
 *
 * @head is passed through unchanged to __enqueue_rt_entity().
 */
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
/*
 * Runqueue is looked up first. NOTE(review): for_each_sched_rt_entity()
 * presumably advances rt_se itself, so this must not move below the
 * loop -- confirm against the macro definition.
 */
struct rq *rq = rq_of_rt_se(rt_se);
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
/* Called unconditionally; enqueue_top_rt_rq() does its own state checks. */
enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rq *rq = rq_of_rt_se(rt_se);
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
enqueue_top_rt_rq(&rq->rt);
}
/*
@@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
dec_nr_running(rq);
}
/*
@@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
if (!rt_rq->rt_nr_running)
return NULL;
if (rt_rq_throttled(rt_rq))
if (!rt_rq->rt_queued)
return NULL;
put_prev_task(rq, prev);
@@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->rt.overloaded && push_rt_task(rq) &&
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
rq != task_rq(p))
push_rt_task(rq) && rq != task_rq(p))
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)

View File

@@ -409,6 +409,8 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
int rt_queued;
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
#endif
};
#ifdef CONFIG_RT_GROUP_SCHED
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}
#else
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
return rt_rq->rt_throttled;
}
#endif
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
extern void init_task_runnable_average(struct task_struct *p);
static inline void inc_nr_running(struct rq *rq)
static inline void add_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running++;
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
#ifdef CONFIG_NO_HZ_FULL
if (rq->nr_running == 2) {
if (prev_nr < 2 && rq->nr_running >= 2) {
if (tick_nohz_full_cpu(rq->cpu)) {
/* Order rq->nr_running write against the IPI */
smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
#endif
}
static inline void dec_nr_running(struct rq *rq)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running--;
rq->nr_running -= count;
}
static inline void rq_last_tick_reset(struct rq *rq)

View File

@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
inc_nr_running(rq);
add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
dec_nr_running(rq);
sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)