Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits)
  sched: Export account_system_vtime()
  sched: Call tick_check_idle before __irq_enter
  sched: Remove irq time from available CPU power
  sched: Do not account irq time to current task
  x86: Add IRQ_TIME_ACCOUNTING
  sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time
  sched: Add a PF flag for ksoftirqd identification
  sched: Consolidate account_system_vtime extern declaration
  sched: Fix softirq time accounting
  sched: Drop group_capacity to 1 only if local group has extra capacity
  sched: Force balancing on newidle balance if local group has capacity
  sched: Set group_imb only a task can be pulled from the busiest cpu
  sched: Do not consider SCHED_IDLE tasks to be cache hot
  sched: Drop all load weight manipulation for RT tasks
  sched: Create special class for stop/migrate work
  sched: Unindent labels
  sched: Comment updates: fix default latency and granularity numbers
  tracing/sched: Add sched_pi_setprio tracepoint
  sched: Give CPU bound RT tasks preference
  sched: Try not to migrate higher priority RT tasks
  ...
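Much of the diff below replaces rq->clock with rq->clock_task in the CFS accounting paths (update_curr(), update_stats_curr_start(), the task_hot() check), so that time spent in hard/soft interrupt context is no longer charged to whichever task happened to be running, per the irq-time accounting commits listed above. The toy program below is only a sketch of that idea under invented names (toy_rq, toy_task, toy_tick, toy_update_curr) and arbitrary microsecond units; it is not kernel code.

#include <stdio.h>

struct toy_rq {
        unsigned long long clock;       /* wall-clock time of the runqueue */
        unsigned long long clock_task;  /* clock minus time spent in irq context */
};

struct toy_task {
        unsigned long long exec_start;  /* snapshot of clock_task at last update */
        unsigned long long sum_exec_runtime;
};

/* Advance the runqueue clocks; irq_time advances clock but not clock_task. */
static void toy_tick(struct toy_rq *rq, unsigned long long delta,
                     unsigned long long irq_time)
{
        rq->clock += delta;
        rq->clock_task += delta - irq_time;
}

/* Charge the current task only for time measured on clock_task. */
static void toy_update_curr(struct toy_rq *rq, struct toy_task *t)
{
        unsigned long long delta_exec = rq->clock_task - t->exec_start;

        t->sum_exec_runtime += delta_exec;
        t->exec_start = rq->clock_task;
}

int main(void)
{
        struct toy_rq rq = { 0, 0 };
        struct toy_task t = { 0, 0 };

        toy_tick(&rq, 1000, 300);       /* 1000us elapsed, 300us of it in irqs */
        toy_update_curr(&rq, &t);

        /* The task is charged 700us of runtime, not 1000us. */
        printf("clock=%llu clock_task=%llu task runtime=%llu\n",
               rq.clock, rq.clock_task, t.sum_exec_runtime);
        return 0;
}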
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
         struct sched_entity *curr = cfs_rq->curr;
-        u64 now = rq_of(cfs_rq)->clock;
+        u64 now = rq_of(cfs_rq)->clock_task;
         unsigned long delta_exec;
 
         if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
         /*
          * We are starting a new run period:
          */
-        se->exec_start = rq_of(cfs_rq)->clock;
+        se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
 /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
         check_preempt_curr(this_rq, p, 0);
+
+        /* re-arm NEWIDLE balancing when moving tasks */
+        src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+        this_rq->idle_stamp = 0;
 }
 
 /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) too many balance attempts have failed.
          */
 
-        tsk_cache_hot = task_hot(p, rq->clock, sd);
+        tsk_cache_hot = task_hot(p, rq->clock_task, sd);
         if (!tsk_cache_hot ||
                 sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
         unsigned long this_load;
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
+        unsigned long this_has_capacity;
 
         /* Statistics of the busiest group */
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
+        unsigned long busiest_has_capacity;
 
         int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
         int group_imb; /* Is there an imbalance in the group ? */
+        int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
         u64 total, available;
 
         total = sched_avg_period() + (rq->clock - rq->age_stamp);
-        available = total - rq->rt_avg;
+
+        if (unlikely(total < rq->rt_avg)) {
+                /* Ensures that power won't end up being negative */
+                available = 0;
+        } else {
+                available = total - rq->rt_avg;
+        }
 
         if (unlikely((s64)total < SCHED_LOAD_SCALE))
                 total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)
 {
-        unsigned long load, max_cpu_load, min_cpu_load;
+        unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
         int i;
         unsigned int balance_cpu = -1, first_idle_cpu = 0;
         unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         /* Tally up the load of all CPUs in the group */
         max_cpu_load = 0;
         min_cpu_load = ~0UL;
+        max_nr_running = 0;
 
         for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                 struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         load = target_load(i, load_idx);
                 } else {
                         load = source_load(i, load_idx);
-                        if (load > max_cpu_load)
+                        if (load > max_cpu_load) {
                                 max_cpu_load = load;
+                                max_nr_running = rq->nr_running;
+                        }
                         if (min_cpu_load > load)
                                 min_cpu_load = load;
                 }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         if (sgs->sum_nr_running)
                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                 sgs->group_imb = 1;
 
-        sgs->group_capacity =
-                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+        sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
         if (!sgs->group_capacity)
                 sgs->group_capacity = fix_small_capacity(sd, group);
+
+        if (sgs->group_capacity > sgs->sum_nr_running)
+                sgs->group_has_capacity = 1;
 }
 
 /**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                 /*
                  * In case the child domain prefers tasks go to siblings
                  * first, lower the sg capacity to one so that we'll try
-                 * and move all the excess tasks away.
+                 * and move all the excess tasks away. We lower the capacity
+                 * of a group only if the local group has the capacity to fit
+                 * these excess tasks, i.e. nr_running < group_capacity. The
+                 * extra check prevents the case where you always pull from the
+                 * heaviest group when it is already under-utilized (possible
+                 * with a large weight task outweighs the tasks on the system).
                  */
-                if (prefer_sibling)
+                if (prefer_sibling && !local_group && sds->this_has_capacity)
                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
                 if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
+                        sds->this_has_capacity = sgs.group_has_capacity;
                 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
+                        sds->busiest_has_capacity = sgs.group_has_capacity;
                         sds->group_imb = sgs.group_imb;
                 }
 
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                 return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * 4) This group is more busy than the avg busieness at this
          *    sched_domain.
          * 5) The imbalance is within the specified limit.
+         *
+         * Note: when doing newidle balance, if the local group has excess
+         * capacity (i.e. nr_running < group_capacity) and the busiest group
+         * does not have any capacity, we force a load balance to pull tasks
+         * to the local group. In this case, we skip past checks 3, 4 and 5.
          */
         if (!(*balance))
                 goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
 
+        /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+        if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                        !sds.busiest_has_capacity)
+                goto force_balance;
+
         if (sds.this_load >= sds.max_load)
                 goto out_balanced;
 
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                 goto out_balanced;
 
+force_balance:
         /* Looks like there is an imbalance. Compute it */
         calculate_imbalance(&sds, this_cpu, imbalance);
         return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
 
         if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
-                sd->nr_balance_failed++;
+                /*
+                 * Increment the failure counter only on periodic balance.
+                 * We do not want newidle balance, which can be very
+                 * frequent, pollute the failure counter causing
+                 * excessive cache_hot migrations and active balances.
+                 */
+                if (idle != CPU_NEWLY_IDLE)
+                        sd->nr_balance_failed++;
 
                 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
                                         this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
-                if (pulled_task) {
-                        this_rq->idle_stamp = 0;
+                if (pulled_task)
                         break;
-                }
         }
 
         raw_spin_lock(&this_rq->lock);
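For reference, the capacity logic that the expanded comments in update_sd_lb_stats() and find_busiest_group() describe comes down to two small predicates: lower a sibling group's capacity to one only when the local group can absorb the excess tasks, and force a newidle balance when the local group has spare capacity while the busiest group has none. The sketch below condenses those two checks using invented types and helpers (toy_group_stats, toy_effective_capacity, toy_force_newidle_balance); it is an illustration, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

struct toy_group_stats {
        unsigned long nr_running;
        unsigned long capacity;         /* number of tasks the group can hold */
        bool has_capacity;              /* nr_running < capacity */
};

/*
 * Lower a sibling group's capacity to 1 (so its excess tasks get pulled)
 * only when the local group actually has room for them.
 */
static unsigned long toy_effective_capacity(bool prefer_sibling, bool local_group,
                                            bool this_has_capacity,
                                            unsigned long group_capacity)
{
        if (prefer_sibling && !local_group && this_has_capacity)
                return group_capacity < 1UL ? group_capacity : 1UL;
        return group_capacity;
}

/*
 * On newidle balance, force a pull when the local group has spare capacity
 * and the busiest group has none, even if the usual load checks would pass.
 */
static bool toy_force_newidle_balance(bool newly_idle,
                                      const struct toy_group_stats *local,
                                      const struct toy_group_stats *busiest)
{
        return newly_idle && local->has_capacity && !busiest->has_capacity;
}

int main(void)
{
        struct toy_group_stats local = { .nr_running = 0, .capacity = 2, .has_capacity = true };
        struct toy_group_stats busiest = { .nr_running = 3, .capacity = 2, .has_capacity = false };

        printf("effective sibling capacity: %lu\n",
               toy_effective_capacity(true, false, local.has_capacity, busiest.capacity));
        printf("force newidle balance: %d\n",
               toy_force_newidle_balance(true, &local, &busiest));
        return 0;
}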