Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits)
  sched: Export account_system_vtime()
  sched: Call tick_check_idle before __irq_enter
  sched: Remove irq time from available CPU power
  sched: Do not account irq time to current task
  x86: Add IRQ_TIME_ACCOUNTING
  sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time
  sched: Add a PF flag for ksoftirqd identification
  sched: Consolidate account_system_vtime extern declaration
  sched: Fix softirq time accounting
  sched: Drop group_capacity to 1 only if local group has extra capacity
  sched: Force balancing on newidle balance if local group has capacity
  sched: Set group_imb only a task can be pulled from the busiest cpu
  sched: Do not consider SCHED_IDLE tasks to be cache hot
  sched: Drop all load weight manipulation for RT tasks
  sched: Create special class for stop/migrate work
  sched: Unindent labels
  sched: Comment updates: fix default latency and granularity numbers
  tracing/sched: Add sched_pi_setprio tracepoint
  sched: Give CPU bound RT tasks preference
  sched: Try not to migrate higher priority RT tasks
  ...
This commit is contained in:
Linus Torvalds
2010-10-21 12:55:43 -07:00
23 changed files with 733 additions and 185 deletions

View File

@@ -25,7 +25,7 @@
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_of(cfs_rq)->clock;
u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
se->exec_start = rq_of(cfs_rq)->clock;
se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
/* re-arm NEWIDLE balancing when moving tasks */
src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
this_rq->idle_stamp = 0;
}
/*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
tsk_cache_hot = task_hot(p, rq->clock, sd);
tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
unsigned long this_has_capacity;
/* Statistics of the busiest group */
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
unsigned long busiest_has_capacity;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
u64 total, available;
total = sched_avg_period() + (rq->clock - rq->age_stamp);
available = total - rq->rt_avg;
if (unlikely(total < rq->rt_avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
available = total - rq->rt_avg;
}
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
unsigned long load, max_cpu_load, min_cpu_load;
unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
if (load > max_cpu_load)
if (load > max_cpu_load) {
max_cpu_load = load;
max_nr_running = rq->nr_running;
}
if (min_cpu_load > load)
min_cpu_load = load;
}
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
sgs->group_capacity =
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
}
/**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
* and move all the excess tasks away.
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks, i.e. nr_running < group_capacity. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling)
if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->busiest_has_capacity = sgs.group_has_capacity;
sds->group_imb = sgs.group_imb;
}
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
return fix_small_imbalance(sds, this_cpu, imbalance);
}
/******* find_busiest_group() helpers end here *********************/
/**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
*
* Note: when doing newidle balance, if the local group has excess
* capacity (i.e. nr_running < group_capacity) and the busiest group
* does not have any capacity, we force a load balance to pull tasks
* to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
if (sds.this_load >= sds.max_load)
goto out_balanced;
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
if (pulled_task) {
this_rq->idle_stamp = 0;
if (pulled_task)
break;
}
}
raw_spin_lock(&this_rq->lock);