sched, timers: move calc_load() to scheduler

Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems. 

The load average calculation is an rough estimate anyway so there is
no real need to protect the readers vs. the update. It's not a problem
when the avenrun array is updated while a reader copies the values.

Instead of iterating over all online CPUs let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().

[ Impact: reduce xtime_lock write locked section ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
This commit is contained in:
Thomas Gleixner
2009-04-11 10:43:41 +02:00
parent 2ff799d3cf
commit dce48a84ad
5 changed files with 81 additions and 66 deletions

View File

@@ -630,6 +630,10 @@ struct rq {
struct list_head migration_queue;
#endif
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
}
#endif
static void calc_load_account_active(struct rq *this_rq);
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void)
return sum;
}
unsigned long nr_active(void)
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
unsigned long i, running = 0, uninterruptible = 0;
load *= exp;
load += active * (FIXED_1 - exp);
return load >> FSHIFT;
}
for_each_online_cpu(i) {
running += cpu_rq(i)->nr_running;
uninterruptible += cpu_rq(i)->nr_uninterruptible;
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
void calc_global_load(void)
{
unsigned long upd = calc_load_update + 10;
long active;
if (time_before(jiffies, upd))
return;
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
avenrun[0] = calc_load(avenrun[0], EXP_1, active);
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
calc_load_update += LOAD_FREQ;
}
/*
* Either called from update_cpu_load() or from a cpu going idle
*/
static void calc_load_account_active(struct rq *this_rq)
{
long nr_active, delta;
nr_active = this_rq->nr_running;
nr_active += (long) this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
this_rq->calc_load_active = nr_active;
atomic_long_add(delta, &calc_load_tasks);
}
if (unlikely((long)uninterruptible < 0))
uninterruptible = 0;
return running + uninterruptible;
}
/*
@@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
if (time_after_eq(jiffies, this_rq->calc_load_update)) {
this_rq->calc_load_update += LOAD_FREQ;
calc_load_account_active(this_rq);
}
}
#ifdef CONFIG_SMP
@@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
}
}
/*
* remove the tasks which were accounted by rq from calc_load_tasks.
*/
static void calc_global_load_remove(struct rq *rq)
{
atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
rq->calc_load_active = 0;
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
calc_global_load_remove(rq);
/*
* No need to migrate the tasks: it was best-effort if
* they didn't take sched_hotcpu_mutex. Just wake up
@@ -9059,6 +9118,8 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9166,6 +9227,9 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
calc_load_update = jiffies + LOAD_FREQ;
/*
* During early bootup we pretend to be a normal task:
*/