Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle are:

   - (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik
     van Riel, Peter Zijlstra et al.  Yay!

   - optimize preemption counter handling: merge the NEED_RESCHED flag
     into the preempt_count variable, by Peter Zijlstra.

   - wait.h fixes and code reorganization from Peter Zijlstra

   - cfs_bandwidth fixes from Ben Segall

   - SMP load-balancer cleanups from Peter Zijstra

   - idle balancer improvements from Jason Low

   - other fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
  ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
  stop_machine: Fix race between stop_two_cpus() and stop_cpus()
  sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus
  sched: Fix asymmetric scheduling for POWER7
  sched: Move completion code from core.c to completion.c
  sched: Move wait code from core.c to wait.c
  sched: Move wait.c into kernel/sched/
  sched/wait: Fix __wait_event_interruptible_lock_irq_timeout()
  sched: Avoid throttle_cfs_rq() racing with period_timer stopping
  sched: Guarantee new group-entities always have weight
  sched: Fix hrtimer_cancel()/rq->lock deadlock
  sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
  sched: Fix race on toggling cfs_bandwidth_used
  sched: Remove extra put_online_cpus() inside sched_setaffinity()
  sched/rt: Fix task_tick_rt() comment
  sched/wait: Fix build breakage
  sched/wait: Introduce prepare_to_wait_event()
  sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too
  sched: Remove get_online_cpus() usage
  sched: Fix race in migrate_swap_stop()
  ...
This commit is contained in:
Linus Torvalds
2013-11-12 10:20:12 +09:00
當前提交 39cf275a1a
共有 117 個文件被更改,包括 3573 次插入1598 次删除

查看文件

@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
extable.o params.o posix-timers.o \
kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
kthread.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o semaphore.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o groups.o lglock.o smpboot.o

查看文件

@@ -10,6 +10,7 @@
#include <linux/mmzone.h>
#include <linux/kbuild.h>
#include <linux/page_cgroup.h>
#include <linux/log2.h>
void foo(void)
{
@@ -17,5 +18,8 @@ void foo(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
/* End of constants */
}

查看文件

@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
void __sched notrace preempt_schedule_context(void)
asmlinkage void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;

查看文件

@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
}
smpboot_park_threads(cpu);
/*
* By now we've cleared cpu_active_mask, wait for all preempt-disabled
* and RCU users of this state to go away such that all new such users
* will observe it.
*
* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
* not imply sync_sched(), so explicitly call both.
*/
#ifdef CONFIG_PREEMPT
synchronize_sched();
#endif
synchronize_rcu();
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */

查看文件

@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
while (!tif_need_resched())
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
cpu_idle_poll();
} else {
current_clr_polling();
if (!need_resched()) {
if (!current_clr_polling_and_test()) {
stop_critical_timings();
rcu_idle_enter();
arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
} else {
local_irq_enable();
}
current_set_polling();
__current_set_polling();
}
arch_cpu_idle_exit();
/*
* We need to test and propagate the TIF_NEED_RESCHED
* bit here because we might not have send the
* reschedule IPI to idle tasks.
*/
if (tif_need_resched())
set_preempt_need_resched();
}
tick_nohz_idle_exit();
schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
current_set_polling();
__current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}

查看文件

@@ -816,9 +816,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->first_nid = NUMA_PTE_SCAN_INIT;
#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
sched_fork(clone_flags, p);
retval = perf_event_init_task(p);
if (retval)

查看文件

@@ -916,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
force_quiescent_state(rsp); /* Kick them all. */
}
/*
* This function really isn't for public consumption, but RCU is special in
* that context switches can allow the state machine to make progress.
*/
extern void resched_cpu(int cpu);
static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
@@ -945,7 +951,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
set_need_resched(); /* kick ourselves to get things going. */
/*
* Attempt to revive the RCU machinery by forcing a context switch.
*
* A context switch would normally allow the RCU state machine to make
* progress and it could be we're stuck in kernel space without context
* switches for an entirely unreasonable amount of time.
*/
resched_cpu(smp_processor_id());
}
static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)

查看文件

@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
obj-y += wait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o

299
kernel/sched/completion.c Normal file
查看文件

@@ -0,0 +1,299 @@
/*
* Generic wait-for-completion handler;
*
* It differs from semaphores in that their default case is the opposite,
* wait_for_completion default blocks whereas semaphore default non-block. The
* interface also makes it easy to 'complete' multiple waiting threads,
* something which isn't entirely natural for semaphores.
*
* But more importantly, the primitive documents the usage. Semaphores would
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
#include <linux/sched.h>
#include <linux/completion.h>
/**
* complete: - signals a single thread waiting on this completion
* @x: holds the state of this particular completion
*
* This will wake up a single thread waiting on this completion. Threads will be
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void complete(struct completion *x)
{
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
/**
* complete_all: - signals all threads waiting on this completion
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void complete_all(struct completion *x)
{
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
static inline long __sched
do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
__add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
if (!x->done)
return timeout;
}
x->done--;
return timeout ?: 1;
}
static inline long __sched
__wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
return __wait_for_common(x, schedule_timeout, timeout, state);
}
static long __sched
wait_for_common_io(struct completion *x, long timeout, int state)
{
return __wait_for_common(x, io_schedule_timeout, timeout, state);
}
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout.
*
* See also similar routines (i.e. wait_for_completion_timeout()) with timeout
* and interrupt capability. Also see complete().
*/
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
/**
* wait_for_completion_timeout: - waits for completion of a task (w/timeout)
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
* wait_for_completion_io: - waits for completion of a task
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
* for IO.
*/
void __sched wait_for_completion_io(struct completion *x)
{
wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_io);
/**
* wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible. The caller is accounted as waiting for IO.
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
{
return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_io_timeout);
/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
* This waits for completion of a specific task to be signaled. It is
* interruptible.
*
* Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
/**
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
* or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
/**
* wait_for_completion_killable: - waits for completion of a task (killable)
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
*
* Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
* or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
return wait_for_common(x, timeout, TASK_KILLABLE);
}
EXPORT_SYMBOL(wait_for_completion_killable_timeout);
/**
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
* Return: 0 if a decrement cannot be done without blocking
* 1 if a decrement succeeded.
*
* If a completion is being used as a counting completion,
* attempt to decrement the counter without blocking. This
* enables us to avoid waiting if the resource the completion
* is protecting is not available.
*/
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
else
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
/**
* completion_done - Test to see if a completion has any waiters
* @x: completion structure
*
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
*/
bool completion_done(struct completion *x)
{
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(completion_done);

查看文件

@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
int cpu;
assert_raw_spin_locked(&task_rq(p)->lock);
lockdep_assert_held(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
if (cpu == smp_processor_id()) {
set_preempt_need_resched();
return;
}
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
}
}
#else /* !CONFIG_SMP */
void resched_task(struct task_struct *p)
{
assert_raw_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
sched_info_queued(p);
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
sched_info_dequeued(p);
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
!(task_preempt_count(p) & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
/*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (p->on_rq) {
struct rq *src_rq, *dst_rq;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
check_preempt_curr(dst_rq, p, 0);
} else {
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
* previous cpu our targer instead of where it really is.
*/
p->wake_cpu = cpu;
}
}
struct migration_swap_arg {
struct task_struct *src_task, *dst_task;
int src_cpu, dst_cpu;
};
static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
int ret = -EAGAIN;
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
double_raw_lock(&arg->src_task->pi_lock,
&arg->dst_task->pi_lock);
double_rq_lock(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
goto unlock;
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
goto unlock;
if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
ret = 0;
unlock:
double_rq_unlock(src_rq, dst_rq);
raw_spin_unlock(&arg->dst_task->pi_lock);
raw_spin_unlock(&arg->src_task->pi_lock);
return ret;
}
/*
* Cross migrate two tasks
*/
int migrate_swap(struct task_struct *cur, struct task_struct *p)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = task_cpu(cur),
.dst_task = p,
.dst_cpu = task_cpu(p),
};
if (arg.src_cpu == arg.dst_cpu)
goto out;
/*
* These three tests are all lockless; this is OK since all of them
* will be re-checked with proper locks held further down the line.
*/
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
goto out;
if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
goto out;
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
return ret;
}
struct migration_arg {
struct task_struct *task;
int dest_cpu;
@@ -1236,9 +1333,9 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
u64 max = 2*sysctl_sched_migration_cost;
u64 max = 2*rq->max_idle_balance_cost;
if (delta > max)
update_avg(&rq->avg_idle, delta);
if (rq->avg_idle > max)
rq->avg_idle = max;
else
update_avg(&rq->avg_idle, delta);
rq->idle_stamp = 0;
}
#endif
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
* this IPI.
*/
if (tif_need_resched())
set_preempt_need_resched();
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*
* __sched_fork() is basic setup used by init_idle() too:
*/
static void __sched_fork(struct task_struct *p)
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
p->mm->numa_next_scan = jiffies;
p->mm->numa_next_reset = jiffies;
p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
p->mm->numa_scan_seq = 0;
}
if (clone_flags & CLONE_VM)
p->numa_preferred_nid = current->numa_preferred_nid;
else
p->numa_preferred_nid = -1;
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
p->numa_faults_buffer = NULL;
INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
/*
* fork()/clone()-time setup:
*/
void sched_fork(struct task_struct *p)
void sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
__sched_fork(p);
__sched_fork(clone_flags, p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT_COUNT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
/* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
trace_sched_switch(prev, next);
sched_info_switch(prev, next);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
task_numa_free(prev);
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
void __kprobes add_preempt_count(int val)
void __kprobes preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
preempt_count() += val;
__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
if (preempt_count() == val)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);
EXPORT_SYMBOL(preempt_count_add);
void __kprobes sub_preempt_count(int val)
void __kprobes preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
preempt_count() -= val;
__preempt_count_sub(val);
}
EXPORT_SYMBOL(sub_preempt_count);
EXPORT_SYMBOL(preempt_count_sub);
#endif
@@ -2430,6 +2543,7 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
return;
do {
add_preempt_count_notrace(PREEMPT_ACTIVE);
__preempt_count_add(PREEMPT_ACTIVE);
__schedule();
sub_preempt_count_notrace(PREEMPT_ACTIVE);
__preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
struct thread_info *ti = current_thread_info();
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
BUG_ON(preempt_count() || !irqs_disabled());
prev_state = exception_enter();
do {
add_preempt_count(PREEMPT_ACTIVE);
__preempt_count_add(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
sub_preempt_count(PREEMPT_ACTIVE);
__preempt_count_sub(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
/**
* __wake_up - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, 0, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
__wake_up_common(q, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
__wake_up_common(q, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
* be migrated to another CPU - ie. the two threads are 'synchronized'
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int wake_flags = WF_SYNC;
if (unlikely(!q))
return;
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
__wake_up_sync_key(q, mode, nr_exclusive, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/**
* complete: - signals a single thread waiting on this completion
* @x: holds the state of this particular completion
*
* This will wake up a single thread waiting on this completion. Threads will be
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void complete(struct completion *x)
{
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
/**
* complete_all: - signals all threads waiting on this completion
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void complete_all(struct completion *x)
{
unsigned long flags;
spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
static inline long __sched
do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
__add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
if (!x->done)
return timeout;
}
x->done--;
return timeout ?: 1;
}
static inline long __sched
__wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
return __wait_for_common(x, schedule_timeout, timeout, state);
}
static long __sched
wait_for_common_io(struct completion *x, long timeout, int state)
{
return __wait_for_common(x, io_schedule_timeout, timeout, state);
}
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout.
*
* See also similar routines (i.e. wait_for_completion_timeout()) with timeout
* and interrupt capability. Also see complete().
*/
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
/**
* wait_for_completion_timeout: - waits for completion of a task (w/timeout)
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
* wait_for_completion_io: - waits for completion of a task
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
* for IO.
*/
void __sched wait_for_completion_io(struct completion *x)
{
wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_io);
/**
* wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible. The caller is accounted as waiting for IO.
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
{
return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_io_timeout);
/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
* This waits for completion of a specific task to be signaled. It is
* interruptible.
*
* Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
/**
* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
* or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
/**
* wait_for_completion_killable: - waits for completion of a task (killable)
* @x: holds the state of this particular completion
*
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
*
* Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
* @timeout: timeout value in jiffies
*
* This waits for either a completion of a specific task to be
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*
* Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
* or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
return wait_for_common(x, timeout, TASK_KILLABLE);
}
EXPORT_SYMBOL(wait_for_completion_killable_timeout);
/**
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
* Return: 0 if a decrement cannot be done without blocking
* 1 if a decrement succeeded.
*
* If a completion is being used as a counting completion,
* attempt to decrement the counter without blocking. This
* enables us to avoid waiting if the resource the completion
* is protecting is not available.
*/
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
else
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
/**
* completion_done - Test to see if a completion has any waiters
* @x: completion structure
*
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
*/
bool completion_done(struct completion *x)
{
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(completion_done);
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
@@ -3598,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
put_online_cpus();
return -ESRCH;
}
@@ -3661,7 +3385,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
put_online_cpus();
return retval;
}
@@ -3706,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -3719,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
put_online_cpus();
return retval;
}
@@ -3794,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
static inline int should_resched(void)
{
return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
}
static void __cond_resched(void)
{
add_preempt_count(PREEMPT_ACTIVE);
__preempt_count_add(PREEMPT_ACTIVE);
__schedule();
sub_preempt_count(PREEMPT_ACTIVE);
__preempt_count_sub(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
@@ -4186,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
__sched_fork(idle);
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -4212,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
task_thread_info(idle)->preempt_count = 0;
init_idle_preempt_count(idle, cpu);
/*
* The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4062,53 @@ fail:
return ret;
}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
int migrate_task_to(struct task_struct *p, int target_cpu)
{
struct migration_arg arg = { p, target_cpu };
int curr_cpu = task_cpu(p);
if (curr_cpu == target_cpu)
return 0;
if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
/*
* Requeue a task on a given node and accurately track the number of NUMA
* tasks on the runqueues
*/
void sched_setnuma(struct task_struct *p, int nid)
{
struct rq *rq;
unsigned long flags;
bool on_rq, running;
rq = task_rq_lock(p, &flags);
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
p->numa_preferred_nid = nid;
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
#endif
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -5119,6 +4882,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
@@ -5130,11 +4896,18 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
@@ -5654,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 1*SD_NUMA
| sd_local_flags(level)
,
.last_balance = jiffies,
@@ -6335,14 +6109,17 @@ void __init sched_init_smp(void)
sched_init_numa();
get_online_cpus();
/*
* There's no userspace yet to cause hotplug operations; hence all the
* cpu masks are stable and all blatant races in the below code cannot
* happen.
*/
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6282,7 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7277,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
/*
* If we need to toggle cfs_bandwidth_used, off->on must occur
* before making related changes, and on->off must occur afterwards
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7303,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);

查看文件

@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/mempolicy.h>
#include "sched.h"
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
if (!p->on_rq || task_cpu(p) != rq_cpu)
if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
cfs_rq->tg->cfs_bandwidth.timer_active);
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
cfs_rq->throttle_count);
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
cpu_clk = local_clock();
local_irq_restore(flags);
SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
#define __P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
#define P(F) \
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
struct mempolicy *pol;
int node, i;
if (p->mm)
P(mm->numa_scan_seq);
task_lock(p);
pol = p->mempolicy;
if (pol && !(pol->flags & MPOL_F_MORON))
pol = NULL;
mpol_get(pol);
task_unlock(p);
SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
for_each_online_node(node) {
for (i = 0; i < 2; i++) {
unsigned long nr_faults = -1;
int cpu_current, home_node;
if (p->numa_faults)
nr_faults = p->numa_faults[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
home_node = (p->numa_preferred_nid == node);
SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
i, node, cpu_current, home_node, nr_faults);
}
}
mpol_put(pol);
#endif
}
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "%-45s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
sched_show_numa(p, m);
}
void proc_sched_set_task(struct task_struct *p)

文件差異過大導致無法顯示 Load Diff

查看文件

@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
/*
* Apply the automatic NUMA scheduling policy. Enabled automatically
* at runtime if running on a NUMA machine. Can be controlled via
* numa_balancing=. Allow PTE scanning to be forced on UMA machines
* for debugging the core machinery.
* numa_balancing=
*/
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA, false)
SCHED_FEAT(NUMA_FORCE, false)
/*
* NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
* higher number of hinting faults are recorded during active load
* balancing.
*/
SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
/*
* NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
* lower number of hinting faults have been recorded. As this has
* the potential to prevent a task ever migrating to a new node
* due to CPU overload it is disabled by default.
*/
SCHED_FEAT(NUMA_RESIST_LOWER, false)
#endif

查看文件

@@ -9,7 +9,7 @@
#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}

查看文件

@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
* if we should look at the mask. It would be a shame
* if we looked at the mask, but the mask was not
* updated yet.
*
* Matched by the barrier in pull_rt_task().
*/
wmb();
smp_wmb();
atomic_inc(&rq->rd->rto_count);
}
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
int cpu;
cpu = task_cpu(p);
if (p->nr_cpus_allowed == 1)
goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
*/
if (curr && unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio) &&
(p->nr_cpus_allowed > 1)) {
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
if (likely(!rt_overloaded(this_rq)))
return 0;
/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
* see overloaded we must also see the rto_mask bit.
*/
smp_rmb();
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = sched_rr_timeslice;
/*
* Requeue to the end of queue if we (and all of our ancestors) are the
* only element on the queue
* Requeue to the end of queue if we (and all of our ancestors) are not
* the only element on the queue
*/
for_each_sched_rt_entity(rt_se) {
if (rt_se->run_list.prev != rt_se->run_list.next) {

查看文件

@@ -6,6 +6,7 @@
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include "cpupri.h"
#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
* remote CPUs use both these fields when doing load calculation.
*/
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
#ifdef CONFIG_NUMA_BALANCING
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
return hsd;
}
static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd;
for_each_domain(cpu, sd) {
if (sd->flags & flag)
break;
}
return sd;
}
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
*/
unsigned int power, power_orig;
unsigned long next_update;
int imbalance; /* XXX unrelated to power but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
p->wake_cpu = cpu;
#endif
}
@@ -974,7 +1005,7 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
{
if (l1 > l2)
swap(l1, l2);
spin_lock(l1);
spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
if (l1 > l2)
swap(l1, l2);
raw_spin_lock(l1);
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
/*
* double_rq_lock - safely lock two runqueues
*
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {

查看文件

@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
static inline void sched_info_dequeued(struct task_struct *t)
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
unsigned long long now = rq_clock(task_rq(t)), delta = 0;
unsigned long long now = rq_clock(rq), delta = 0;
if (unlikely(sched_info_on()))
if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
rq_sched_info_dequeued(task_rq(t), delta);
rq_sched_info_dequeued(rq, delta);
}
/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
static void sched_info_arrive(struct task_struct *t)
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
unsigned long long now = rq_clock(task_rq(t)), delta = 0;
unsigned long long now = rq_clock(rq), delta = 0;
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
rq_sched_info_arrive(task_rq(t), delta);
rq_sched_info_arrive(rq, delta);
}
/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
static inline void sched_info_queued(struct task_struct *t)
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(task_rq(t));
t->sched_info.last_queued = rq_clock(rq);
}
/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct task_struct *t)
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
unsigned long long delta = rq_clock(task_rq(t)) -
unsigned long long delta = rq_clock(rq) -
t->sched_info.last_arrival;
rq_sched_info_depart(task_rq(t), delta);
rq_sched_info_depart(rq, delta);
if (t->state == TASK_RUNNING)
sched_info_queued(t);
sched_info_queued(rq, t);
}
/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
__sched_info_switch(struct rq *rq,
struct task_struct *prev, struct task_struct *next)
{
struct rq *rq = task_rq(prev);
/*
* prev now departs the cpu. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
sched_info_depart(prev);
sched_info_depart(rq, prev);
if (next != rq->idle)
sched_info_arrive(next);
sched_info_arrive(rq, next);
}
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
sched_info_switch(struct rq *rq,
struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
__sched_info_switch(prev, next);
__sched_info_switch(rq, prev, next);
}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_queued(rq, t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
#define sched_info_dequeued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#define sched_info_dequeued(rq, t) do { } while (0)
#define sched_info_depart(rq, t) do { } while (0)
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*

查看文件

@@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}

查看文件

@@ -52,6 +52,109 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
EXPORT_SYMBOL(remove_wait_queue);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
/**
* __wake_up - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, 0, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
__wake_up_common(q, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
__wake_up_common(q, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
* be migrated to another CPU - ie. the two threads are 'synchronized'
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int wake_flags = 1; /* XXX WF_SYNC */
if (unlikely(!q))
return;
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
__wake_up_sync_key(q, mode, nr_exclusive, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
if (signal_pending_state(state, current))
return -ERESTARTSYS;
wait->private = current;
wait->func = autoremove_wake_function;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list)) {
if (wait->flags & WQ_FLAG_EXCLUSIVE)
__add_wait_queue_tail(q, wait);
else
__add_wait_queue(q, wait);
}
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
return 0;
}
EXPORT_SYMBOL(prepare_to_wait_event);
/**
* finish_wait - clean up after waiting in a queue
* @q: waitqueue waited on

查看文件

@@ -99,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
raw_local_irq_save(flags);
/*
* The preempt tracer hooks into add_preempt_count and will break
* The preempt tracer hooks into preempt_count_add and will break
* lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
* is set and before current->softirq_enabled is cleared.
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
preempt_count() += cnt;
__preempt_count_add(cnt);
/*
* Were softirqs turned off above:
*/
@@ -119,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
add_preempt_count(cnt);
preempt_count_add(cnt);
barrier();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -137,7 +137,7 @@ static void __local_bh_enable(unsigned int cnt)
if (softirq_count() == cnt)
trace_softirqs_on(_RET_IP_);
sub_preempt_count(cnt);
preempt_count_sub(cnt);
}
/*
@@ -168,7 +168,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
* Keep preemption disabled until we are done with
* softirq processing:
*/
sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending())) {
/*
@@ -178,7 +178,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
do_softirq();
}
dec_preempt_count();
preempt_count_dec();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_enable();
#endif
@@ -260,7 +260,7 @@ restart:
" exited with %08x?\n", vec_nr,
softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
preempt_count() = prev_count;
preempt_count_set(prev_count);
}
rcu_bh_qs(cpu);
@@ -378,7 +378,7 @@ void irq_exit(void)
account_irq_exit_time(current);
trace_hardirq_exit();
sub_preempt_count(HARDIRQ_OFFSET);
preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();

查看文件

@@ -20,6 +20,7 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/lglock.h>
/*
* Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
/*
* Avoids a race between stop_two_cpus and global stop_cpus, where
* the stoppers could get queued up in reverse order, leading to
* system deadlock. Using an lglock means stop_two_cpus remains
* relatively cheap.
*/
DEFINE_STATIC_LGLOCK(stop_cpus_lock);
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
return done.executed ? done.ret : -ENOENT;
}
/* This controls the threads on each CPU. */
enum multi_stop_state {
/* Dummy starting state for thread. */
MULTI_STOP_NONE,
/* Awaiting everyone to be scheduled. */
MULTI_STOP_PREPARE,
/* Disable interrupts. */
MULTI_STOP_DISABLE_IRQ,
/* Run the function */
MULTI_STOP_RUN,
/* Exit */
MULTI_STOP_EXIT,
};
struct multi_stop_data {
int (*fn)(void *);
void *data;
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
unsigned int num_threads;
const struct cpumask *active_cpus;
enum multi_stop_state state;
atomic_t thread_ack;
};
static void set_state(struct multi_stop_data *msdata,
enum multi_stop_state newstate)
{
/* Reset ack counter. */
atomic_set(&msdata->thread_ack, msdata->num_threads);
smp_wmb();
msdata->state = newstate;
}
/* Last one to ack a state moves to the next state. */
static void ack_state(struct multi_stop_data *msdata)
{
if (atomic_dec_and_test(&msdata->thread_ack))
set_state(msdata, msdata->state + 1);
}
/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
struct multi_stop_data *msdata = data;
enum multi_stop_state curstate = MULTI_STOP_NONE;
int cpu = smp_processor_id(), err = 0;
unsigned long flags;
bool is_active;
/*
* When called from stop_machine_from_inactive_cpu(), irq might
* already be disabled. Save the state and restore it on exit.
*/
local_save_flags(flags);
if (!msdata->active_cpus)
is_active = cpu == cpumask_first(cpu_online_mask);
else
is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
/* Simple state machine */
do {
/* Chill out and ensure we re-read multi_stop_state. */
cpu_relax();
if (msdata->state != curstate) {
curstate = msdata->state;
switch (curstate) {
case MULTI_STOP_DISABLE_IRQ:
local_irq_disable();
hard_irq_disable();
break;
case MULTI_STOP_RUN:
if (is_active)
err = msdata->fn(msdata->data);
break;
default:
break;
}
ack_state(msdata);
}
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);
return err;
}
struct irq_cpu_stop_queue_work_info {
int cpu1;
int cpu2;
struct cpu_stop_work *work1;
struct cpu_stop_work *work2;
};
/*
* This function is always run with irqs and preemption disabled.
* This guarantees that both work1 and work2 get queued, before
* our local migrate thread gets the chance to preempt us.
*/
static void irq_cpu_stop_queue_work(void *arg)
{
struct irq_cpu_stop_queue_work_info *info = arg;
cpu_stop_queue_work(info->cpu1, info->work1);
cpu_stop_queue_work(info->cpu2, info->work2);
}
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
* @cpu2: the other cpu to stop
* @fn: function to execute
* @arg: argument to @fn
*
* Stops both the current and specified CPU and runs @fn on one of them.
*
* returns when both are completed.
*/
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
struct cpu_stop_done done;
struct cpu_stop_work work1, work2;
struct irq_cpu_stop_queue_work_info call_args;
struct multi_stop_data msdata;
preempt_disable();
msdata = (struct multi_stop_data){
.fn = fn,
.data = arg,
.num_threads = 2,
.active_cpus = cpumask_of(cpu1),
};
work1 = work2 = (struct cpu_stop_work){
.fn = multi_cpu_stop,
.arg = &msdata,
.done = &done
};
call_args = (struct irq_cpu_stop_queue_work_info){
.cpu1 = cpu1,
.cpu2 = cpu2,
.work1 = &work1,
.work2 = &work2,
};
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
/*
* If we observe both CPUs active we know _cpu_down() cannot yet have
* queued its stop_machine works and therefore ours will get executed
* first. Or its not either one of our CPUs that's getting unplugged,
* in which case we don't care.
*
* This relies on the stopper workqueues to be FIFO.
*/
if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
preempt_enable();
return -ENOENT;
}
lg_local_lock(&stop_cpus_lock);
/*
* Queuing needs to be done by the lowest numbered CPU, to ensure
* that works are always queued in the same order on every CPU.
* This prevents deadlocks.
*/
smp_call_function_single(min(cpu1, cpu2),
&irq_cpu_stop_queue_work,
&call_args, 0);
lg_local_unlock(&stop_cpus_lock);
preempt_enable();
wait_for_completion(&done.completion);
return done.executed ? done.ret : -ENOENT;
}
/**
* stop_one_cpu_nowait - stop a cpu but don't wait for completion
* @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
preempt_disable();
lg_global_lock(&stop_cpus_lock);
for_each_cpu(cpu, cpumask)
cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
preempt_enable();
lg_global_unlock(&stop_cpus_lock);
}
static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
#ifdef CONFIG_STOP_MACHINE
/* This controls the threads on each CPU. */
enum stopmachine_state {
/* Dummy starting state for thread. */
STOPMACHINE_NONE,
/* Awaiting everyone to be scheduled. */
STOPMACHINE_PREPARE,
/* Disable interrupts. */
STOPMACHINE_DISABLE_IRQ,
/* Run the function */
STOPMACHINE_RUN,
/* Exit */
STOPMACHINE_EXIT,
};
struct stop_machine_data {
int (*fn)(void *);
void *data;
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
unsigned int num_threads;
const struct cpumask *active_cpus;
enum stopmachine_state state;
atomic_t thread_ack;
};
static void set_state(struct stop_machine_data *smdata,
enum stopmachine_state newstate)
{
/* Reset ack counter. */
atomic_set(&smdata->thread_ack, smdata->num_threads);
smp_wmb();
smdata->state = newstate;
}
/* Last one to ack a state moves to the next state. */
static void ack_state(struct stop_machine_data *smdata)
{
if (atomic_dec_and_test(&smdata->thread_ack))
set_state(smdata, smdata->state + 1);
}
/* This is the cpu_stop function which stops the CPU. */
static int stop_machine_cpu_stop(void *data)
{
struct stop_machine_data *smdata = data;
enum stopmachine_state curstate = STOPMACHINE_NONE;
int cpu = smp_processor_id(), err = 0;
unsigned long flags;
bool is_active;
/*
* When called from stop_machine_from_inactive_cpu(), irq might
* already be disabled. Save the state and restore it on exit.
*/
local_save_flags(flags);
if (!smdata->active_cpus)
is_active = cpu == cpumask_first(cpu_online_mask);
else
is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachine_state. */
cpu_relax();
if (smdata->state != curstate) {
curstate = smdata->state;
switch (curstate) {
case STOPMACHINE_DISABLE_IRQ:
local_irq_disable();
hard_irq_disable();
break;
case STOPMACHINE_RUN:
if (is_active)
err = smdata->fn(smdata->data);
break;
default:
break;
}
ack_state(smdata);
}
} while (curstate != STOPMACHINE_EXIT);
local_irq_restore(flags);
return err;
}
int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
struct stop_machine_data smdata = { .fn = fn, .data = data,
.num_threads = num_online_cpus(),
.active_cpus = cpus };
struct multi_stop_data msdata = {
.fn = fn,
.data = data,
.num_threads = num_online_cpus(),
.active_cpus = cpus,
};
if (!stop_machine_initialized) {
/*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
unsigned long flags;
int ret;
WARN_ON_ONCE(smdata.num_threads != 1);
WARN_ON_ONCE(msdata.num_threads != 1);
local_irq_save(flags);
hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
}
/* Set the initial state and stop all online cpus. */
set_state(&smdata, STOPMACHINE_PREPARE);
return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
set_state(&msdata, MULTI_STOP_PREPARE);
return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}
int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
const struct cpumask *cpus)
{
struct stop_machine_data smdata = { .fn = fn, .data = data,
struct multi_stop_data msdata = { .fn = fn, .data = data,
.active_cpus = cpus };
struct cpu_stop_done done;
int ret;
/* Local CPU must be inactive and CPU hotplug in progress. */
BUG_ON(cpu_active(raw_smp_processor_id()));
smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
/* No proper task established and can't sleep - busy wait for lock. */
while (!mutex_trylock(&stop_cpus_mutex))
cpu_relax();
/* Schedule work on other CPUs and execute directly for local CPU */
set_state(&smdata, STOPMACHINE_PREPARE);
set_state(&msdata, MULTI_STOP_PREPARE);
cpu_stop_init_done(&done, num_active_cpus());
queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
&done);
ret = stop_machine_cpu_stop(&smdata);
ret = multi_cpu_stop(&msdata);
/* Busy wait for completion. */
while (!completion_done(&done.completion))

查看文件

@@ -370,13 +370,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_reset",
.data = &sysctl_numa_balancing_scan_period_reset,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_settle_count",
.data = &sysctl_numa_balancing_settle_count,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_migrate_deferred",
.data = &sysctl_numa_balancing_migrate_deferred,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{

查看文件

@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
unsigned long data)
{
int preempt_count = preempt_count();
int count = preempt_count();
#ifdef CONFIG_LOCKDEP
/*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
lock_map_release(&lockdep_map);
if (preempt_count != preempt_count()) {
if (count != preempt_count()) {
WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
fn, preempt_count, preempt_count());
fn, count, preempt_count());
/*
* Restore the preempt count. That gives us a decent
* chance to survive and extract information. If the
* callback kept a lock held, bad luck, but not worse
* than the BUG() we had.
*/
preempt_count() = preempt_count;
preempt_count_set(count);
}
}

查看文件

@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);

查看文件

@@ -124,6 +124,7 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
};
#define TRACE_BUF_SIZE 1024

查看文件

@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
need_resched =
(entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
TRACE_FLAG_PREEMPT_RESCHED)) {
case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'N';
break;
case TRACE_FLAG_NEED_RESCHED:
need_resched = 'n';
break;
case TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'p';
break;
default:
need_resched = '.';
break;
}
hardsoft_irq =
(hardirq && softirq) ? 'H' :
hardirq ? 'h' :