Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes are:
- lockless wakeup support for futexes and IPC message queues
(Davidlohr Bueso, Peter Zijlstra)
- Replace spinlocks with atomics in thread_group_cputimer(), to
improve scalability (Jason Low)
- NUMA balancing improvements (Rik van Riel)
- SCHED_DEADLINE improvements (Wanpeng Li)
- clean up and reorganize preemption helpers (Frederic Weisbecker)
- decouple page fault disabling machinery from the preemption
counter, to improve debuggability and robustness (David
Hildenbrand)
- SCHED_DEADLINE documentation updates (Luca Abeni)
- topology CPU masks cleanups (Bartosz Golaszewski)
- /proc/sched_debug improvements (Srikar Dronamraju)"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (79 commits)
sched/deadline: Remove needless parameter in dl_runtime_exceeded()
sched: Remove superfluous resetting of the p->dl_throttled flag
sched/deadline: Drop duplicate init_sched_dl_class() declaration
sched/deadline: Reduce rq lock contention by eliminating locking of non-feasible target
sched/deadline: Make init_sched_dl_class() __init
sched/deadline: Optimize pull_dl_task()
sched/preempt: Add static_key() to preempt_notifiers
sched/preempt: Fix preempt notifiers documentation about hlist_del() within unsafe iteration
sched/stop_machine: Fix deadlock between multiple stop_two_cpus()
sched/debug: Add sum_sleep_runtime to /proc/<pid>/sched
sched/debug: Replace vruntime with wait_sum in /proc/sched_debug
sched/debug: Properly format runnable tasks in /proc/sched_debug
sched/numa: Only consider less busy nodes as numa balancing destinations
Revert 095bebf61a ("sched/numa: Do not move past the balance point if unbalanced")
sched/fair: Prevent throttling in early pick_next_task_fair()
preempt: Reorganize the notrace definitions a bit
preempt: Use preempt_schedule_context() as the official tracing preemption point
sched: Make preempt_schedule_context() function-tracing safe
x86: Remove cpu_sibling_mask() and cpu_core_mask()
x86: Replace cpu_**_mask() with topology_**_cpumask()
...
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;

/* Thread group counters. */
thread_group_cputime_init(sig);

cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;

@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->hardirq_context = 0;
p->softirq_context = 0;
#endif

p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)

/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
* Afterwards, the futex_q must not be accessed. Callers
* must ensure to later call wake_up_q() for the actual
* wakeups to occur.
*/
static void wake_futex(struct futex_q *q)
static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;

@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
return;

/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
* a non-futex wake up happens on another CPU then the task
* might exit and p would dereference a non-existing task
* struct. Prevent this by holding a reference on p across the
* wake up.
* Queue the task for later wakeup for after we've released
* the hb->lock. wake_q_add() grabs reference to p.
*/
get_task_struct(p);

wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as

@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
*/
smp_wmb();
q->lock_ptr = NULL;

wake_up_state(p, TASK_NORMAL);
put_task_struct(p);
}

static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)

@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
WAKE_Q(wake_q);

if (!bitset)
return -EINVAL;

@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;

wake_futex(this);
mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
}

spin_unlock(&hb->lock);
wake_up_q(&wake_q);
out_put_key:
put_futex_key(&key);
out:

@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
WAKE_Q(wake_q);

retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);

@@ -1320,7 +1318,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
wake_futex(this);
mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}

@@ -1334,7 +1332,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
wake_futex(this);
mark_wake_futex(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}

@@ -1344,6 +1342,7 @@ retry_private:

out_unlock:
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
out_put_keys:
put_futex_key(&key2);
out_put_key1:

@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
WAKE_Q(wake_q);

if (requeue_pi) {
/*

@@ -1679,7 +1679,7 @@ retry_private:
* woken by futex_unlock_pi().
*/
if (++task_count <= nr_wake && !requeue_pi) {
wake_futex(this);
mark_wake_futex(&wake_q, this);
continue;
}

@@ -1719,6 +1719,7 @@ retry_private:
out_unlock:
free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);

/*
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);

void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
{
BUG_ON(cpu1 == cpu2);

/* lock in cpu order, just like lg_global_lock */
if (cpu2 < cpu1)
swap(cpu1, cpu2);

preempt_disable();
lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
}

void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
{
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
preempt_enable();
}

void lg_global_lock(struct lglock *lg)
{
int i;
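lg_double_lock() above avoids an ABBA deadlock between concurrent callers by always taking the two per-CPU locks in ascending CPU order, which is also what the stop_two_cpus() deadlock fix in this series relies on. A user-space sketch of the same ordering rule, with pthread mutexes standing in for the kernel's arch spinlocks and a hypothetical per-slot lock array:

#include <pthread.h>

#define NR_SLOTS 8

/* One lock per slot; each element must be set up with pthread_mutex_init(). */
static pthread_mutex_t slot_lock[NR_SLOTS];

/* Take both locks in index order so that callers locking (a, b) and (b, a)
 * can never end up each holding one lock and waiting for the other.
 * a != b is assumed, matching the BUG_ON() in lg_double_lock(). */
static void double_lock(int a, int b)
{
	if (b < a) {
		int tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&slot_lock[a]);
	pthread_mutex_lock(&slot_lock[b]);
}

static void double_unlock(int a, int b)
{
	pthread_mutex_unlock(&slot_lock[a]);
	pthread_mutex_unlock(&slot_lock[b]);
}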
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif

obj-y += core.o proc.o clock.o cputime.o
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
@@ -1,5 +1,3 @@
#ifdef CONFIG_SCHED_AUTOGROUP

#include "sched.h"

#include <linux/proc_fs.h>

@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)

p->signal->autogroup = autogroup_kref_get(ag);

if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;

for_each_thread(p, t)

@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */

#endif /* CONFIG_SCHED_AUTOGROUP */
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);

if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
typeof(ti->flags) old, val = READ_ONCE(ti->flags);

for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))

@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;

/*
* Atomically grab the task, if ->wake_q is !nil already it means
* its already queued (either by us or someone else) and will get the
* wakeup due to that.
*
* This cmpxchg() implies a full barrier, which pairs with the write
* barrier implied by the wakeup in wake_up_list().
*/
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;

get_task_struct(task);

/*
* The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;

while (node != WAKE_Q_TAIL) {
struct task_struct *task;

task = container_of(node, struct task_struct, wake_q);
BUG_ON(!task);
/* task can safely be re-inserted now */
node = node->next;
task->wake_q.next = NULL;

/*
* wake_up_process() implies a wmb() to pair with the queueing
* in wake_q_add() so as not to miss wakeups.
*/
wake_up_process(task);
put_task_struct(task);
}
}
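wake_q_add() and wake_up_q() above are the building blocks of the lockless-wakeup work: a caller queues tasks on an on-stack wake list while it still holds its own lock, then issues the possibly expensive wakeups after dropping it. A hedged sketch of the intended calling pattern follows; struct my_object and its waiter list are hypothetical, while WAKE_Q(), wake_q_add() and wake_up_q() are the interfaces added in this series.

struct my_waiter {
	struct list_head node;
	struct task_struct *task;
};

struct my_object {
	spinlock_t lock;
	struct list_head waiters;
};

static void wake_all_waiters(struct my_object *obj)
{
	struct my_waiter *w;
	WAKE_Q(wake_q);				/* on-stack wake queue */

	spin_lock(&obj->lock);
	list_for_each_entry(w, &obj->waiters, node)
		wake_q_add(&wake_q, w->task);	/* only queues and takes a reference */
	spin_unlock(&obj->lock);

	wake_up_q(&wake_q);			/* actual wakeups, outside the lock */
}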
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*

@@ -2105,12 +2151,15 @@ void wake_up_new_task(struct task_struct *p)

#ifdef CONFIG_PREEMPT_NOTIFIERS

static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;

/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
static_key_slow_inc(&preempt_notifier_key);
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

@@ -2119,15 +2168,16 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
* This is safe to call from within a preemption notifier.
* This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
hlist_del(&notifier->link);
static_key_slow_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;

@@ -2135,9 +2185,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
if (static_key_false(&preempt_notifier_key))
__fire_sched_in_preempt_notifiers(curr);
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
__fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;

@@ -2145,13 +2201,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}

static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
if (static_key_false(&preempt_notifier_key))
__fire_sched_out_preempt_notifiers(curr, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
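The preempt-notifier hunks above wrap the notifier walk in a static_key so that kernels with no registered notifiers pay only for a patched-out branch in the scheduler fast path. Below is a minimal sketch of that pattern for a hypothetical hook; the struct and function names are invented, but static_key_slow_inc()/static_key_slow_dec() and static_key_false() are used the same way as in the hunks above.

struct my_hook {
	struct hlist_node link;
	void (*fn)(void);
};

static HLIST_HEAD(my_hooks);
static struct static_key my_hook_key = STATIC_KEY_INIT_FALSE;

void my_hook_register(struct my_hook *h)
{
	static_key_slow_inc(&my_hook_key);	/* arm the fast-path branch */
	hlist_add_head(&h->link, &my_hooks);
}

void my_hook_unregister(struct my_hook *h)
{
	hlist_del(&h->link);
	static_key_slow_dec(&my_hook_key);	/* disarm when the last user leaves */
}

static void __fire_my_hooks(void)
{
	struct my_hook *h;

	hlist_for_each_entry(h, &my_hooks, link)
		h->fn();
}

static __always_inline void fire_my_hooks(void)
{
	/* Compiles to a no-op jump until the key is enabled. */
	if (static_key_false(&my_hook_key))
		__fire_my_hooks();
}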
@@ -2397,9 +2461,9 @@ unsigned long nr_iowait_cpu(int cpu)

void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
struct rq *this = this_rq();
*nr_waiters = atomic_read(&this->nr_iowait);
*load = this->cpu_load[0];
struct rq *rq = this_rq();
*nr_waiters = atomic_read(&rq->nr_iowait);
*load = rq->load.weight;
}

#ifdef CONFIG_SMP

@@ -2497,6 +2561,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);

perf_event_task_tick();

@@ -2525,7 +2590,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
unsigned long next, now = ACCESS_ONCE(jiffies);
unsigned long next, now = READ_ONCE(jiffies);

next = rq->last_sched_tick + HZ;

@@ -2726,9 +2791,7 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
* WARNING: all callers must re-check need_resched() afterward and reschedule
* accordingly in case an event triggered the need for rescheduling (such as
* an interrupt waking up a task) while preemption was disabled in __schedule().
* WARNING: must be called with preemption disabled!
*/
static void __sched __schedule(void)
{

@@ -2737,7 +2800,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;

preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch();

@@ -2801,8 +2863,6 @@ static void __sched __schedule(void)
raw_spin_unlock_irq(&rq->lock);

post_schedule(rq);

sched_preempt_enable_no_resched();
}

static inline void sched_submit_work(struct task_struct *tsk)

@@ -2823,7 +2883,9 @@ asmlinkage __visible void __sched schedule(void)

sched_submit_work(tsk);
do {
preempt_disable();
__schedule();
sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);

@@ -2862,15 +2924,14 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
__preempt_count_add(PREEMPT_ACTIVE);
preempt_active_enter();
__schedule();
__preempt_count_sub(PREEMPT_ACTIVE);
preempt_active_exit();

/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
barrier();
} while (need_resched());
}

@@ -2894,9 +2955,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);

#ifdef CONFIG_CONTEXT_TRACKING
/**
* preempt_schedule_context - preempt_schedule called by tracing
* preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing

@@ -2909,7 +2969,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
asmlinkage __visible void __sched notrace preempt_schedule_context(void)
asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;

@@ -2917,7 +2977,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;

do {
__preempt_count_add(PREEMPT_ACTIVE);
/*
* Use raw __prempt_count() ops that don't call function.
* We can't call functions before disabling preemption which
* disarm preemption tracing recursions.
*/
__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
barrier();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing

@@ -2927,12 +2993,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
__schedule();
exception_exit(prev_ctx);

__preempt_count_sub(PREEMPT_ACTIVE);
barrier();
__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_CONTEXT_TRACKING */
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);

#endif /* CONFIG_PREEMPT */

@@ -2952,17 +3017,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();

do {
__preempt_count_add(PREEMPT_ACTIVE);
preempt_active_enter();
local_irq_enable();
__schedule();
local_irq_disable();
__preempt_count_sub(PREEMPT_ACTIVE);

/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
barrier();
preempt_active_exit();
} while (need_resched());

exception_exit(prev_state);

@@ -3040,7 +3099,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;

@@ -5314,7 +5372,7 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};

static void __cpuinit set_cpu_rq_start_time(void)
static void set_cpu_rq_start_time(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);

@@ -7734,11 +7792,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}

static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;

rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;

while (new > (old = ACCESS_ONCE(*counter)))
while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}
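cputime_advance() above implements "move this counter forward, never backward" without a lock: re-read the counter, then retry the compare-and-swap until either the store lands or another CPU has already advanced the counter past the new value. The same idiom written with C11 atomics, as an illustration only (the function name is hypothetical):

#include <stdatomic.h>
#include <stdint.h>

/* Advance *counter to at least 'new', never moving it backwards,
 * without taking a lock.  Concurrent callers race via compare-exchange. */
static void counter_advance(_Atomic uint64_t *counter, uint64_t new)
{
	uint64_t old = atomic_load_explicit(counter, memory_order_relaxed);

	while (new > old) {
		/* On failure 'old' is reloaded with the current value, so the
		 * loop re-checks whether the counter still needs advancing. */
		if (atomic_compare_exchange_weak(counter, &old, new))
			break;
	}
}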
@@ -640,7 +640,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
}

static
int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
}

@@ -684,7 +684,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);

dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))

@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);

rcu_read_lock();
curr = ACCESS_ONCE(rq->curr); /* unlocked access */
curr = READ_ONCE(rq->curr); /* unlocked access */

/*
* If we are dealing with a -deadline task, we must

@@ -1012,7 +1012,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
(p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);

if (target != -1)
if (target != -1 &&
dl_time_before(p->dl.deadline,
cpu_rq(target)->dl.earliest_dl.curr))
cpu = target;
}
rcu_read_unlock();

@@ -1230,6 +1232,32 @@ next_node:
return NULL;
}

/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
struct task_struct *p = NULL;

if (!has_pushable_dl_tasks(rq))
return NULL;

next_node:
if (next_node) {
p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);

if (pick_dl_task(rq, p, cpu))
return p;

next_node = rb_next(next_node);
goto next_node;
}

return NULL;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);

static int find_later_rq(struct task_struct *task)

@@ -1333,6 +1361,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)

later_rq = cpu_rq(cpu);

if (!dl_time_before(task->dl.deadline,
later_rq->dl.earliest_dl.curr)) {
/*
* Target rq has tasks of equal or earlier deadline,
* retrying does not release any lock and is unlikely
* to yield a different result.
*/
later_rq = NULL;
break;
}

/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||

@@ -1514,7 +1553,7 @@ static int pull_dl_task(struct rq *this_rq)
if (src_rq->dl.dl_nr_running <= 1)
goto skip;

p = pick_next_earliest_dl_task(src_rq, this_cpu);
p = pick_earliest_pushable_dl_task(src_rq, this_cpu);

/*
* We found a task to be pulled if:

@@ -1659,7 +1698,7 @@ static void rq_offline_dl(struct rq *rq)
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}

void init_sched_dl_class(void)
void __init init_sched_dl_class(void)
{
unsigned int i;

@@ -132,12 +132,14 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(p->se.vruntime),
SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L,
SPLIT_NS(p->se.sum_exec_runtime),
0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d", task_node(p));

@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
" task PID tree-key switches prio"
" exec-runtime sum-exec sum-sleep\n"
" wait-time sum-exec sum-sleep\n"
"------------------------------------------------------"
"----------------------------------------------------\n");

@@ -582,6 +584,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;

#ifdef CONFIG_SCHEDSTATS
PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
|
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
|
||||
*
|
||||
* This idea comes from the SD scheduler of Con Kolivas:
|
||||
*/
|
||||
static int get_update_sysctl_factor(void)
|
||||
static unsigned int get_update_sysctl_factor(void)
|
||||
{
|
||||
unsigned int cpus = min_t(int, num_online_cpus(), 8);
|
||||
unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
|
||||
unsigned int factor;
|
||||
|
||||
switch (sysctl_sched_tunable_scaling) {
|
||||
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
int factor = get_update_sysctl_factor();
|
||||
unsigned int factor = get_update_sysctl_factor();
|
||||
|
||||
if (ret || !write)
|
||||
return ret;
|
||||
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
|
||||
|
||||
static unsigned int task_scan_min(struct task_struct *p)
|
||||
{
|
||||
unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
|
||||
unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
|
||||
unsigned int scan, floor;
|
||||
unsigned int windows = 1;
|
||||
|
||||
@@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env,
|
||||
static bool load_too_imbalanced(long src_load, long dst_load,
|
||||
struct task_numa_env *env)
|
||||
{
|
||||
long imb, old_imb;
|
||||
long orig_src_load, orig_dst_load;
|
||||
long src_capacity, dst_capacity;
|
||||
long orig_src_load;
|
||||
long load_a, load_b;
|
||||
long moved_load;
|
||||
long imb;
|
||||
|
||||
/*
|
||||
* The load is corrected for the CPU capacity available on each node.
|
||||
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
|
||||
dst_capacity = env->dst_stats.compute_capacity;
|
||||
|
||||
/* We care about the slope of the imbalance, not the direction. */
|
||||
load_a = dst_load;
|
||||
load_b = src_load;
|
||||
if (load_a < load_b)
|
||||
swap(load_a, load_b);
|
||||
if (dst_load < src_load)
|
||||
swap(dst_load, src_load);
|
||||
|
||||
/* Is the difference below the threshold? */
|
||||
imb = load_a * src_capacity * 100 -
|
||||
load_b * dst_capacity * env->imbalance_pct;
|
||||
imb = dst_load * src_capacity * 100 -
|
||||
src_load * dst_capacity * env->imbalance_pct;
|
||||
if (imb <= 0)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The imbalance is above the allowed threshold.
|
||||
* Allow a move that brings us closer to a balanced situation,
|
||||
* without moving things past the point of balance.
|
||||
* Compare it with the old imbalance.
|
||||
*/
|
||||
orig_src_load = env->src_stats.load;
|
||||
orig_dst_load = env->dst_stats.load;
|
||||
|
||||
/*
|
||||
* In a task swap, there will be one load moving from src to dst,
|
||||
* and another moving back. This is the net sum of both moves.
|
||||
* A simple task move will always have a positive value.
|
||||
* Allow the move if it brings the system closer to a balanced
|
||||
* situation, without crossing over the balance point.
|
||||
*/
|
||||
moved_load = orig_src_load - src_load;
|
||||
if (orig_dst_load < orig_src_load)
|
||||
swap(orig_dst_load, orig_src_load);
|
||||
|
||||
if (moved_load > 0)
|
||||
/* Moving src -> dst. Did we overshoot balance? */
|
||||
return src_load * dst_capacity < dst_load * src_capacity;
|
||||
else
|
||||
/* Moving dst -> src. Did we overshoot balance? */
|
||||
return dst_load * src_capacity < src_load * dst_capacity;
|
||||
old_imb = orig_dst_load * src_capacity * 100 -
|
||||
orig_src_load * dst_capacity * env->imbalance_pct;
|
||||
|
||||
/* Would this change make things worse? */
|
||||
return (imb > old_imb);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1409,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
|
||||
}
|
||||
}
|
||||
|
||||
/* Only move tasks to a NUMA node less busy than the current node. */
|
||||
static bool numa_has_capacity(struct task_numa_env *env)
|
||||
{
|
||||
struct numa_stats *src = &env->src_stats;
|
||||
struct numa_stats *dst = &env->dst_stats;
|
||||
|
||||
if (src->has_free_capacity && !dst->has_free_capacity)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Only consider a task move if the source has a higher load
|
||||
* than the destination, corrected for CPU capacity on each node.
|
||||
*
|
||||
* src->load dst->load
|
||||
* --------------------- vs ---------------------
|
||||
* src->compute_capacity dst->compute_capacity
|
||||
*/
|
||||
if (src->load * dst->compute_capacity >
|
||||
dst->load * src->compute_capacity)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int task_numa_migrate(struct task_struct *p)
|
||||
{
|
||||
struct task_numa_env env = {
|
||||
@@ -1463,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
update_numa_stats(&env.dst_stats, env.dst_nid);
|
||||
|
||||
/* Try to find a spot on the preferred nid. */
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
if (numa_has_capacity(&env))
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
|
||||
/*
|
||||
* Look at other nodes in these cases:
|
||||
@@ -1494,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
env.dist = dist;
|
||||
env.dst_nid = nid;
|
||||
update_numa_stats(&env.dst_stats, env.dst_nid);
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
if (numa_has_capacity(&env))
|
||||
task_numa_find_cpu(&env, taskimp, groupimp);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1794,7 +1809,12 @@ static void task_numa_placement(struct task_struct *p)
|
||||
u64 runtime, period;
|
||||
spinlock_t *group_lock = NULL;
|
||||
|
||||
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
|
||||
/*
|
||||
* The p->mm->numa_scan_seq field gets updated without
|
||||
* exclusive access. Use READ_ONCE() here to ensure
|
||||
* that the field is read in a single access:
|
||||
*/
|
||||
seq = READ_ONCE(p->mm->numa_scan_seq);
|
||||
if (p->numa_scan_seq == seq)
|
||||
return;
|
||||
p->numa_scan_seq = seq;
|
||||
@@ -1938,7 +1958,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
|
||||
tsk = READ_ONCE(cpu_rq(cpu)->curr);
|
||||
|
||||
if (!cpupid_match_pid(tsk, cpupid))
|
||||
goto no_join;
|
||||
@@ -2107,7 +2127,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
||||
|
||||
static void reset_ptenuma_scan(struct task_struct *p)
|
||||
{
|
||||
ACCESS_ONCE(p->mm->numa_scan_seq)++;
|
||||
/*
|
||||
* We only did a read acquisition of the mmap sem, so
|
||||
* p->mm->numa_scan_seq is written to without exclusive access
|
||||
* and the update is not guaranteed to be atomic. That's not
|
||||
* much of an issue though, since this is just used for
|
||||
* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
|
||||
* expensive, to avoid any form of compiler optimizations:
|
||||
*/
|
||||
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
|
||||
p->mm->numa_scan_offset = 0;
|
||||
}
|
||||
|
||||
@@ -4323,6 +4351,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/*
|
||||
* per rq 'load' arrray crap; XXX kill this.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The exact cpuload at various idx values, calculated at every tick would be
|
||||
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
|
||||
* on nth tick when cpu may be busy, then we have:
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* decay_load_missed() below does efficient calculation of
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
|
||||
*
|
||||
* The calculation is approximated on a 128 point scale.
|
||||
* degrade_zero_ticks is the number of ticks after which load at any
|
||||
* particular idx is approximated to be zero.
|
||||
* degrade_factor is a precomputed table, a row for each load idx.
|
||||
* Each column corresponds to degradation factor for a power of two ticks,
|
||||
* based on 128 point scale.
|
||||
* Example:
|
||||
* row 2, col 3 (=12) says that the degradation at load idx 2 after
|
||||
* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
|
||||
*
|
||||
* With this power of 2 load factors, we can degrade the load n times
|
||||
* by looking at 1 bits in n and doing as many mult/shift instead of
|
||||
* n mult/shifts needed by the exact degradation.
|
||||
*/
|
||||
#define DEGRADE_SHIFT 7
|
||||
static const unsigned char
|
||||
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
|
||||
static const unsigned char
|
||||
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{64, 32, 8, 0, 0, 0, 0, 0},
|
||||
{96, 72, 40, 12, 1, 0, 0},
|
||||
{112, 98, 75, 43, 15, 1, 0},
|
||||
{120, 112, 98, 76, 45, 16, 2} };
|
||||
|
||||
/*
|
||||
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
|
||||
* would be when CPU is idle and so we just decay the old load without
|
||||
* adding any new load.
|
||||
*/
|
||||
static unsigned long
|
||||
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
|
||||
{
|
||||
int j = 0;
|
||||
|
||||
if (!missed_updates)
|
||||
return load;
|
||||
|
||||
if (missed_updates >= degrade_zero_ticks[idx])
|
||||
return 0;
|
||||
|
||||
if (idx == 1)
|
||||
return load >> missed_updates;
|
||||
|
||||
while (missed_updates) {
|
||||
if (missed_updates % 2)
|
||||
load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
|
||||
|
||||
missed_updates >>= 1;
|
||||
j++;
|
||||
}
|
||||
return load;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update rq->cpu_load[] statistics. This function is usually called every
|
||||
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
|
||||
* every tick. We fix it up based on jiffies.
|
||||
*/
|
||||
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
|
||||
unsigned long pending_updates)
|
||||
{
|
||||
int i, scale;
|
||||
|
||||
this_rq->nr_load_updates++;
|
||||
|
||||
/* Update our load: */
|
||||
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
|
||||
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
||||
unsigned long old_load, new_load;
|
||||
|
||||
/* scale is effectively 1 << i now, and >> i divides by scale */
|
||||
|
||||
old_load = this_rq->cpu_load[i];
|
||||
old_load = decay_load_missed(old_load, pending_updates - 1, i);
|
||||
new_load = this_load;
|
||||
/*
|
||||
* Round up the averaging division if load is increasing. This
|
||||
* prevents us from getting stuck on 9 if the load is 10, for
|
||||
* example.
|
||||
*/
|
||||
if (new_load > old_load)
|
||||
new_load += scale - 1;
|
||||
|
||||
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
||||
}
|
||||
|
||||
sched_avg_update(this_rq);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
/*
|
||||
* There is no sane way to deal with nohz on smp when using jiffies because the
|
||||
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
|
||||
* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
|
||||
*
|
||||
* Therefore we cannot use the delta approach from the regular tick since that
|
||||
* would seriously skew the load calculation. However we'll make do for those
|
||||
* updates happening while idle (nohz_idle_balance) or coming out of idle
|
||||
* (tick_nohz_idle_exit).
|
||||
*
|
||||
* This means we might still be one tick off for nohz periods.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Called from nohz_idle_balance() to update the load ratings before doing the
|
||||
* idle balance.
|
||||
*/
|
||||
static void update_idle_cpu_load(struct rq *this_rq)
|
||||
{
|
||||
unsigned long curr_jiffies = READ_ONCE(jiffies);
|
||||
unsigned long load = this_rq->cfs.runnable_load_avg;
|
||||
unsigned long pending_updates;
|
||||
|
||||
/*
|
||||
* bail if there's load or we're actually up-to-date.
|
||||
*/
|
||||
if (load || curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
|
||||
__update_cpu_load(this_rq, load, pending_updates);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
|
||||
*/
|
||||
void update_cpu_load_nohz(void)
|
||||
{
|
||||
struct rq *this_rq = this_rq();
|
||||
unsigned long curr_jiffies = READ_ONCE(jiffies);
|
||||
unsigned long pending_updates;
|
||||
|
||||
if (curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
if (pending_updates) {
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
/*
|
||||
* We were idle, this means load 0, the current load might be
|
||||
* !0 due to remote wakeups and the sort.
|
||||
*/
|
||||
__update_cpu_load(this_rq, 0, pending_updates);
|
||||
}
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
}
|
||||
#endif /* CONFIG_NO_HZ */
|
||||
|
||||
/*
|
||||
* Called from scheduler_tick()
|
||||
*/
|
||||
void update_cpu_load_active(struct rq *this_rq)
|
||||
{
|
||||
unsigned long load = this_rq->cfs.runnable_load_avg;
|
||||
/*
|
||||
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
|
||||
*/
|
||||
this_rq->last_load_update_tick = jiffies;
|
||||
__update_cpu_load(this_rq, load, 1);
|
||||
}
|
||||
|
||||
/* Used instead of source_load when we know the type == 0 */
|
||||
static unsigned long weighted_cpuload(const int cpu)
|
||||
{
|
||||
@@ -4375,7 +4586,7 @@ static unsigned long capacity_orig_of(int cpu)
|
||||
static unsigned long cpu_avg_load_per_task(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
|
||||
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
|
||||
unsigned long load_avg = rq->cfs.runnable_load_avg;
|
||||
|
||||
if (nr_running)
|
||||
@@ -5126,18 +5337,21 @@ again:
|
||||
* entity, update_curr() will update its vruntime, otherwise
|
||||
* forget we've ever seen it.
|
||||
*/
|
||||
if (curr && curr->on_rq)
|
||||
update_curr(cfs_rq);
|
||||
else
|
||||
curr = NULL;
|
||||
if (curr) {
|
||||
if (curr->on_rq)
|
||||
update_curr(cfs_rq);
|
||||
else
|
||||
curr = NULL;
|
||||
|
||||
/*
|
||||
* This call to check_cfs_rq_runtime() will do the throttle and
|
||||
* dequeue its entity in the parent(s). Therefore the 'simple'
|
||||
* nr_running test will indeed be correct.
|
||||
*/
|
||||
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
|
||||
goto simple;
|
||||
/*
|
||||
* This call to check_cfs_rq_runtime() will do the
|
||||
* throttle and dequeue its entity in the parent(s).
|
||||
* Therefore the 'simple' nr_running test will indeed
|
||||
* be correct.
|
||||
*/
|
||||
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
|
||||
goto simple;
|
||||
}
|
||||
|
||||
se = pick_next_entity(cfs_rq, curr);
|
||||
cfs_rq = group_cfs_rq(se);
|
||||
@@ -5467,10 +5681,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
/* Returns true if the destination node has incurred more faults */
|
||||
/*
|
||||
* Returns true if the destination node is the preferred node.
|
||||
* Needs to match fbq_classify_rq(): if there is a runnable task
|
||||
* that is not on its preferred node, we should identify it.
|
||||
*/
|
||||
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
||||
unsigned long src_faults, dst_faults;
|
||||
int src_nid, dst_nid;
|
||||
|
||||
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
|
||||
@@ -5484,29 +5703,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
||||
if (src_nid == dst_nid)
|
||||
return false;
|
||||
|
||||
if (numa_group) {
|
||||
/* Task is already in the group's interleave set. */
|
||||
if (node_isset(src_nid, numa_group->active_nodes))
|
||||
return false;
|
||||
|
||||
/* Task is moving into the group's interleave set. */
|
||||
if (node_isset(dst_nid, numa_group->active_nodes))
|
||||
return true;
|
||||
|
||||
return group_faults(p, dst_nid) > group_faults(p, src_nid);
|
||||
}
|
||||
|
||||
/* Encourage migration to the preferred node. */
|
||||
if (dst_nid == p->numa_preferred_nid)
|
||||
return true;
|
||||
|
||||
return task_faults(p, dst_nid) > task_faults(p, src_nid);
|
||||
/* Migrating away from the preferred node is bad. */
|
||||
if (src_nid == p->numa_preferred_nid)
|
||||
return false;
|
||||
|
||||
if (numa_group) {
|
||||
src_faults = group_faults(p, src_nid);
|
||||
dst_faults = group_faults(p, dst_nid);
|
||||
} else {
|
||||
src_faults = task_faults(p, src_nid);
|
||||
dst_faults = task_faults(p, dst_nid);
|
||||
}
|
||||
|
||||
return dst_faults > src_faults;
|
||||
}
|
||||
|
||||
|
||||
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
||||
{
|
||||
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
||||
unsigned long src_faults, dst_faults;
|
||||
int src_nid, dst_nid;
|
||||
|
||||
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
|
||||
@@ -5521,23 +5741,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
||||
if (src_nid == dst_nid)
|
||||
return false;
|
||||
|
||||
if (numa_group) {
|
||||
/* Task is moving within/into the group's interleave set. */
|
||||
if (node_isset(dst_nid, numa_group->active_nodes))
|
||||
return false;
|
||||
|
||||
/* Task is moving out of the group's interleave set. */
|
||||
if (node_isset(src_nid, numa_group->active_nodes))
|
||||
return true;
|
||||
|
||||
return group_faults(p, dst_nid) < group_faults(p, src_nid);
|
||||
}
|
||||
|
||||
/* Migrating away from the preferred node is always bad. */
|
||||
/* Migrating away from the preferred node is bad. */
|
||||
if (src_nid == p->numa_preferred_nid)
|
||||
return true;
|
||||
|
||||
return task_faults(p, dst_nid) < task_faults(p, src_nid);
|
||||
/* Encourage migration to the preferred node. */
|
||||
if (dst_nid == p->numa_preferred_nid)
|
||||
return false;
|
||||
|
||||
if (numa_group) {
|
||||
src_faults = group_faults(p, src_nid);
|
||||
dst_faults = group_faults(p, dst_nid);
|
||||
} else {
|
||||
src_faults = task_faults(p, src_nid);
|
||||
dst_faults = task_faults(p, dst_nid);
|
||||
}
|
||||
|
||||
return dst_faults < src_faults;
|
||||
}
|
||||
|
||||
#else
|
||||
@@ -6037,8 +6257,8 @@ static unsigned long scale_rt_capacity(int cpu)
|
||||
* Since we're reading these variables without serialization make sure
|
||||
* we read them once before doing sanity checks on them.
|
||||
*/
|
||||
age_stamp = ACCESS_ONCE(rq->age_stamp);
|
||||
avg = ACCESS_ONCE(rq->rt_avg);
|
||||
age_stamp = READ_ONCE(rq->age_stamp);
|
||||
avg = READ_ONCE(rq->rt_avg);
|
||||
delta = __rq_clock_broken(rq) - age_stamp;
|
||||
|
||||
if (unlikely(delta < 0))
|
||||
|
@@ -1,7 +1,9 @@
|
||||
/*
|
||||
* kernel/sched/proc.c
|
||||
* kernel/sched/loadavg.c
|
||||
*
|
||||
* Kernel load calculations, forked from sched/core.c
|
||||
* This file contains the magic bits required to compute the global loadavg
|
||||
* figure. Its a silly number but people think its important. We go through
|
||||
* great pains to make it work on big machines and tickless kernels.
|
||||
*/
|
||||
|
||||
#include <linux/export.h>
|
||||
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
|
||||
long nr_active, delta = 0;
|
||||
|
||||
nr_active = this_rq->nr_running;
|
||||
nr_active += (long) this_rq->nr_uninterruptible;
|
||||
nr_active += (long)this_rq->nr_uninterruptible;
|
||||
|
||||
if (nr_active != this_rq->calc_load_active) {
|
||||
delta = nr_active - this_rq->calc_load_active;
|
||||
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
|
||||
delta = calc_load_fold_active(this_rq);
|
||||
if (delta) {
|
||||
int idx = calc_load_write_idx();
|
||||
|
||||
atomic_long_add(delta, &calc_load_idle[idx]);
|
||||
}
|
||||
}
|
||||
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
|
||||
{
|
||||
unsigned long result = 1UL << frac_bits;
|
||||
|
||||
if (n) for (;;) {
|
||||
if (n & 1) {
|
||||
result *= x;
|
||||
result += 1UL << (frac_bits - 1);
|
||||
result >>= frac_bits;
|
||||
if (n) {
|
||||
for (;;) {
|
||||
if (n & 1) {
|
||||
result *= x;
|
||||
result += 1UL << (frac_bits - 1);
|
||||
result >>= frac_bits;
|
||||
}
|
||||
n >>= 1;
|
||||
if (!n)
|
||||
break;
|
||||
x *= x;
|
||||
x += 1UL << (frac_bits - 1);
|
||||
x >>= frac_bits;
|
||||
}
|
||||
n >>= 1;
|
||||
if (!n)
|
||||
break;
|
||||
x *= x;
|
||||
x += 1UL << (frac_bits - 1);
|
||||
x >>= frac_bits;
|
||||
}
|
||||
|
||||
return result;
|
||||
@@ -285,7 +290,6 @@ static unsigned long
|
||||
calc_load_n(unsigned long load, unsigned long exp,
|
||||
unsigned long active, unsigned int n)
|
||||
{
|
||||
|
||||
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
|
||||
}
|
||||
|
||||
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
|
||||
/*
|
||||
* calc_load - update the avenrun load estimates 10 ticks after the
|
||||
* CPUs have updated calc_load_tasks.
|
||||
*
|
||||
* Called from the global timer code.
|
||||
*/
|
||||
void calc_global_load(unsigned long ticks)
|
||||
{
|
||||
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
|
||||
}
|
||||
|
||||
/*
|
||||
* Called from update_cpu_load() to periodically update this CPU's
|
||||
* Called from scheduler_tick() to periodically update this CPU's
|
||||
* active count.
|
||||
*/
|
||||
static void calc_load_account_active(struct rq *this_rq)
|
||||
void calc_global_load_tick(struct rq *this_rq)
|
||||
{
|
||||
long delta;
|
||||
|
||||
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
|
||||
|
||||
this_rq->calc_load_update += LOAD_FREQ;
|
||||
}
|
||||
|
||||
/*
|
||||
* End of global load-average stuff
|
||||
*/
|
||||
|
||||
/*
|
||||
* The exact cpuload at various idx values, calculated at every tick would be
|
||||
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
|
||||
* on nth tick when cpu may be busy, then we have:
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
|
||||
*
|
||||
* decay_load_missed() below does efficient calculation of
|
||||
* load = ((2^idx - 1) / 2^idx)^(n-1) * load
|
||||
* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
|
||||
*
|
||||
* The calculation is approximated on a 128 point scale.
|
||||
* degrade_zero_ticks is the number of ticks after which load at any
|
||||
* particular idx is approximated to be zero.
|
||||
* degrade_factor is a precomputed table, a row for each load idx.
|
||||
* Each column corresponds to degradation factor for a power of two ticks,
|
||||
* based on 128 point scale.
|
||||
* Example:
|
||||
* row 2, col 3 (=12) says that the degradation at load idx 2 after
|
||||
* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
|
||||
*
|
||||
* With this power of 2 load factors, we can degrade the load n times
|
||||
* by looking at 1 bits in n and doing as many mult/shift instead of
|
||||
* n mult/shifts needed by the exact degradation.
|
||||
*/
|
||||
#define DEGRADE_SHIFT 7
|
||||
static const unsigned char
|
||||
degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
|
||||
static const unsigned char
|
||||
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
|
||||
{0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{64, 32, 8, 0, 0, 0, 0, 0},
|
||||
{96, 72, 40, 12, 1, 0, 0},
|
||||
{112, 98, 75, 43, 15, 1, 0},
|
||||
{120, 112, 98, 76, 45, 16, 2} };
|
||||
|
||||
/*
|
||||
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
|
||||
* would be when CPU is idle and so we just decay the old load without
|
||||
* adding any new load.
|
||||
*/
|
||||
static unsigned long
|
||||
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
|
||||
{
|
||||
int j = 0;
|
||||
|
||||
if (!missed_updates)
|
||||
return load;
|
||||
|
||||
if (missed_updates >= degrade_zero_ticks[idx])
|
||||
return 0;
|
||||
|
||||
if (idx == 1)
|
||||
return load >> missed_updates;
|
||||
|
||||
while (missed_updates) {
|
||||
if (missed_updates % 2)
|
||||
load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
|
||||
|
||||
missed_updates >>= 1;
|
||||
j++;
|
||||
}
|
||||
return load;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update rq->cpu_load[] statistics. This function is usually called every
|
||||
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
|
||||
* every tick. We fix it up based on jiffies.
|
||||
*/
|
||||
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
|
||||
unsigned long pending_updates)
|
||||
{
|
||||
int i, scale;
|
||||
|
||||
this_rq->nr_load_updates++;
|
||||
|
||||
/* Update our load: */
|
||||
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
|
||||
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
||||
unsigned long old_load, new_load;
|
||||
|
||||
/* scale is effectively 1 << i now, and >> i divides by scale */
|
||||
|
||||
old_load = this_rq->cpu_load[i];
|
||||
old_load = decay_load_missed(old_load, pending_updates - 1, i);
|
||||
new_load = this_load;
|
||||
/*
|
||||
* Round up the averaging division if load is increasing. This
|
||||
* prevents us from getting stuck on 9 if the load is 10, for
|
||||
* example.
|
||||
*/
|
||||
if (new_load > old_load)
|
||||
new_load += scale - 1;
|
||||
|
||||
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
|
||||
}
|
||||
|
||||
sched_avg_update(this_rq);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long get_rq_runnable_load(struct rq *rq)
|
||||
{
|
||||
return rq->cfs.runnable_load_avg;
|
||||
}
|
||||
#else
|
||||
static inline unsigned long get_rq_runnable_load(struct rq *rq)
|
||||
{
|
||||
return rq->load.weight;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
/*
|
||||
* There is no sane way to deal with nohz on smp when using jiffies because the
|
||||
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
|
||||
* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
|
||||
*
|
||||
* Therefore we cannot use the delta approach from the regular tick since that
|
||||
* would seriously skew the load calculation. However we'll make do for those
|
||||
* updates happening while idle (nohz_idle_balance) or coming out of idle
|
||||
* (tick_nohz_idle_exit).
|
||||
*
|
||||
* This means we might still be one tick off for nohz periods.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Called from nohz_idle_balance() to update the load ratings before doing the
|
||||
* idle balance.
|
||||
*/
|
||||
void update_idle_cpu_load(struct rq *this_rq)
|
||||
{
|
||||
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
|
||||
unsigned long load = get_rq_runnable_load(this_rq);
|
||||
unsigned long pending_updates;
|
||||
|
||||
/*
|
||||
* bail if there's load or we're actually up-to-date.
|
||||
*/
|
||||
if (load || curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
|
||||
__update_cpu_load(this_rq, load, pending_updates);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
|
||||
*/
|
||||
void update_cpu_load_nohz(void)
|
||||
{
|
||||
struct rq *this_rq = this_rq();
|
||||
unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
|
||||
unsigned long pending_updates;
|
||||
|
||||
if (curr_jiffies == this_rq->last_load_update_tick)
|
||||
return;
|
||||
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
|
||||
if (pending_updates) {
|
||||
this_rq->last_load_update_tick = curr_jiffies;
|
||||
/*
|
||||
* We were idle, this means load 0, the current load might be
|
||||
* !0 due to remote wakeups and the sort.
|
||||
*/
|
||||
__update_cpu_load(this_rq, 0, pending_updates);
|
||||
}
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
}
|
||||
#endif /* CONFIG_NO_HZ */
|
||||
|
||||
/*
|
||||
* Called from scheduler_tick()
|
||||
*/
|
||||
void update_cpu_load_active(struct rq *this_rq)
|
||||
{
|
||||
unsigned long load = get_rq_runnable_load(this_rq);
|
||||
/*
|
||||
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
|
||||
*/
|
||||
this_rq->last_load_update_tick = jiffies;
|
||||
__update_cpu_load(this_rq, load, 1);
|
||||
|
||||
calc_load_account_active(this_rq);
|
||||
}
|
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);

rcu_read_lock();
curr = ACCESS_ONCE(rq->curr); /* unlocked access */
curr = READ_ONCE(rq->curr); /* unlocked access */

/*
* If the current task on @p's runqueue is an RT task, then
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
|
||||
extern unsigned long calc_load_update;
|
||||
extern atomic_long_t calc_load_tasks;
|
||||
|
||||
extern void calc_global_load_tick(struct rq *this_rq);
|
||||
extern long calc_load_fold_active(struct rq *this_rq);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
extern void update_cpu_load_active(struct rq *this_rq);
|
||||
#else
|
||||
static inline void update_cpu_load_active(struct rq *this_rq) { }
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Helpers for converting nanosecond timing to jiffy resolution
|
||||
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

 static inline u64 __rq_clock_broken(struct rq *rq)
 {
-	return ACCESS_ONCE(rq->clock);
+	return READ_ONCE(rq->clock);
 }

 static inline u64 rq_clock(struct rq *rq)
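Several hunks in this series are the same mechanical conversion from ACCESS_ONCE() to READ_ONCE(). Both force the compiler to emit exactly one load and not cache or re-fetch the value, which is what lockless readers such as __rq_clock_broken() rely on; READ_ONCE() additionally handles non-scalar types. A stand-alone sketch of the pattern (hypothetical flag, not kernel code):

/* Hypothetical example: spin until another CPU sets the flag. */
static int flag;

static void wait_for_flag(void)
{
        /*
         * A plain "while (!flag)" would let the compiler load "flag" once
         * and spin on the cached value forever. READ_ONCE() forces a
         * fresh load on every iteration.
         */
        while (!READ_ONCE(flag))
                cpu_relax();
}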
@@ -1284,7 +1290,6 @@ extern void update_max_interval(void);
 extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);

 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
@@ -1298,8 +1303,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);

 unsigned long to_ratio(u64 period, u64 runtime);

-extern void update_idle_cpu_load(struct rq *this_rq);
-
 extern void init_task_runnable_average(struct task_struct *p);

 static inline void add_nr_running(struct rq *rq, unsigned count)
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

-	if (!cputimer->running)
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running))
 		return false;

 	/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;

-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
 }

 /**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;

-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
 }

 /**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;

-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
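All three accounting helpers follow the same recipe: a field that used to be guarded by cputimer->lock becomes an atomic64_t that writers bump with atomic64_add() and readers sample with atomic64_read(), so the per-tick accounting path no longer bounces a spinlock cache line between CPUs. The shape of the conversion on a hypothetical counter (sketch, not the kernel structures):

/* Before: every update serializes on the lock. */
struct hits_locked {
        raw_spinlock_t lock;
        u64 count;
};

static void hits_locked_add(struct hits_locked *h, u64 n)
{
        raw_spin_lock(&h->lock);
        h->count += n;
        raw_spin_unlock(&h->lock);
}

/* After: updates are lockless and scale with the number of CPUs. */
struct hits_atomic {
        atomic64_t count;
};

static void hits_atomic_add(struct hits_atomic *h, u64 n)
{
        atomic64_add(n, &h->count);
}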
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);

 __sched int bit_wait_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);

 __sched int bit_wait_io_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
  * RETURNS:
  * %true if @mask is set, %false if made noop because @task was dying.
  */
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~JOBCTL_PENDING_MASK);

@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
 	struct signal_struct *sig = current->signal;

 	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
-		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;

 		/* signr will be recorded in task->jobctl for retries */
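task_struct::jobctl is an unsigned long, so the helpers and locals that carry JOBCTL_* masks are widened from unsigned int to unsigned long to match: mask values then travel at the field's full width instead of being narrowed through an int parameter. A tiny illustration of the mismatch being avoided (hypothetical struct and helpers, not the kernel's):

/* Sketch: keep the mask parameter as wide as the flag word it targets. */
struct task_like {
        unsigned long jobctl;           /* full-width flag word */
};

static void set_mask_narrow(struct task_like *t, unsigned int mask)
{
        t->jobctl |= mask;              /* any bits >= 32 were already lost */
}

static void set_mask_wide(struct task_like *t, unsigned long mask)
{
        t->jobctl |= mask;              /* whole flag word round-trips intact */
}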
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
 	return err;
 }

-struct irq_cpu_stop_queue_work_info {
-	int cpu1;
-	int cpu2;
-	struct cpu_stop_work *work1;
-	struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
-	struct irq_cpu_stop_queue_work_info *info = arg;
-	cpu_stop_queue_work(info->cpu1, info->work1);
-	cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 {
 	struct cpu_stop_done done;
 	struct cpu_stop_work work1, work2;
-	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;

 	preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		.done = &done
 	};

-	call_args = (struct irq_cpu_stop_queue_work_info){
-		.cpu1 = cpu1,
-		.cpu2 = cpu2,
-		.work1 = &work1,
-		.work2 = &work2,
-	};
-
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);

@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		return -ENOENT;
 	}

-	lg_local_lock(&stop_cpus_lock);
-	/*
-	 * Queuing needs to be done by the lowest numbered CPU, to ensure
-	 * that works are always queued in the same order on every CPU.
-	 * This prevents deadlocks.
-	 */
-	smp_call_function_single(min(cpu1, cpu2),
-				 &irq_cpu_stop_queue_work,
-				 &call_args, 1);
-	lg_local_unlock(&stop_cpus_lock);
+	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+	cpu_stop_queue_work(cpu1, &work1);
+	cpu_stop_queue_work(cpu2, &work2);
+	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
 	preempt_enable();

 	wait_for_completion(&done.completion);
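The removed comment describes the old strategy: have the lowest-numbered CPU queue both works via smp_call_function_single(), so the queueing order is the same everywhere. The fix gets the same guarantee more directly by holding both per-CPU stopper locks across the two queue operations (lg_double_lock()), so racing stop_two_cpus() callers can no longer interleave their queueing and end up each waiting on the other's stopper. A generic sketch of the "take two locks in a fixed global order" discipline (illustrative only, not the lglock implementation):

/* Sketch: acquire two per-CPU spinlocks in ascending CPU order. */
static void double_lock_ordered(spinlock_t *locks, int cpu1, int cpu2)
{
        int first  = min(cpu1, cpu2);
        int second = max(cpu1, cpu2);

        /* Every caller takes the lower-numbered lock first: no ABBA cycle. */
        spin_lock(&locks[first]);
        spin_lock_nested(&locks[second], SINGLE_DEPTH_NESTING);
}

static void double_unlock_ordered(spinlock_t *locks, int cpu1, int cpu2)
{
        spin_unlock(&locks[max(cpu1, cpu2)]);
        spin_unlock(&locks[min(cpu1, cpu2)]);
}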
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }

-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}

-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
+	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
+	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}

-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+					 struct task_cputime_atomic *atomic_times)
+{
+	times->utime = atomic64_read(&atomic_times->utime);
+	times->stime = atomic64_read(&atomic_times->stime);
+	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }

 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;

-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }

 /*
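__update_gt_cputime() is a lock-free "monotonic maximum": the freshly summed value only replaces the stored one if it is larger, and a failed cmpxchg means another CPU changed the value in the meantime, so the comparison is redone against the fresh value instead of taking cputimer->lock. The same pattern written as self-contained C11 for illustration (not kernel code):

#include <stdatomic.h>
#include <stdint.h>

/* Raise *slot to "candidate" if candidate is larger, without a lock. */
static void atomic_store_max(_Atomic uint64_t *slot, uint64_t candidate)
{
        uint64_t cur = atomic_load(slot);

        while (candidate > cur) {
                /*
                 * On failure, compare_exchange refreshes "cur" with the value
                 * another thread just stored and the loop re-checks the
                 * comparison -- the same job as the kernel's goto-retry.
                 */
                if (atomic_compare_exchange_weak(slot, &cur, candidate))
                        break;
        }
}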
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;

-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;

 	return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
 	/*
 	 * Check for the special case thread timers.
 	 */
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);

 		if (hard != RLIM_INFINITY &&
 		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }

-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;

-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }

 static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGPROF);
 	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
 			 SIGVTALRM);
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
 		cputime_t x;
 		if (psecs >= hard) {
 			/*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}

 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;

-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);

 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);

 	/*