Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - support Intel Turbo Boost Max Technology 3.0 (TBM3) by introducig a
     notion of 'better cores', which the scheduler will prefer to
     schedule single threaded workloads on. (Tim Chen, Srinivas
     Pandruvada)

   - enhance the handling of asymmetric capacity CPUs further (Morten
     Rasmussen)

   - improve/fix load handling when moving tasks between task groups
     (Vincent Guittot)

   - simplify and clean up the cputime code (Stanislaw Gruszka)

   - improve mass fork()ed task spread a.k.a. hackbench speedup (Vincent
     Guittot)

   - make struct kthread kmalloc()ed and related fixes (Oleg Nesterov)

   - add uaccess atomicity debugging (when using access_ok() in the
     wrong context), under CONFIG_DEBUG_ATOMIC_SLEEP=y (Peter Zijlstra)

   - implement various fixes, cleanups and other enhancements (Daniel
     Bristot de Oliveira, Martin Schwidefsky, Rafael J. Wysocki)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  sched/core: Use load_avg for selecting idlest group
  sched/core: Fix find_idlest_group() for fork
  kthread: Don't abuse kthread_create_on_cpu() in __kthread_create_worker()
  kthread: Don't use to_live_kthread() in kthread_[un]park()
  kthread: Don't use to_live_kthread() in kthread_stop()
  Revert "kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function"
  kthread: Make struct kthread kmalloc'ed
  x86/uaccess, sched/preempt: Verify access_ok() context
  sched/x86: Make CONFIG_SCHED_MC_PRIO=y easier to enable
  sched/x86: Change CONFIG_SCHED_ITMT to CONFIG_SCHED_MC_PRIO
  x86/sched: Use #include <linux/mutex.h> instead of #include <asm/mutex.h>
  cpufreq/intel_pstate: Use CPPC to get max performance
  acpi/bus: Set _OSC for diverse core support
  acpi/bus: Enable HWP CPPC objects
  x86/sched: Add SD_ASYM_PACKING flags to x86 ITMT CPU
  x86/sysctl: Add sysctl for ITMT scheduling feature
  x86: Enable Intel Turbo Boost Max Technology 3.0
  x86/topology: Define x86's arch_update_cpu_topology
  sched: Extend scheduler's asym packing
  sched/fair: Clean up the tunable parameter definitions
  ...
This commit is contained in:
Linus Torvalds
2016-12-12 12:15:10 -08:00
36 changed files with 1155 additions and 409 deletions

View File

@@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
* re-schedule is in progress), and as such you're allowed to do
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
* If (@state & @p->state) @p->state = TASK_RUNNING.
*
* Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
* If the task was not queued/runnable, also place it back on a runqueue.
*
* Atomic against schedule() which would dequeue a task, also see
* set_current_state().
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
@@ -5707,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %*pbl",
cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -6184,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6301,7 +6303,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
WARN_ON(!sg);
do {
int cpu, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
for_each_cpu(cpu, sched_group_cpus(sg)) {
if (max_cpu < 0)
max_cpu = cpu;
else if (sched_asym_prefer(cpu, max_cpu))
max_cpu = cpu;
}
sg->asym_prefer_cpu = max_cpu;
next:
sg = sg->next;
} while (sg != sd->groups);
@@ -7602,6 +7619,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*

View File

@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
cputime64_to_clock_t(val[stat]));
(long long)cputime64_to_clock_t(val[stat]));
}
return 0;

View File

@@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index,
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in user space since the last update
* @cputime_scaled: cputime scaled by cpu frequency
*/
void account_user_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
void account_user_time(struct task_struct *p, cputime_t cputime)
{
int index;
/* Add user time to process. */
p->utime += cputime;
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
@@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
* Account guest cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in virtual machine since the last update
* @cputime_scaled: cputime scaled by cpu frequency
*/
static void account_guest_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
static void account_guest_time(struct task_struct *p, cputime_t cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
/* Add guest time to process. */
p->utime += cputime;
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
p->gtime += cputime;
@@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
* Account system cpu time to a process and desired cpustat field
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in kernel space since the last update
* @cputime_scaled: cputime scaled by cpu frequency
* @target_cputime64: pointer to cpustat field that has to be updated
* @index: pointer to cpustat field that has to be updated
*/
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled, int index)
void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
{
/* Add system time to process. */
p->stime += cputime;
p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
@@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
* @cputime: the cpu time spent in kernel space since the last update
* @cputime_scaled: cputime scaled by cpu frequency
*/
void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
cputime_t cputime)
{
int index;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
account_guest_time(p, cputime);
return;
}
@@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
index = CPUTIME_SYSTEM;
__account_system_time(p, cputime, cputime_scaled, index);
__account_system_time(p, cputime, index);
}
/*
@@ -390,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
u64 cputime = (__force u64) cputime_one_jiffy * ticks;
cputime_t scaled, other;
cputime_t other;
/*
* When returning from idle, many ticks can get accounted at
@@ -403,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
if (other >= cputime)
return;
cputime -= other;
scaled = cputime_to_scaled(cputime);
if (this_cpu_ksoftirqd() == p) {
/*
@@ -411,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
__account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
__account_system_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime, scaled);
account_user_time(p, cputime);
} else if (p == rq->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime, scaled);
account_guest_time(p, cputime);
} else {
__account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
__account_system_time(p, cputime, CPUTIME_SYSTEM);
}
}
@@ -502,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
cputime_t cputime, scaled, steal;
cputime_t cputime, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -520,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
cputime -= steal;
scaled = cputime_to_scaled(cputime);
if (user_tick)
account_user_time(p, cputime, scaled);
account_user_time(p, cputime);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
}
@@ -746,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk)
{
cputime_t delta_cpu = get_vtime_delta(tsk);
account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
account_system_time(tsk, irq_count(), delta_cpu);
}
void vtime_account_system(struct task_struct *tsk)
@@ -767,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk)
tsk->vtime_snap_whence = VTIME_SYS;
if (vtime_delta(tsk)) {
delta_cpu = get_vtime_delta(tsk);
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
account_user_time(tsk, delta_cpu);
}
write_seqcount_end(&tsk->vtime_seqcount);
}
@@ -863,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
static void
fetch_task_cputime(struct task_struct *t,
cputime_t *u_dst, cputime_t *s_dst,
cputime_t *u_src, cputime_t *s_src,
cputime_t *udelta, cputime_t *sdelta)
void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
cputime_t delta;
unsigned int seq;
unsigned long long delta;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
*stime = t->stime;
return;
}
do {
*udelta = 0;
*sdelta = 0;
seq = read_seqcount_begin(&t->vtime_seqcount);
if (u_dst)
*u_dst = *u_src;
if (s_dst)
*s_dst = *s_src;
*utime = t->utime;
*stime = t->stime;
/* Task is sleeping, nothing to add */
if (t->vtime_snap_whence == VTIME_INACTIVE ||
is_idle_task(t))
if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
continue;
delta = vtime_delta(t);
@@ -894,54 +878,10 @@ fetch_task_cputime(struct task_struct *t,
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
*udelta = delta;
} else {
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
*utime += delta;
else if (t->vtime_snap_whence == VTIME_SYS)
*stime += delta;
} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
cputime_t udelta, sdelta;
if (!vtime_accounting_enabled()) {
if (utime)
*utime = t->utime;
if (stime)
*stime = t->stime;
return;
}
fetch_task_cputime(t, utime, stime, &t->utime,
&t->stime, &udelta, &sdelta);
if (utime)
*utime += udelta;
if (stime)
*stime += sdelta;
}
void task_cputime_scaled(struct task_struct *t,
cputime_t *utimescaled, cputime_t *stimescaled)
{
cputime_t udelta, sdelta;
if (!vtime_accounting_enabled()) {
if (utimescaled)
*utimescaled = t->utimescaled;
if (stimescaled)
*stimescaled = t->stimescaled;
return;
}
fetch_task_cputime(t, utimescaled, stimescaled,
&t->utimescaled, &t->stimescaled, &udelta, &sdelta);
if (utimescaled)
*utimescaled += cputime_to_scaled(udelta);
if (stimescaled)
*stimescaled += cputime_to_scaled(sdelta);
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

View File

@@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
/*
* The task might have changed its scheduling policy to something
* different than SCHED_DEADLINE (through switched_fromd_dl()).
* different than SCHED_DEADLINE (through switched_from_dl()).
*/
if (!dl_task(p)) {
__dl_clear_params(p);
@@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
pull_dl_task(rq);
lockdep_repin_lock(&rq->lock, cookie);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* pull_dl_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/

File diff suppressed because it is too large Load Diff

View File

@@ -404,6 +404,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -539,6 +540,11 @@ struct dl_rq {
#ifdef CONFIG_SMP
static inline bool sched_asym_prefer(int a, int b)
{
return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -623,6 +629,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -892,7 +899,8 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned int capacity;
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -905,6 +913,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* cpu of highest priority in group */
/*
* The CPUs this group covers.