Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: - MAINTAINERS: Add Mark Rutland as perf submaintainer, Juri Lelli and Vincent Guittot as scheduler submaintainers. Add Dietmar Eggemann, Steven Rostedt, Ben Segall and Mel Gorman as scheduler reviewers. As perf and the scheduler is getting bigger and more complex, document the status quo of current responsibilities and interests, and spread the review pain^H^H^H^H fun via an increase in the Cc: linecount generated by scripts/get_maintainer.pl. :-) - Add another series of patches that brings the -rt (PREEMPT_RT) tree closer to mainline: split the monolithic CONFIG_PREEMPT dependencies into a new CONFIG_PREEMPTION category that will allow the eventual introduction of CONFIG_PREEMPT_RT. Still a few more hundred patches to go though. - Extend the CPU cgroup controller with uclamp.min and uclamp.max to allow the finer shaping of CPU bandwidth usage. - Micro-optimize energy-aware wake-ups from O(CPUS^2) to O(CPUS). - Improve the behavior of high CPU count, high thread count applications running under cpu.cfs_quota_us constraints. - Improve balancing with SCHED_IDLE (SCHED_BATCH) tasks present. - Improve CPU isolation housekeeping CPU allocation NUMA locality. - Fix deadline scheduler bandwidth calculations and logic when cpusets rebuilds the topology, or when it gets deadline-throttled while it's being offlined. - Convert the cpuset_mutex to percpu_rwsem, to allow it to be used from setscheduler() system calls without creating global serialization. Add new synchronization between cpuset topology-changing events and the deadline acceptance tests in setscheduler(), which were broken before. - Rework the active_mm state machine to be less confusing and more optimal. - Rework (simplify) the pick_next_task() slowpath. - Improve load-balancing on AMD EPYC systems. - ... and misc cleanups, smaller fixes and improvements - please see the Git log for more details. * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (53 commits) sched/psi: Correct overly pessimistic size calculation sched/fair: Speed-up energy-aware wake-ups sched/uclamp: Always use 'enum uclamp_id' for clamp_id values sched/uclamp: Update CPU's refcount on TG's clamp changes sched/uclamp: Use TG's clamps to restrict TASK's clamps sched/uclamp: Propagate system defaults to the root group sched/uclamp: Propagate parent clamps sched/uclamp: Extend CPU's cgroup controller sched/topology: Improve load balancing on AMD EPYC systems arch, ia64: Make NUMA select SMP sched, perf: MAINTAINERS update, add submaintainers and reviewers sched/fair: Use rq_lock/unlock in online_fair_sched_group cpufreq: schedutil: fix equation in comment sched: Rework pick_next_task() slow-path sched: Allow put_prev_task() to drop rq->lock sched/fair: Expose newidle_balance() sched: Add task_struct pointer to sched_class::set_curr_task sched: Rework CPU hotplug task selection sched/{rt,deadline}: Fix set_next_task vs pick_next_task sched: Fix kerneldoc comment for ia64_set_curr_task ...
2019-09-16 17:25:49 -07:00
parent 772c1d06bd 563c4f85f9
commit 7e67a85999
60 changed files with 1274 additions and 595 deletions
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
+	struct dl_bw *dl_b;

 	later_rq = find_lock_later_rq(p, rq);
 	if (!later_rq) {
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		double_lock_balance(rq, later_rq);
 	}

+	if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+		/*
+		 * Inactive timer is armed (or callback is running, but
+		 * waiting for us to release rq locks). In any case, when it
+		 * will fire (or continue), it will see running_bw of this
+		 * task migrated to later_rq (and correctly handle it).
+		 */
+		sub_running_bw(&p->dl, &rq->dl);
+		sub_rq_bw(&p->dl, &rq->dl);
+
+		add_rq_bw(&p->dl, &later_rq->dl);
+		add_running_bw(&p->dl, &later_rq->dl);
+	} else {
+		sub_rq_bw(&p->dl, &rq->dl);
+		add_rq_bw(&p->dl, &later_rq->dl);
+	}
+
+	/*
+	 * And we finally need to fixup root_domain(s) bandwidth accounting,
+	 * since p is still hanging out in the old (now moved to default) root
+	 * domain.
+	 */
+	dl_b = &rq->rd->dl_bw;
+	raw_spin_lock(&dl_b->lock);
+	__dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+	raw_spin_unlock(&dl_b->lock);
+
+	dl_b = &later_rq->rd->dl_bw;
+	raw_spin_lock(&dl_b->lock);
+	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+	raw_spin_unlock(&dl_b->lock);
+
 	set_task_cpu(p, later_rq->cpu);
 	double_unlock_balance(later_rq, rq);

@@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
 }
 #endif

-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
 {
 	p->se.exec_start = rq_clock_task(rq);

 	/* You can't push away the running task */
 	dequeue_pushable_dl_task(rq, p);
+
+	if (hrtick_enabled(rq))
+		start_hrtick_dl(rq, p);
+
+	if (rq->curr->sched_class != &dl_sched_class)
+		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+	deadline_queue_push_tasks(rq);
 }

 static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	struct task_struct *p;
 	struct dl_rq *dl_rq;

+	WARN_ON_ONCE(prev || rf);
+
 	dl_rq = &rq->dl;

-	if (need_pull_dl_task(rq, prev)) {
-		/*
-		 * This is OK, because current is on_cpu, which avoids it being
-		 * picked for load-balance and preemption/IRQs are still
-		 * disabled avoiding further scheduler activity on it and we're
-		 * being very careful to re-start the picking loop.
-		 */
-		rq_unpin_lock(rq, rf);
-		pull_dl_task(rq);
-		rq_repin_lock(rq, rf);
-		/*
-		 * pull_dl_task() can drop (and re-acquire) rq->lock; this
-		 * means a stop task can slip in, in which case we need to
-		 * re-start task selection.
-		 */
-		if (rq->stop && task_on_rq_queued(rq->stop))
-			return RETRY_TASK;
-	}
-
-	/*
-	 * When prev is DL, we may throttle it in put_prev_task().
-	 * So, we update time before we check for dl_nr_running.
-	 */
-	if (prev->sched_class == &dl_sched_class)
-		update_curr_dl(rq);
-
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;

-	put_prev_task(rq, prev);
-
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);

 	p = dl_task_of(dl_se);

-	set_next_task(rq, p);
-
-	if (hrtick_enabled(rq))
-		start_hrtick_dl(rq, p);
-
-	deadline_queue_push_tasks(rq);
-
-	if (rq->curr->sched_class != &dl_sched_class)
-		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+	set_next_task_dl(rq, p);

 	return p;
 }

-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 {
 	update_curr_dl(rq);

 	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
+
+	if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet started the picking loop.
+		 */
+		rq_unpin_lock(rq, rf);
+		pull_dl_task(rq);
+		rq_repin_lock(rq, rf);
+	}
 }

 /*
@@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)
 	 */
 }

-static void set_curr_task_dl(struct rq *rq)
-{
-	set_next_task(rq, rq->curr);
-}
-
 #ifdef CONFIG_SMP

 /* Only try algorithms three times */
@@ -2275,6 +2289,36 @@ void __init init_sched_dl_class(void)
 					GFP_KERNEL, cpu_to_node(i));
 }

+void dl_add_task_root_domain(struct task_struct *p)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	struct dl_bw *dl_b;
+
+	rq = task_rq_lock(p, &rf);
+	if (!dl_task(p))
+		goto unlock;
+
+	dl_b = &rq->rd->dl_bw;
+	raw_spin_lock(&dl_b->lock);
+
+	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+
+	raw_spin_unlock(&dl_b->lock);
+
+unlock:
+	task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+	rd->dl_bw.total_bw = 0;
+	raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
 #endif /* CONFIG_SMP */

 static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -2395,6 +2439,7 @@ const struct sched_class dl_sched_class = {

 	.pick_next_task		= pick_next_task_dl,
 	.put_prev_task		= put_prev_task_dl,
+	.set_next_task		= set_next_task_dl,

 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_dl,
@@ -2405,7 +2450,6 @@ const struct sched_class dl_sched_class = {
 	.task_woken		= task_woken_dl,
 #endif

-	.set_curr_task		= set_curr_task_dl,
 	.task_tick		= task_tick_dl,
 	.task_fork              = task_fork_dl,