Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - refcount conversions - Solve the rq->leaf_cfs_rq_list can of worms for real. - improve power-aware scheduling - add sysctl knob for Energy Aware Scheduling - documentation updates - misc other changes" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits) kthread: Do not use TIMER_IRQSAFE kthread: Convert worker lock to raw spinlock sched/fair: Use non-atomic cpumask_{set,clear}_cpu() sched/fair: Remove unused 'sd' parameter from select_idle_smt() sched/wait: Use freezable_schedule() when possible sched/fair: Prune, fix and simplify the nohz_balancer_kick() comment block sched/fair: Explain LLC nohz kick condition sched/fair: Simplify nohz_balancer_kick() sched/topology: Fix percpu data types in struct sd_data & struct s_data sched/fair: Simplify post_init_entity_util_avg() by calling it with a task_struct pointer argument sched/fair: Fix O(nr_cgroups) in the load balancing path sched/fair: Optimize update_blocked_averages() sched/fair: Fix insertion in rq->leaf_cfs_rq_list sched/fair: Add tmp_alone_branch assertion sched/core: Use READ_ONCE()/WRITE_ONCE() in move_queued_task()/task_rq_lock() sched/debug: Initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK sched/pelt: Skip updating util_est when utilization is higher than CPU's capacity sched/fair: Update scale invariance of PELT sched/fair: Move the rq_of() helper function sched/core: Convert task_struct.stack_refcount to refcount_t ...
2019-03-06 08:14:05 -08:00
parent 203b6609e0 ad01423aed
commit 45802da05e
29 changed files with 1169 additions and 328 deletions
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -13,6 +13,7 @@
 #include <linux/securebits.h>
 #include <linux/seqlock.h>
 #include <linux/rbtree.h>
+#include <linux/refcount.h>
 #include <linux/sched/autogroup.h>
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -86,7 +86,7 @@ enum {

 struct kthread_worker {
 	unsigned int		flags;
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	struct list_head	work_list;
 	struct list_head	delayed_work_list;
 	struct task_struct	*task;
@@ -107,7 +107,7 @@ struct kthread_delayed_work {
 };

 #define KTHREAD_WORKER_INIT(worker)	{				\
-	.lock = __SPIN_LOCK_UNLOCKED((worker).lock),			\
+	.lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock),		\
 	.work_list = LIST_HEAD_INIT((worker).work_list),		\
 	.delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\
 	}
@@ -165,9 +165,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
 #define kthread_init_delayed_work(dwork, fn)				\
 	do {								\
 		kthread_init_work(&(dwork)->work, (fn));		\
-		__init_timer(&(dwork)->timer,				\
-			     kthread_delayed_work_timer_fn,		\
-			     TIMER_IRQSAFE);				\
+		timer_setup(&(dwork)->timer,				\
+			     kthread_delayed_work_timer_fn, 0);		\
 	} while (0)

 int kthread_worker_fn(void *worker_ptr);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -21,6 +21,7 @@
 #include <linux/seccomp.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
+#include <linux/refcount.h>
 #include <linux/resource.h>
 #include <linux/latencytop.h>
 #include <linux/sched/prio.h>
@@ -356,12 +357,6 @@ struct util_est {
 * For cfs_rq, it is the aggregated load_avg of all runnable and
 * blocked sched_entities.
 *
- * load_avg may also take frequency scaling into account:
- *
- *   load_avg = runnable% * scale_load_down(load) * freq%
- *
- * where freq% is the CPU frequency normalized to the highest frequency.
- *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
@@ -370,17 +365,14 @@ struct util_est {
 * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
 * and blocked sched_entities.
 *
- * util_avg may also factor frequency scaling and CPU capacity scaling:
+ * load_avg and util_avg don't direcly factor frequency scaling and CPU
+ * capacity scaling. The scaling is done through the rq_clock_pelt that
+ * is used for computing those signals (see update_rq_clock_pelt())
 *
- *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
- *
- * where freq% is the same as above, and capacity% is the CPU capacity
- * normalized to the greatest capacity (due to uarch differences, etc).
- *
- * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
- * themselves are in the range of [0, 1]. To do fixed point arithmetics,
- * we therefore scale them to as large a range as necessary. This is for
- * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ * N.B., the above ratios (runnable% and running%) themselves are in the
+ * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
+ * to as large a range as necessary. This is for example reflected by
+ * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
@@ -607,7 +599,7 @@ struct task_struct {
 	randomized_struct_fields_start

 	void				*stack;
-	atomic_t			usage;
+	refcount_t			usage;
 	/* Per task flags (PF_*), defined further below: */
 	unsigned int			flags;
 	unsigned int			ptrace;
@@ -1187,7 +1179,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/* A live task holds one reference: */
-	atomic_t			stack_refcount;
+	refcount_t			stack_refcount;
 #endif
 #ifdef CONFIG_LIVEPATCH
 	int patch_state;
@@ -1403,7 +1395,6 @@ extern struct pid *cad_pid;
 #define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
-#define PF_MUTEX_TESTER		0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */

@@ -1753,9 +1744,9 @@ static __always_inline bool need_resched(void)
 static inline unsigned int task_cpu(const struct task_struct *p)
 {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
-	return p->cpu;
+	return READ_ONCE(p->cpu);
 #else
-	return task_thread_info(p)->cpu;
+	return READ_ONCE(task_thread_info(p)->cpu);
 #endif
 }

--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -8,13 +8,14 @@
 #include <linux/sched/jobctl.h>
 #include <linux/sched/task.h>
 #include <linux/cred.h>
+#include <linux/refcount.h>

 /*
 * Types defining task->signal and task->sighand and APIs using them:
 */

 struct sighand_struct {
-	atomic_t		count;
+	refcount_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
 	wait_queue_head_t	signalfd_wqh;
@@ -82,7 +83,7 @@ struct multiprocess_signals {
 * the locking of signal_struct.
 */
 struct signal_struct {
-	atomic_t		sigcnt;
+	refcount_t		sigcnt;
 	atomic_t		live;
 	int			nr_threads;
 	struct list_head	thread_head;
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);

+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+extern unsigned int sysctl_sched_energy_aware;
+extern int sched_energy_aware_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos);
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -88,13 +88,13 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif

-#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0)

 extern void __put_task_struct(struct task_struct *t);

 static inline void put_task_struct(struct task_struct *t)
 {
-	if (atomic_dec_and_test(&t->usage))
+	if (refcount_dec_and_test(&t->usage))
 		__put_task_struct(t);
 }

--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -61,7 +61,7 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 static inline void *try_get_task_stack(struct task_struct *tsk)
 {
-	return atomic_inc_not_zero(&tsk->stack_refcount) ?
+	return refcount_inc_not_zero(&tsk->stack_refcount) ?
 		task_stack_page(tsk) : NULL;
 }

--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -176,10 +176,10 @@ typedef int (*sched_domain_flags_f)(void);
 #define SDTL_OVERLAP	0x01

 struct sd_data {
-	struct sched_domain **__percpu sd;
-	struct sched_domain_shared **__percpu sds;
-	struct sched_group **__percpu sg;
-	struct sched_group_capacity **__percpu sgc;
+	struct sched_domain *__percpu *sd;
+	struct sched_domain_shared *__percpu *sds;
+	struct sched_group *__percpu *sg;
+	struct sched_group_capacity *__percpu *sgc;
 };

 struct sched_domain_topology_level {
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -308,7 +308,7 @@ do {										\

 #define __wait_event_freezable(wq_head, condition)				\
 	___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,		\
-			    schedule(); try_to_freeze())
+			    freezable_schedule())

 /**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
@@ -367,7 +367,7 @@ do {										\
 #define __wait_event_freezable_timeout(wq_head, condition, timeout)		\
 	___wait_event(wq_head, ___wait_cond_timeout(condition),			\
 		      TASK_INTERRUPTIBLE, 0, timeout,				\
-		      __ret = schedule_timeout(__ret); try_to_freeze())
+		      __ret = freezable_schedule_timeout(__ret))

 /*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
@@ -588,7 +588,7 @@ do {										\

 #define __wait_event_freezable_exclusive(wq, condition)				\
 	___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,			\
-			schedule(); try_to_freeze())
+			freezable_schedule())

 #define wait_event_freezable_exclusive(wq, condition)				\
 ({										\