sched: Add snapshot of Window Assisted Load Tracking (WALT)
This snapshot is taken from msm-4.19 as of commit 5debecbe7195
("trace: filter out spurious preemption and IRQs disable traces").

Change-Id: I8fab4084971baadcaa037f40ab549fc073a4b1ea
Signed-off-by: Satya Durga Srinivasu Prabhala <satyap@codeaurora.org>
@@ -183,9 +183,65 @@ static struct attribute_group crash_note_cpu_attr_group = {
};
#endif

#ifdef CONFIG_SCHED_WALT
static ssize_t sched_load_boost_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	ssize_t rc;
	int boost;
	struct cpu *cpu = container_of(dev, struct cpu, dev);
	int cpuid = cpu->dev.id;

	boost = per_cpu(sched_load_boost, cpuid);
	rc = scnprintf(buf, PAGE_SIZE-2, "%d\n", boost);

	return rc;
}

static ssize_t __ref sched_load_boost_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	int err;
	int boost;
	struct cpu *cpu = container_of(dev, struct cpu, dev);
	int cpuid = cpu->dev.id;

	err = kstrtoint(strstrip((char *)buf), 0, &boost);
	if (err)
		return err;

	/*
	 * -100 is low enough to cancel out CPU's load and make it near zero.
	 * 1000 is close to the maximum value that cpu_util_freq_{walt,pelt}
	 * can take without overflow.
	 */
	if (boost < -100 || boost > 1000)
		return -EINVAL;

	per_cpu(sched_load_boost, cpuid) = boost;

	return count;
}

static DEVICE_ATTR_RW(sched_load_boost);

static struct attribute *sched_cpu_attrs[] = {
	&dev_attr_sched_load_boost.attr,
	NULL
};

static struct attribute_group sched_cpu_attr_group = {
	.attrs = sched_cpu_attrs,
};
#endif

static const struct attribute_group *common_cpu_attr_groups[] = {
#ifdef CONFIG_KEXEC
	&crash_note_cpu_attr_group,
#endif
#ifdef CONFIG_SCHED_WALT
	&sched_cpu_attr_group,
#endif
	NULL
};
@@ -193,6 +249,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = {
static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
#ifdef CONFIG_KEXEC
	&crash_note_cpu_attr_group,
#endif
#ifdef CONFIG_SCHED_WALT
	&sched_cpu_attr_group,
#endif
	NULL
};
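For illustration, a minimal user-space sketch of how the new per-CPU attribute could be exercised. The sysfs path is an assumption based on where cpu device attributes normally appear (/sys/devices/system/cpu/cpuN/); only values in the range [-100, 1000] are accepted by the store handler above.

/* Hypothetical test for the sched_load_boost attribute; the path is assumed. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/cpu0/sched_load_boost";
	FILE *f = fopen(path, "w");
	int boost = 0;

	if (!f)
		return 1;
	fprintf(f, "10\n");	/* positive values inflate CPU0's reported load */
	fclose(f);

	f = fopen(path, "r");
	if (f && fscanf(f, "%d", &boost) == 1)
		printf("cpu0 sched_load_boost = %d\n", boost);
	if (f)
		fclose(f);
	return 0;
}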
@@ -1459,6 +1459,56 @@ static const struct file_operations proc_pid_sched_operations = {
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Print out various scheduling related per-task fields:
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern int __weak sched_wake_up_idle_show(struct seq_file *m, void *v);
|
||||
extern ssize_t __weak sched_wake_up_idle_write(struct file *file,
|
||||
const char __user *buf, size_t count, loff_t *offset);
|
||||
extern int __weak sched_wake_up_idle_open(struct inode *inode,
|
||||
struct file *filp);
|
||||
|
||||
static const struct file_operations proc_pid_sched_wake_up_idle_operations = {
|
||||
.open = sched_wake_up_idle_open,
|
||||
.read = seq_read,
|
||||
.write = sched_wake_up_idle_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
extern int __weak sched_init_task_load_show(struct seq_file *m, void *v);
|
||||
extern ssize_t __weak
|
||||
sched_init_task_load_write(struct file *file, const char __user *buf,
|
||||
size_t count, loff_t *offset);
|
||||
extern int __weak
|
||||
sched_init_task_load_open(struct inode *inode, struct file *filp);
|
||||
|
||||
static const struct file_operations proc_pid_sched_init_task_load_operations = {
|
||||
.open = sched_init_task_load_open,
|
||||
.read = seq_read,
|
||||
.write = sched_init_task_load_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
extern int __weak sched_group_id_show(struct seq_file *m, void *v);
|
||||
extern ssize_t __weak
|
||||
sched_group_id_write(struct file *file, const char __user *buf,
|
||||
size_t count, loff_t *offset);
|
||||
extern int __weak sched_group_id_open(struct inode *inode, struct file *filp);
|
||||
|
||||
static const struct file_operations proc_pid_sched_group_id_operations = {
|
||||
.open = sched_group_id_open,
|
||||
.read = seq_read,
|
||||
.write = sched_group_id_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
#ifdef CONFIG_SCHED_AUTOGROUP
|
||||
/*
|
||||
* Print out autogroup related information:
|
||||
@@ -3011,6 +3061,13 @@ static const struct pid_entry tgid_base_stuff[] = {
|
||||
ONE("status", S_IRUGO, proc_pid_status),
|
||||
ONE("personality", S_IRUSR, proc_pid_personality),
|
||||
ONE("limits", S_IRUGO, proc_pid_limits),
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
REG("sched_wake_up_idle", 00644,
|
||||
proc_pid_sched_wake_up_idle_operations),
|
||||
REG("sched_init_task_load", 00644,
|
||||
proc_pid_sched_init_task_load_operations),
|
||||
REG("sched_group_id", 00666, proc_pid_sched_group_id_operations),
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
|
||||
#endif
|
||||
|
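For illustration, the three per-task proc entries registered above are plain read/write files; a hedged user-space sketch follows. The value semantics (for example that sched_init_task_load is a percentage inherited at fork) come from WALT code that is not part of this hunk, so treat them as assumptions.

/* Hypothetical use of the per-task WALT proc files added above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/sched_init_task_load", "w");
	char group_id[16] = "";

	if (f) {
		fprintf(f, "50\n");	/* assumed: initial load percent for future children */
		fclose(f);
	}

	f = fopen("/proc/self/sched_group_id", "r");
	if (f && fgets(group_id, sizeof(group_id), f))
		printf("related_thread_group id: %s", group_id);
	if (f)
		fclose(f);
	return 0;
}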
@@ -69,6 +69,9 @@ enum cpuhp_state {
|
||||
CPUHP_SLAB_PREPARE,
|
||||
CPUHP_MD_RAID5_PREPARE,
|
||||
CPUHP_RCUTREE_PREP,
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
CPUHP_CORE_CTL_ISOLATION_DEAD,
|
||||
#endif
|
||||
CPUHP_CPUIDLE_COUPLED_PREPARE,
|
||||
CPUHP_POWERPC_PMAC_PREPARE,
|
||||
CPUHP_POWERPC_MMU_CTX_PREPARE,
|
||||
|
@@ -117,6 +117,18 @@ struct task_group;
|
||||
(task->flags & PF_FROZEN) == 0 && \
|
||||
(task->state & TASK_NOLOAD) == 0)
|
||||
|
||||
/*
|
||||
* Enum for display driver to provide varying refresh rates
|
||||
*/
|
||||
enum fps {
|
||||
FPS0 = 0,
|
||||
FPS30 = 30,
|
||||
FPS48 = 48,
|
||||
FPS60 = 60,
|
||||
FPS90 = 90,
|
||||
FPS120 = 120,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
|
||||
|
||||
/*
|
||||
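For illustration, enum fps feeds sched_set_refresh_rate(), which is declared later in this header; the sketch below only shows the API shape, and the function name is hypothetical.

#include <linux/sched.h>

/* Hypothetical display-driver hook: report a panel mode switch. */
static void example_panel_switched_to_90hz(void)
{
	sched_set_refresh_rate(FPS90);	/* lets WALT pick a matching window size */
}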
@@ -212,6 +224,21 @@ struct task_group;
|
||||
/* Task command name length: */
|
||||
#define TASK_COMM_LEN 16
|
||||
|
||||
enum task_event {
|
||||
PUT_PREV_TASK = 0,
|
||||
PICK_NEXT_TASK = 1,
|
||||
TASK_WAKE = 2,
|
||||
TASK_MIGRATE = 3,
|
||||
TASK_UPDATE = 4,
|
||||
IRQ_UPDATE = 5,
|
||||
};
|
||||
|
||||
/* Note: this needs to be in sync with migrate_type_names array */
|
||||
enum migrate_types {
|
||||
GROUP_TO_RQ,
|
||||
RQ_TO_GROUP,
|
||||
};
|
||||
|
||||
extern void scheduler_tick(void);
|
||||
|
||||
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
|
||||
@@ -478,6 +505,89 @@ struct sched_entity {
|
||||
#endif
|
||||
};
|
||||
|
||||
struct cpu_cycle_counter_cb {
|
||||
u64 (*get_cpu_cycle_counter)(int cpu);
|
||||
};
|
||||
|
||||
DECLARE_PER_CPU_READ_MOSTLY(int, sched_load_boost);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern void __weak sched_exit(struct task_struct *p);
|
||||
extern int __weak
|
||||
register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb);
|
||||
extern void __weak
|
||||
sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax);
|
||||
extern void __weak free_task_load_ptrs(struct task_struct *p);
|
||||
extern void __weak sched_set_refresh_rate(enum fps fps);
|
||||
|
||||
#define RAVG_HIST_SIZE_MAX 5
|
||||
#define NUM_BUSY_BUCKETS 10
|
||||
|
||||
/* ravg represents frequency scaled cpu-demand of tasks */
|
||||
struct ravg {
|
||||
/*
|
||||
* 'mark_start' marks the beginning of an event (task waking up, task
|
||||
* starting to execute, task being preempted) within a window
|
||||
*
|
||||
* 'sum' represents how runnable a task has been within current
|
||||
* window. It incorporates both running time and wait time and is
|
||||
* frequency scaled.
|
||||
*
|
||||
* 'sum_history' keeps track of history of 'sum' seen over previous
|
||||
* RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
|
||||
* ignored.
|
||||
*
|
||||
* 'demand' represents maximum sum seen over previous
|
||||
* sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
|
||||
* demand for tasks.
|
||||
*
|
||||
* 'curr_window_cpu' represents task's contribution to cpu busy time on
|
||||
* various CPUs in the current window
|
||||
*
|
||||
* 'prev_window_cpu' represents task's contribution to cpu busy time on
|
||||
* various CPUs in the previous window
|
||||
*
|
||||
* 'curr_window' represents the sum of all entries in curr_window_cpu
|
||||
*
|
||||
* 'prev_window' represents the sum of all entries in prev_window_cpu
|
||||
*
|
||||
* 'pred_demand' represents task's current predicted cpu busy time
|
||||
*
|
||||
* 'busy_buckets' groups historical busy time into different buckets
|
||||
* used for prediction
|
||||
*
|
||||
* 'demand_scaled' represents task's demand scaled to 1024
|
||||
*/
|
||||
u64 mark_start;
|
||||
u32 sum, demand;
|
||||
u32 coloc_demand;
|
||||
u32 sum_history[RAVG_HIST_SIZE_MAX];
|
||||
u32 *curr_window_cpu, *prev_window_cpu;
|
||||
u32 curr_window, prev_window;
|
||||
u32 pred_demand;
|
||||
u8 busy_buckets[NUM_BUSY_BUCKETS];
|
||||
u16 demand_scaled;
|
||||
u16 pred_demand_scaled;
|
||||
u64 active_time;
|
||||
u64 last_win_size;
|
||||
};
|
||||
#else
|
||||
static inline void sched_exit(struct task_struct *p) { }
|
||||
|
||||
static inline int
|
||||
register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void free_task_load_ptrs(struct task_struct *p) { }
|
||||
|
||||
static inline void sched_update_cpu_freq_min_max(const cpumask_t *cpus,
|
||||
u32 fmin, u32 fmax) { }
|
||||
|
||||
static inline void sched_set_refresh_rate(enum fps fps) { }
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
struct sched_rt_entity {
|
||||
struct list_head run_list;
|
||||
unsigned long timeout;
|
||||
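For illustration, the comment block above defines 'demand' in terms of sum_history; the standalone helper below reproduces that relationship for the "max" window-stats policy. It is an illustration only; the real accounting lives in walt.c, which is not shown in this hunk.

#include <linux/types.h>

/* Illustration: demand as the maximum of the recent per-window sums. */
static u32 example_demand_from_history(const u32 *sum_history, int hist_size)
{
	u32 demand = 0;
	int i;

	for (i = 0; i < hist_size; i++)
		if (sum_history[i] > demand)
			demand = sum_history[i];
	return demand;
}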
@@ -675,6 +785,20 @@ struct task_struct {
|
||||
const struct sched_class *sched_class;
|
||||
struct sched_entity se;
|
||||
struct sched_rt_entity rt;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
u64 last_sleep_ts;
|
||||
bool wake_up_idle;
|
||||
struct ravg ravg;
|
||||
u32 init_load_pct;
|
||||
u64 last_wake_ts;
|
||||
u64 last_enqueued_ts;
|
||||
struct related_thread_group *grp;
|
||||
struct list_head grp_list;
|
||||
u64 cpu_cycles;
|
||||
bool misfit;
|
||||
u8 unfilter;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
struct task_group *sched_task_group;
|
||||
#endif
|
||||
@@ -2000,4 +2124,37 @@ int sched_trace_rq_cpu(struct rq *rq);
|
||||
|
||||
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
#define PF_WAKE_UP_IDLE 1
|
||||
static inline u32 sched_get_wake_up_idle(struct task_struct *p)
|
||||
{
|
||||
return p->wake_up_idle;
|
||||
}
|
||||
|
||||
static inline int sched_set_wake_up_idle(struct task_struct *p,
|
||||
int wake_up_idle)
|
||||
{
|
||||
p->wake_up_idle = !!wake_up_idle;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void set_wake_up_idle(bool enabled)
|
||||
{
|
||||
current->wake_up_idle = enabled;
|
||||
}
|
||||
#else
|
||||
static inline u32 sched_get_wake_up_idle(struct task_struct *p)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int sched_set_wake_up_idle(struct task_struct *p,
|
||||
int wake_up_idle)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void set_wake_up_idle(bool enabled) {}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
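For illustration, a sketch of how the wake-up-idle hint is meant to be toggled from kernel code; the placement behaviour it triggers is implemented in the WALT core, so the comments only state the intent and the function name is hypothetical.

#include <linux/sched.h>

/* Hypothetical driver path marking a latency-sensitive phase. */
static void example_latency_sensitive_phase(void)
{
	set_wake_up_idle(true);		/* hint: wakeups may target idle CPUs */
	/* ... time-critical work ... */
	set_wake_up_idle(false);
}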
include/linux/sched/core_ctl.h (new file, 31 lines)
@@ -0,0 +1,31 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2016, 2019, The Linux Foundation. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef __CORE_CTL_H
|
||||
#define __CORE_CTL_H
|
||||
|
||||
#define MAX_CPUS_PER_CLUSTER 6
|
||||
#define MAX_CLUSTERS 3
|
||||
|
||||
struct core_ctl_notif_data {
|
||||
unsigned int nr_big;
|
||||
unsigned int coloc_load_pct;
|
||||
unsigned int ta_util_pct[MAX_CLUSTERS];
|
||||
unsigned int cur_cap_pct[MAX_CLUSTERS];
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern int __weak core_ctl_set_boost(bool boost);
|
||||
extern void __weak core_ctl_notifier_register(struct notifier_block *n);
|
||||
extern void __weak core_ctl_notifier_unregister(struct notifier_block *n);
|
||||
#else
|
||||
static inline int core_ctl_set_boost(bool boost)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void core_ctl_notifier_register(struct notifier_block *n) {}
|
||||
static inline void core_ctl_notifier_unregister(struct notifier_block *n) {}
|
||||
#endif
|
||||
#endif
|
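For illustration, a sketch of a client of the notifier interface declared in this new header. The callback and its names are hypothetical; only the payload type (core_ctl_notif_data) and the register call come from the header above.

#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/sched/core_ctl.h>

static int example_core_ctl_cb(struct notifier_block *nb,
			       unsigned long event, void *data)
{
	struct core_ctl_notif_data *d = data;

	pr_debug("core_ctl: nr_big=%u coloc_load_pct=%u\n",
		 d->nr_big, d->coloc_load_pct);
	return NOTIFY_OK;
}

static struct notifier_block example_core_ctl_nb = {
	.notifier_call = example_core_ctl_cb,
};

static void example_core_ctl_register(void)
{
	core_ctl_notifier_register(&example_core_ctl_nb);
}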
@@ -10,6 +10,11 @@
|
||||
|
||||
#define SCHED_CPUFREQ_IOWAIT (1U << 0)
|
||||
#define SCHED_CPUFREQ_MIGRATION (1U << 1)
|
||||
#define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3)
|
||||
#define SCHED_CPUFREQ_WALT (1U << 4)
|
||||
#define SCHED_CPUFREQ_PL (1U << 5)
|
||||
#define SCHED_CPUFREQ_EARLY_DET (1U << 6)
|
||||
#define SCHED_CPUFREQ_CONTINUE (1U << 8)
|
||||
|
||||
#ifdef CONFIG_CPU_FREQ
|
||||
struct update_util_data {
|
||||
|
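For illustration, the new SCHED_CPUFREQ_* bits reach whichever governor hooked cpufreq_update_util(); the callback below only sketches how such a hook might interpret them and is not the actual schedutil/WALT governor code (the signature follows the update_util_data hook declared just above).

static void example_util_hook(struct update_util_data *data,
			      u64 time, unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_CONTINUE)
		return;			/* keep the previous frequency decision */

	if (flags & (SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_PL |
		     SCHED_CPUFREQ_EARLY_DET)) {
		/*
		 * Window rollover, predicted load or early detection:
		 * re-evaluate the target frequency immediately.
		 */
	}
}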
@@ -21,6 +21,28 @@ extern bool single_task_running(void);
|
||||
extern unsigned long nr_iowait(void);
|
||||
extern unsigned long nr_iowait_cpu(int cpu);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern void __weak sched_update_nr_prod(int cpu, long delta, bool inc);
|
||||
extern unsigned int __weak sched_get_cpu_util(int cpu);
|
||||
extern void __weak sched_update_hyst_times(void);
|
||||
extern u64 __weak sched_lpm_disallowed_time(int cpu);
|
||||
#else
|
||||
static inline void sched_update_nr_prod(int cpu, long delta, bool inc) {}
|
||||
static inline unsigned int sched_get_cpu_util(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline u64 sched_get_cpu_last_busy_time(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void sched_update_hyst_times(void) {}
|
||||
static inline u64 sched_lpm_disallowed_time(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int sched_info_on(void)
|
||||
{
|
||||
#ifdef CONFIG_SCHEDSTATS
|
||||
|
@@ -24,6 +24,42 @@ extern unsigned int sysctl_sched_latency;
|
||||
extern unsigned int sysctl_sched_min_granularity;
|
||||
extern unsigned int sysctl_sched_wakeup_granularity;
|
||||
extern unsigned int sysctl_sched_child_runs_first;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern unsigned int __weak sysctl_sched_user_hint;
|
||||
extern const int __weak sched_user_hint_max;
|
||||
extern unsigned int __weak sysctl_sched_cpu_high_irqload;
|
||||
extern unsigned int __weak sysctl_sched_boost;
|
||||
extern unsigned int __weak sysctl_sched_group_upmigrate_pct;
|
||||
extern unsigned int __weak sysctl_sched_group_downmigrate_pct;
|
||||
extern unsigned int __weak sysctl_sched_conservative_pl;
|
||||
extern unsigned int __weak sysctl_sched_walt_rotate_big_tasks;
|
||||
extern unsigned int __weak sysctl_sched_min_task_util_for_boost;
|
||||
extern unsigned int __weak sysctl_sched_min_task_util_for_colocation;
|
||||
extern unsigned int __weak sysctl_sched_asym_cap_sibling_freq_match_pct;
|
||||
extern unsigned int __weak sysctl_sched_coloc_downmigrate_ns;
|
||||
extern unsigned int __weak sysctl_sched_task_unfilter_nr_windows;
|
||||
extern unsigned int __weak sysctl_sched_busy_hyst_enable_cpus;
|
||||
extern unsigned int __weak sysctl_sched_busy_hyst;
|
||||
extern unsigned int __weak sysctl_sched_coloc_busy_hyst_enable_cpus;
|
||||
extern unsigned int __weak sysctl_sched_coloc_busy_hyst;
|
||||
extern unsigned int __weak sysctl_sched_coloc_busy_hyst_max_ms;
|
||||
extern unsigned int __weak sysctl_sched_window_stats_policy;
|
||||
extern unsigned int __weak sysctl_sched_ravg_window_nr_ticks;
|
||||
|
||||
extern int __weak
|
||||
walt_proc_group_thresholds_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos);
|
||||
extern int __weak
|
||||
walt_proc_user_hint_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos);
|
||||
|
||||
extern int __weak
|
||||
sched_ravg_window_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos);
|
||||
#endif
|
||||
|
||||
enum sched_tunable_scaling {
|
||||
SCHED_TUNABLESCALING_NONE,
|
||||
@@ -47,6 +83,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
|
||||
loff_t *ppos);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern int __weak sched_boost_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos);
|
||||
#endif
|
||||
/*
|
||||
* control realtime throttling:
|
||||
*
|
||||
|
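For illustration, these __weak knobs and handlers are normally wired up through kernel/sysctl.c, which is not part of the hunks shown here; the table entry below is only an example of the expected shape.

#include <linux/sysctl.h>

/* Illustrative ctl_table entry for one of the WALT sysctls above. */
static struct ctl_table example_walt_table[] = {
	{
		.procname	= "sched_boost",
		.data		= &sysctl_sched_boost,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= sched_boost_handler,
	},
	{ }
};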
@@ -73,6 +73,9 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
|
||||
extern int proc_do_static_key(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos);
|
||||
extern int proc_douintvec_ravg_window(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos);
|
||||
|
||||
/*
|
||||
* Register a set of sysctl names by calling register_sysctl_table
|
||||
|
@@ -470,6 +470,15 @@ config HAVE_SCHED_AVG_IRQ
|
||||
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
|
||||
depends on SMP
|
||||
|
||||
config SCHED_WALT
|
||||
bool "Support window based load tracking"
|
||||
depends on SMP
|
||||
help
|
||||
This feature will allow the scheduler to maintain a tunable window
|
||||
based set of metrics for tasks and runqueues. These metrics can be
|
||||
used to guide task placement as well as task frequency requirements
|
||||
for cpufreq governors.
|
||||
|
||||
config BSD_PROCESS_ACCT
|
||||
bool "BSD Process Accounting"
|
||||
depends on MULTIUSER
|
||||
|
@@ -91,6 +91,9 @@ struct task_struct init_task
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
.sched_task_group = &root_task_group,
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
.wake_up_idle = false,
|
||||
#endif
|
||||
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
|
||||
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
|
||||
|
@@ -761,6 +761,7 @@ void __noreturn do_exit(long code)
|
||||
}
|
||||
|
||||
exit_signals(tsk); /* sets PF_EXITING */
|
||||
sched_exit(tsk);
|
||||
/*
|
||||
* Ensure that all new tsk->pi_lock acquisitions must observe
|
||||
* PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
|
||||
|
@@ -2283,6 +2283,7 @@ bad_fork_cleanup_perf:
|
||||
perf_event_free_task(p);
|
||||
bad_fork_cleanup_policy:
|
||||
lockdep_free_task(p);
|
||||
free_task_load_ptrs(p);
|
||||
#ifdef CONFIG_NUMA
|
||||
mpol_put(p->mempolicy);
|
||||
bad_fork_cleanup_threadgroup_lock:
|
||||
|
@@ -19,6 +19,7 @@
|
||||
#include "../smpboot.h"
|
||||
|
||||
#include "pelt.h"
|
||||
#include "walt.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/sched.h>
|
||||
@@ -1298,6 +1299,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
uclamp_rq_inc(rq, p);
|
||||
p->sched_class->enqueue_task(rq, p, flags);
|
||||
walt_update_last_enqueue(p);
|
||||
}
|
||||
|
||||
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
@@ -1312,6 +1314,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
uclamp_rq_dec(rq, p);
|
||||
p->sched_class->dequeue_task(rq, p, flags);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (p == rq->ed_task)
|
||||
early_detection_notify(rq, sched_ktime_clock());
|
||||
#endif
|
||||
}
|
||||
|
||||
void activate_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
@@ -1331,6 +1337,11 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
if (task_contributes_to_load(p))
|
||||
rq->nr_uninterruptible++;
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (flags & DEQUEUE_SLEEP)
|
||||
clear_ed_task(p, rq);
|
||||
#endif
|
||||
|
||||
dequeue_task(rq, p, flags);
|
||||
}
|
||||
|
||||
@@ -1492,8 +1503,11 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
|
||||
|
||||
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
|
||||
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
|
||||
double_lock_balance(rq, cpu_rq(new_cpu));
|
||||
if (!(rq->clock_update_flags & RQCF_UPDATED))
|
||||
update_rq_clock(rq);
|
||||
set_task_cpu(p, new_cpu);
|
||||
rq_unlock(rq, rf);
|
||||
double_rq_unlock(cpu_rq(new_cpu), rq);
|
||||
|
||||
rq = cpu_rq(new_cpu);
|
||||
|
||||
@@ -1750,12 +1764,13 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
|
||||
p->se.nr_migrations++;
|
||||
rseq_migrate(p);
|
||||
perf_event_task_migrate(p);
|
||||
|
||||
fixup_busy_time(p, new_cpu);
|
||||
}
|
||||
|
||||
__set_task_cpu(p, new_cpu);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static void __migrate_swap_task(struct task_struct *p, int cpu)
|
||||
{
|
||||
if (task_on_rq_queued(p)) {
|
||||
@@ -1870,7 +1885,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
/*
|
||||
* wait_task_inactive - wait for a thread to unschedule.
|
||||
@@ -2616,6 +2630,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||
*/
|
||||
smp_cond_load_acquire(&p->on_cpu, !VAL);
|
||||
|
||||
walt_try_to_wake_up(p);
|
||||
|
||||
p->sched_contributes_to_load = !!task_contributes_to_load(p);
|
||||
p->state = TASK_WAKING;
|
||||
|
||||
@@ -2644,6 +2660,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||
unlock:
|
||||
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
|
||||
out:
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (success && sched_predl) {
|
||||
raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
|
||||
if (do_pl_notif(cpu_rq(cpu)))
|
||||
cpufreq_update_util(cpu_rq(cpu),
|
||||
SCHED_CPUFREQ_WALT |
|
||||
SCHED_CPUFREQ_PL);
|
||||
raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (success)
|
||||
ttwu_stat(p, cpu, wake_flags);
|
||||
preempt_enable();
|
||||
@@ -2689,6 +2716,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
p->se.prev_sum_exec_runtime = 0;
|
||||
p->se.nr_migrations = 0;
|
||||
p->se.vruntime = 0;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
p->last_sleep_ts = 0;
|
||||
p->wake_up_idle = false;
|
||||
#endif
|
||||
INIT_LIST_HEAD(&p->se.group_node);
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
@@ -2840,6 +2871,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
init_new_task_load(p);
|
||||
__sched_fork(clone_flags, p);
|
||||
/*
|
||||
* We mark the process as NEW here. This guarantees that
|
||||
@@ -2945,7 +2977,9 @@ void wake_up_new_task(struct task_struct *p)
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
|
||||
add_new_task_to_grp(p);
|
||||
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
|
||||
|
||||
p->state = TASK_RUNNING;
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
@@ -2963,6 +2997,7 @@ void wake_up_new_task(struct task_struct *p)
|
||||
update_rq_clock(rq);
|
||||
post_init_entity_util_avg(p);
|
||||
|
||||
mark_task_starting(p);
|
||||
activate_task(rq, p, ENQUEUE_NOCLOCK);
|
||||
trace_sched_wakeup_new(p);
|
||||
check_preempt_curr(rq, p, WF_FORK);
|
||||
@@ -3497,6 +3532,9 @@ void sched_exec(void)
|
||||
unsigned long flags;
|
||||
int dest_cpu;
|
||||
|
||||
if (sched_energy_enabled())
|
||||
return;
|
||||
|
||||
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
||||
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
|
||||
if (dest_cpu == smp_processor_id())
|
||||
@@ -3592,16 +3630,30 @@ void scheduler_tick(void)
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct task_struct *curr = rq->curr;
|
||||
struct rq_flags rf;
|
||||
u64 wallclock;
|
||||
bool early_notif;
|
||||
u32 old_load;
|
||||
struct related_thread_group *grp;
|
||||
unsigned int flag = 0;
|
||||
|
||||
sched_clock_tick();
|
||||
|
||||
rq_lock(rq, &rf);
|
||||
|
||||
old_load = task_load(curr);
|
||||
set_window_start(rq);
|
||||
wallclock = sched_ktime_clock();
|
||||
walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
|
||||
update_rq_clock(rq);
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
calc_global_load_tick(rq);
|
||||
psi_task_tick(rq);
|
||||
|
||||
early_notif = early_detection_notify(rq, wallclock);
|
||||
if (early_notif)
|
||||
flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET;
|
||||
|
||||
cpufreq_update_util(rq, flag);
|
||||
rq_unlock(rq, &rf);
|
||||
|
||||
perf_event_task_tick();
|
||||
@@ -3610,6 +3662,15 @@ void scheduler_tick(void)
|
||||
rq->idle_balance = idle_cpu(cpu);
|
||||
trigger_load_balance(rq);
|
||||
#endif
|
||||
|
||||
rcu_read_lock();
|
||||
grp = task_related_thread_group(curr);
|
||||
if (update_preferred_cluster(grp, curr, old_load, true))
|
||||
set_preferred_cluster(grp);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (curr->sched_class == &fair_sched_class)
|
||||
check_for_migration(rq, curr);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
@@ -4005,6 +4066,7 @@ static void __sched notrace __schedule(bool preempt)
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
int cpu;
|
||||
u64 wallclock;
|
||||
|
||||
cpu = smp_processor_id();
|
||||
rq = cpu_rq(cpu);
|
||||
@@ -4052,7 +4114,15 @@ static void __sched notrace __schedule(bool preempt)
|
||||
clear_tsk_need_resched(prev);
|
||||
clear_preempt_need_resched();
|
||||
|
||||
wallclock = sched_ktime_clock();
|
||||
if (likely(prev != next)) {
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (!prev->on_rq)
|
||||
prev->last_sleep_ts = wallclock;
|
||||
#endif
|
||||
|
||||
walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
|
||||
walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
|
||||
rq->nr_switches++;
|
||||
/*
|
||||
* RCU users of rcu_dereference(rq->curr) may not see
|
||||
@@ -4080,6 +4150,7 @@ static void __sched notrace __schedule(bool preempt)
|
||||
/* Also unlocks the rq: */
|
||||
rq = context_switch(rq, prev, next, &rf);
|
||||
} else {
|
||||
walt_update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
|
||||
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
|
||||
rq_unlock_irq(rq, &rf);
|
||||
}
|
||||
@@ -4669,7 +4740,7 @@ struct task_struct *idle_task(int cpu)
|
||||
*
|
||||
* The task of @pid, if found. %NULL otherwise.
|
||||
*/
|
||||
static struct task_struct *find_process_by_pid(pid_t pid)
|
||||
struct task_struct *find_process_by_pid(pid_t pid)
|
||||
{
|
||||
return pid ? find_task_by_vpid(pid) : current;
|
||||
}
|
||||
@@ -6253,7 +6324,7 @@ void idle_task_exit(void)
|
||||
*
|
||||
* Also see the comment "Global load-average calculations".
|
||||
*/
|
||||
static void calc_load_migrate(struct rq *rq)
|
||||
void calc_load_migrate(struct rq *rq)
|
||||
{
|
||||
long delta = calc_load_fold_active(rq, 1);
|
||||
if (delta)
|
||||
@@ -6285,7 +6356,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
|
||||
* there's no concurrency possible, we hold the required locks anyway
|
||||
* because of lock validation efforts.
|
||||
*/
|
||||
static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
|
||||
void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
|
||||
{
|
||||
struct rq *rq = dead_rq;
|
||||
struct task_struct *next, *stop = rq->stop;
|
||||
@@ -6512,6 +6583,11 @@ int sched_cpu_deactivate(unsigned int cpu)
|
||||
static void sched_rq_cpu_starting(unsigned int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
set_window_start(rq);
|
||||
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
||||
|
||||
rq->calc_load_update = calc_load_update;
|
||||
update_max_interval();
|
||||
@@ -6521,6 +6597,7 @@ int sched_cpu_starting(unsigned int cpu)
|
||||
{
|
||||
sched_rq_cpu_starting(cpu);
|
||||
sched_tick_start(cpu);
|
||||
clear_walt_request(cpu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -6535,6 +6612,7 @@ int sched_cpu_dying(unsigned int cpu)
|
||||
sched_tick_stop(cpu);
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
|
||||
if (rq->rd) {
|
||||
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
||||
set_rq_offline(rq);
|
||||
@@ -6543,6 +6621,8 @@ int sched_cpu_dying(unsigned int cpu)
|
||||
BUG_ON(rq->nr_running != 1);
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
|
||||
clear_walt_request(cpu);
|
||||
|
||||
calc_load_migrate(rq);
|
||||
update_max_interval();
|
||||
nohz_balance_exit_idle(rq);
|
||||
@@ -6564,6 +6644,8 @@ void __init sched_init_smp(void)
|
||||
sched_init_domains(cpu_active_mask);
|
||||
mutex_unlock(&sched_domains_mutex);
|
||||
|
||||
update_cluster_topology();
|
||||
|
||||
/* Move init over to a non-isolated CPU */
|
||||
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
|
||||
BUG();
|
||||
@@ -6618,6 +6700,8 @@ void __init sched_init(void)
|
||||
|
||||
wait_bit_init();
|
||||
|
||||
init_clusters();
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
ptr += 2 * nr_cpu_ids * sizeof(void **);
|
||||
#endif
|
||||
@@ -6729,6 +6813,7 @@ void __init sched_init(void)
|
||||
rq->idle_stamp = 0;
|
||||
rq->avg_idle = 2*sysctl_sched_migration_cost;
|
||||
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
|
||||
walt_sched_init_rq(rq);
|
||||
|
||||
INIT_LIST_HEAD(&rq->cfs_tasks);
|
||||
|
||||
@@ -6743,6 +6828,8 @@ void __init sched_init(void)
|
||||
atomic_set(&rq->nr_iowait, 0);
|
||||
}
|
||||
|
||||
BUG_ON(alloc_related_thread_groups());
|
||||
|
||||
set_load_weight(&init_task, false);
|
||||
|
||||
/*
|
||||
@@ -6758,6 +6845,7 @@ void __init sched_init(void)
|
||||
* when this runqueue becomes "idle".
|
||||
*/
|
||||
init_idle(current, smp_processor_id());
|
||||
init_new_task_load(current);
|
||||
|
||||
calc_load_update = jiffies + LOAD_FREQ;
|
||||
|
||||
@@ -6972,6 +7060,97 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
|
||||
/* task_group_lock serializes the addition/removal of task groups */
|
||||
static DEFINE_SPINLOCK(task_group_lock);
|
||||
|
||||
#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_UCLAMP_TASK_GROUP)
|
||||
static inline void walt_init_sched_boost(struct task_group *tg)
|
||||
{
|
||||
tg->sched_boost_no_override = false;
|
||||
tg->sched_boost_enabled = true;
|
||||
tg->colocate = false;
|
||||
tg->colocate_update_disabled = false;
|
||||
}
|
||||
|
||||
void update_cgroup_boost_settings(void)
|
||||
{
|
||||
struct task_group *tg;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(tg, &task_groups, list) {
|
||||
if (tg->sched_boost_no_override)
|
||||
continue;
|
||||
|
||||
tg->sched_boost_enabled = false;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
void restore_cgroup_boost_settings(void)
|
||||
{
|
||||
struct task_group *tg;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(tg, &task_groups, list)
|
||||
tg->sched_boost_enabled = true;
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void walt_schedgp_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
struct task_struct *task;
|
||||
struct cgroup_subsys_state *css;
|
||||
struct task_group *tg;
bool colocate;
|
||||
|
||||
cgroup_taskset_first(tset, &css);
|
||||
tg = css_tg(css);
|
||||
|
||||
colocate = tg->colocate;
|
||||
|
||||
cgroup_taskset_for_each(task, css, tset)
|
||||
sync_cgroup_colocation(task, colocate);
|
||||
}
|
||||
|
||||
static u64
|
||||
sched_boost_override_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
|
||||
return (u64) tg->sched_boost_no_override;
|
||||
}
|
||||
|
||||
static int sched_boost_override_write(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, u64 override)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
|
||||
tg->sched_boost_no_override = !!override;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u64 sched_colocate_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
|
||||
return (u64) tg->colocate;
|
||||
}
|
||||
|
||||
static int sched_colocate_write(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, u64 colocate)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
|
||||
if (tg->colocate_update_disabled)
|
||||
return -EPERM;
|
||||
|
||||
tg->colocate = !!colocate;
|
||||
tg->colocate_update_disabled = true;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline void walt_init_sched_boost(struct task_group *tg) { }
|
||||
static void walt_schedgp_attach(struct cgroup_taskset *tset) { }
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
static inline void alloc_uclamp_sched_group(struct task_group *tg,
|
||||
struct task_group *parent)
|
||||
{
|
||||
@@ -7139,6 +7318,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
if (IS_ERR(tg))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
walt_init_sched_boost(tg);
|
||||
return &tg->css;
|
||||
}
|
||||
|
||||
@@ -7225,6 +7405,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
|
||||
|
||||
cgroup_taskset_for_each(task, css, tset)
|
||||
sched_move_task(task);
|
||||
|
||||
walt_schedgp_attach(tset);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UCLAMP_TASK_GROUP
|
||||
@@ -7784,7 +7966,21 @@ static struct cftype cpu_legacy_files[] = {
|
||||
.read_u64 = cpu_uclamp_ls_read_u64,
|
||||
.write_u64 = cpu_uclamp_ls_write_u64,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
{
|
||||
.name = "uclamp.sched_boost_no_override",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.read_u64 = sched_boost_override_read,
|
||||
.write_u64 = sched_boost_override_write,
|
||||
},
|
||||
{
|
||||
.name = "uclamp.colocate",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.read_u64 = sched_colocate_read,
|
||||
.write_u64 = sched_colocate_write,
|
||||
},
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
#endif /* CONFIG_UCLAMP_TASK_GROUP */
|
||||
{ } /* Terminate */
|
||||
};
|
||||
|
||||
@@ -7971,7 +8167,21 @@ static struct cftype cpu_files[] = {
|
||||
.read_u64 = cpu_uclamp_ls_read_u64,
|
||||
.write_u64 = cpu_uclamp_ls_write_u64,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
{
|
||||
.name = "uclamp.sched_boost_no_override",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.read_u64 = sched_boost_override_read,
|
||||
.write_u64 = sched_boost_override_write,
|
||||
},
|
||||
{
|
||||
.name = "uclamp.colocate",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.read_u64 = sched_colocate_read,
|
||||
.write_u64 = sched_colocate_write,
|
||||
},
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
#endif /* CONFIG_UCLAMP_TASK_GROUP */
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
@@ -8040,3 +8250,57 @@ const u32 sched_prio_to_wmult[40] = {
|
||||
};
|
||||
|
||||
#undef CREATE_TRACE_POINTS
|
||||
|
||||
__read_mostly bool sched_predl = 1;
|
||||
|
||||
void enqueue_task_core(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
enqueue_task(rq, p, 0);
|
||||
}
|
||||
|
||||
void dequeue_task_core(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
dequeue_task(rq, p, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
void sched_account_irqtime(int cpu, struct task_struct *curr,
|
||||
u64 delta, u64 wallclock)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned long flags, nr_windows;
|
||||
u64 cur_jiffies_ts;
|
||||
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
|
||||
/*
|
||||
* cputime (wallclock) uses sched_clock so use the same here for
|
||||
* consistency.
|
||||
*/
|
||||
delta += sched_clock() - wallclock;
|
||||
cur_jiffies_ts = get_jiffies_64();
|
||||
|
||||
if (is_idle_task(curr))
|
||||
walt_update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
|
||||
delta);
|
||||
|
||||
nr_windows = cur_jiffies_ts - rq->irqload_ts;
|
||||
|
||||
if (nr_windows) {
|
||||
if (nr_windows < 10) {
|
||||
/* Decay CPU's irqload by 3/4 for each window. */
|
||||
rq->avg_irqload *= (3 * nr_windows);
|
||||
rq->avg_irqload = div64_u64(rq->avg_irqload,
|
||||
4 * nr_windows);
|
||||
} else {
|
||||
rq->avg_irqload = 0;
|
||||
}
|
||||
rq->avg_irqload += rq->cur_irqload;
|
||||
rq->cur_irqload = 0;
|
||||
}
|
||||
|
||||
rq->cur_irqload += delta;
|
||||
rq->irqload_ts = cur_jiffies_ts;
|
||||
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
||||
}
|
||||
#endif
|
||||
|
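For illustration, a worked example of the irqload decay above. Note that for nr_windows < 10 the (3 * nr_windows) / (4 * nr_windows) scaling reduces to a single 3/4 factor however many windows elapsed; the standalone program below just reproduces that arithmetic.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t avg_irqload = 4000, cur_irqload = 1000;
	uint64_t nr_windows = 3;	/* jiffies since the last irq update */

	if (nr_windows) {
		if (nr_windows < 10) {
			avg_irqload *= 3 * nr_windows;
			avg_irqload /= 4 * nr_windows;	/* 4000 -> 3000 */
		} else {
			avg_irqload = 0;
		}
		avg_irqload += cur_irqload;		/* 3000 + 1000 = 4000 */
		cur_irqload = 0;
	}
	printf("avg_irqload = %llu\n", (unsigned long long)avg_irqload);
	return 0;
}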
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
#include <linux/cpufreq_times.h>
|
||||
#include "sched.h"
|
||||
#include "walt.h"
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
|
||||
@@ -53,11 +54,18 @@ void irqtime_account_irq(struct task_struct *curr)
|
||||
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
|
||||
s64 delta;
|
||||
int cpu;
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
u64 wallclock;
|
||||
bool account = true;
|
||||
#endif
|
||||
|
||||
if (!sched_clock_irqtime)
|
||||
return;
|
||||
|
||||
cpu = smp_processor_id();
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
wallclock = sched_clock_cpu(cpu);
|
||||
#endif
|
||||
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
|
||||
irqtime->irq_start_time += delta;
|
||||
|
||||
@@ -71,6 +79,15 @@ void irqtime_account_irq(struct task_struct *curr)
|
||||
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
|
||||
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
|
||||
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
else
|
||||
account = false;
|
||||
|
||||
if (account)
|
||||
sched_account_irqtime(cpu, curr, delta, wallclock);
|
||||
else if (curr != this_cpu_ksoftirqd())
|
||||
sched_account_irqstart(cpu, curr, wallclock);
|
||||
#endif
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irqtime_account_irq);
|
||||
|
||||
|
@@ -17,6 +17,7 @@
|
||||
*/
|
||||
#include "sched.h"
|
||||
#include "pelt.h"
|
||||
#include "walt.h"
|
||||
|
||||
struct dl_bandwidth def_dl_bandwidth;
|
||||
|
||||
@@ -1380,6 +1381,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
WARN_ON(!dl_prio(prio));
|
||||
dl_rq->dl_nr_running++;
|
||||
add_nr_running(rq_of_dl_rq(dl_rq), 1);
|
||||
walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
|
||||
|
||||
inc_dl_deadline(dl_rq, deadline);
|
||||
inc_dl_migration(dl_se, dl_rq);
|
||||
@@ -1394,6 +1396,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
|
||||
WARN_ON(!dl_rq->dl_nr_running);
|
||||
dl_rq->dl_nr_running--;
|
||||
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
|
||||
walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
|
||||
|
||||
dec_dl_deadline(dl_rq, dl_se->deadline);
|
||||
dec_dl_migration(dl_se, dl_rq);
|
||||
@@ -2101,7 +2104,9 @@ retry:
|
||||
}
|
||||
|
||||
deactivate_task(rq, next_task, 0);
|
||||
next_task->on_rq = TASK_ON_RQ_MIGRATING;
|
||||
set_task_cpu(next_task, later_rq->cpu);
|
||||
next_task->on_rq = TASK_ON_RQ_QUEUED;
|
||||
|
||||
/*
|
||||
* Update the later_rq clock here, because the clock is used
|
||||
@@ -2195,7 +2200,9 @@ static void pull_dl_task(struct rq *this_rq)
|
||||
resched = true;
|
||||
|
||||
deactivate_task(src_rq, p, 0);
|
||||
p->on_rq = TASK_ON_RQ_MIGRATING;
|
||||
set_task_cpu(p, this_cpu);
|
||||
p->on_rq = TASK_ON_RQ_QUEUED;
|
||||
activate_task(this_rq, p, 0);
|
||||
dmin = p->dl.deadline;
|
||||
|
||||
@@ -2458,6 +2465,9 @@ const struct sched_class dl_sched_class = {
|
||||
.switched_to = switched_to_dl,
|
||||
|
||||
.update_curr = update_curr_dl,
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
.fixup_walt_sched_stats = fixup_walt_sched_stats_common,
|
||||
#endif
|
||||
};
|
||||
|
||||
int sched_dl_global_validate(void)
|
||||
|
@@ -646,6 +646,19 @@ do { \
|
||||
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
|
||||
PN(clock);
|
||||
PN(clock_task);
|
||||
#ifdef CONFIG_SMP
|
||||
P(cpu_capacity);
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
P(cluster->max_possible_capacity);
|
||||
P(cluster->efficiency);
|
||||
P(cluster->cur_freq);
|
||||
P(cluster->max_freq);
|
||||
P(cluster->exec_scale_factor);
|
||||
P(walt_stats.nr_big_tasks);
|
||||
SEQ_printf(m, " .%-30s: %llu\n", "walt_stats.cumulative_runnable_avg",
|
||||
rq->walt_stats.cumulative_runnable_avg_scaled);
|
||||
#endif
|
||||
#undef P
|
||||
#undef PN
|
||||
|
||||
@@ -724,6 +737,11 @@ static void sched_debug_header(struct seq_file *m)
|
||||
PN(sysctl_sched_wakeup_granularity);
|
||||
P(sysctl_sched_child_runs_first);
|
||||
P(sysctl_sched_features);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
P(sched_init_task_load_windows);
|
||||
P(sched_ravg_window);
|
||||
P(sched_load_granule);
|
||||
#endif
|
||||
#undef PN
|
||||
#undef P
|
||||
|
||||
@@ -915,6 +933,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
|
||||
P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
|
||||
P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
P(ravg.demand);
|
||||
#endif
|
||||
avg_atom = p->se.sum_exec_runtime;
|
||||
if (nr_switches)
|
||||
avg_atom = div64_ul(avg_atom, nr_switches);
|
||||
|
@@ -24,6 +24,12 @@
|
||||
|
||||
#include <trace/events/sched.h>
|
||||
|
||||
#include "walt.h"
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline bool task_fits_max(struct task_struct *p, int cpu);
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
|
||||
* Targeted preemption latency for CPU-bound tasks:
|
||||
*
|
||||
@@ -85,6 +91,7 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
|
||||
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
|
||||
|
||||
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
||||
DEFINE_PER_CPU_READ_MOSTLY(int, sched_load_boost);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
@@ -118,6 +125,8 @@ int __weak arch_asym_cpu_priority(int cpu)
|
||||
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
|
||||
#endif
|
||||
|
||||
unsigned int sched_small_task_threshold = 102;
|
||||
|
||||
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
|
||||
{
|
||||
lw->weight += inc;
|
||||
@@ -3689,11 +3698,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
|
||||
return cfs_rq->avg.load_avg;
|
||||
}
|
||||
|
||||
static inline unsigned long task_util(struct task_struct *p)
|
||||
{
|
||||
return READ_ONCE(p->se.avg.util_avg);
|
||||
}
|
||||
|
||||
static inline unsigned long _task_util_est(struct task_struct *p)
|
||||
{
|
||||
struct util_est ue = READ_ONCE(p->se.avg.util_est);
|
||||
@@ -3703,6 +3707,9 @@ static inline unsigned long _task_util_est(struct task_struct *p)
|
||||
|
||||
static inline unsigned long task_util_est(struct task_struct *p)
|
||||
{
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
return p->ravg.demand_scaled;
|
||||
#endif
|
||||
return max(task_util(p), _task_util_est(p));
|
||||
}
|
||||
|
||||
@@ -4514,13 +4521,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq);
|
||||
|
||||
if (qcfs_rq->load.weight)
|
||||
dequeue = 0;
|
||||
}
|
||||
|
||||
if (!se)
|
||||
if (!se) {
|
||||
sub_nr_running(rq, task_delta);
|
||||
walt_dec_throttled_cfs_rq_stats(&rq->walt_stats, cfs_rq);
|
||||
}
|
||||
|
||||
cfs_rq->throttled = 1;
|
||||
cfs_rq->throttled_clock = rq_clock(rq);
|
||||
@@ -4554,6 +4564,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
struct sched_entity *se;
|
||||
int enqueue = 1;
|
||||
long task_delta, idle_task_delta;
|
||||
struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
|
||||
|
||||
se = cfs_rq->tg->se[cpu_of(rq)];
|
||||
|
||||
@@ -4583,6 +4594,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
|
||||
cfs_rq->h_nr_running += task_delta;
|
||||
cfs_rq->idle_h_nr_running += idle_task_delta;
|
||||
walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq);
|
||||
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
break;
|
||||
@@ -4590,8 +4602,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
|
||||
assert_list_leaf_cfs_rq(rq);
|
||||
|
||||
if (!se)
|
||||
if (!se) {
|
||||
add_nr_running(rq, task_delta);
|
||||
walt_inc_throttled_cfs_rq_stats(&rq->walt_stats, tcfs_rq);
|
||||
}
|
||||
|
||||
/* Determine whether we need to wake up potentially idle CPU: */
|
||||
if (rq->curr == rq->idle && rq->cfs.nr_running)
|
||||
@@ -4982,6 +4996,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
cfs_rq->runtime_enabled = 0;
|
||||
INIT_LIST_HEAD(&cfs_rq->throttled_list);
|
||||
walt_init_cfs_rq_stats(cfs_rq);
|
||||
}
|
||||
|
||||
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
@@ -5161,8 +5176,6 @@ static inline void hrtick_update(struct rq *rq)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long cpu_util(int cpu);
|
||||
|
||||
static inline bool cpu_overutilized(int cpu)
|
||||
{
|
||||
return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
|
||||
@@ -5223,6 +5236,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
break;
|
||||
cfs_rq->h_nr_running++;
|
||||
cfs_rq->idle_h_nr_running += idle_h_nr_running;
|
||||
walt_inc_cfs_rq_stats(cfs_rq, p);
|
||||
|
||||
flags = ENQUEUE_WAKEUP;
|
||||
}
|
||||
@@ -5231,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
cfs_rq->h_nr_running++;
|
||||
cfs_rq->idle_h_nr_running += idle_h_nr_running;
|
||||
walt_inc_cfs_rq_stats(cfs_rq, p);
|
||||
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
break;
|
||||
@@ -5241,6 +5256,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
if (!se) {
|
||||
add_nr_running(rq, 1);
|
||||
inc_rq_walt_stats(rq, p);
|
||||
/*
|
||||
* Since new tasks are assigned an initial util_avg equal to
|
||||
* half of the spare capacity of their CPU, tiny tasks have the
|
||||
@@ -5308,6 +5324,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
break;
|
||||
cfs_rq->h_nr_running--;
|
||||
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
|
||||
walt_dec_cfs_rq_stats(cfs_rq, p);
|
||||
|
||||
/* Don't dequeue parent if it has other entities besides us */
|
||||
if (cfs_rq->load.weight) {
|
||||
@@ -5328,6 +5345,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
cfs_rq->h_nr_running--;
|
||||
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
|
||||
walt_dec_cfs_rq_stats(cfs_rq, p);
|
||||
|
||||
if (cfs_rq_throttled(cfs_rq))
|
||||
break;
|
||||
@@ -5336,8 +5354,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
update_cfs_group(se);
|
||||
}
|
||||
|
||||
if (!se)
|
||||
if (!se) {
|
||||
sub_nr_running(rq, 1);
|
||||
dec_rq_walt_stats(rq, p);
|
||||
}
|
||||
|
||||
util_est_dequeue(&rq->cfs, p, task_sleep);
|
||||
hrtick_update(rq);
|
||||
@@ -5375,11 +5395,6 @@ static unsigned long cpu_runnable_load(struct rq *rq)
|
||||
return cfs_rq_runnable_load_avg(&rq->cfs);
|
||||
}
|
||||
|
||||
static unsigned long capacity_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity;
|
||||
}
|
||||
|
||||
static unsigned long cpu_avg_load_per_task(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
@@ -6047,58 +6062,6 @@ static unsigned int uclamp_task_util(struct task_struct *p)
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
|
||||
* @cpu: the CPU to get the utilization of
|
||||
*
|
||||
* The unit of the return value must be the one of capacity so we can compare
|
||||
* the utilization with the capacity of the CPU that is available for CFS task
|
||||
* (ie cpu_capacity).
|
||||
*
|
||||
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
|
||||
* recent utilization of currently non-runnable tasks on a CPU. It represents
|
||||
* the amount of utilization of a CPU in the range [0..capacity_orig] where
|
||||
* capacity_orig is the cpu_capacity available at the highest frequency
|
||||
* (arch_scale_freq_capacity()).
|
||||
* The utilization of a CPU converges towards a sum equal to or less than the
|
||||
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
|
||||
* the running time on this CPU scaled by capacity_curr.
|
||||
*
|
||||
* The estimated utilization of a CPU is defined to be the maximum between its
|
||||
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
|
||||
* currently RUNNABLE on that CPU.
|
||||
* This allows to properly represent the expected utilization of a CPU which
|
||||
* has just got a big task running since a long sleep period. At the same time
|
||||
* however it preserves the benefits of the "blocked utilization" in
|
||||
* describing the potential for other tasks waking up on the same CPU.
|
||||
*
|
||||
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
|
||||
* higher than capacity_orig because of unfortunate rounding in
|
||||
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
|
||||
* the average stabilizes with the new running time. We need to check that the
|
||||
* utilization stays within the range of [0..capacity_orig] and cap it if
|
||||
* necessary. Without utilization capping, a group could be seen as overloaded
|
||||
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
|
||||
* available capacity. We allow utilization to overshoot capacity_curr (but not
|
||||
* capacity_orig) as it is useful for predicting the capacity required after task
|
||||
* migrations (scheduler-driven DVFS).
|
||||
*
|
||||
* Return: the (estimated) utilization for the specified CPU
|
||||
*/
|
||||
static inline unsigned long cpu_util(int cpu)
|
||||
{
|
||||
struct cfs_rq *cfs_rq;
|
||||
unsigned int util;
|
||||
|
||||
cfs_rq = &cpu_rq(cpu)->cfs;
|
||||
util = READ_ONCE(cfs_rq->avg.util_avg);
|
||||
|
||||
if (sched_feat(UTIL_EST))
|
||||
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
|
||||
|
||||
return min_t(unsigned long, util, capacity_orig_of(cpu));
|
||||
}
|
||||
|
||||
/*
|
||||
* cpu_util_without: compute cpu utilization without any contributions from *p
|
||||
* @cpu: the CPU which utilization is requested
|
||||
@@ -6114,13 +6077,29 @@ static inline unsigned long cpu_util(int cpu)
|
||||
*/
|
||||
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
|
||||
{
|
||||
#ifndef CONFIG_SCHED_WALT
|
||||
struct cfs_rq *cfs_rq;
|
||||
#endif
|
||||
unsigned int util;
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
/*
|
||||
* WALT does not decay idle tasks in the same manner
|
||||
* as PELT, so it makes little sense to subtract task
|
||||
* utilization from cpu utilization. Instead just use
|
||||
* cpu_util for this case.
|
||||
*/
|
||||
if (likely(p->state == TASK_WAKING))
|
||||
return cpu_util(cpu);
|
||||
#endif
|
||||
|
||||
/* Task has no contribution or is new */
|
||||
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
|
||||
return cpu_util(cpu);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
util = max_t(long, cpu_util(cpu) - task_util(p), 0);
|
||||
#else
|
||||
cfs_rq = &cpu_rq(cpu)->cfs;
|
||||
util = READ_ONCE(cfs_rq->avg.util_avg);
|
||||
|
||||
@@ -6179,6 +6158,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
|
||||
|
||||
util = max(util, estimated);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Utilization (estimated) can exceed the CPU capacity, thus let's
|
||||
@@ -6188,6 +6168,18 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
|
||||
return min_t(unsigned long, util, capacity_orig_of(cpu));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the current capacity of cpu after applying both
|
||||
* cpu and freq scaling.
|
||||
*/
|
||||
unsigned long capacity_curr_of(int cpu)
|
||||
{
|
||||
unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
|
||||
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
|
||||
|
||||
return cap_scale(max_cap, scale_freq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
|
||||
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
|
||||
@@ -6344,7 +6336,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
|
||||
* other use-cases too. So, until someone finds a better way to solve this,
|
||||
* let's keep things simple by re-using the existing slow path.
|
||||
*/
|
||||
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
|
||||
int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
|
||||
{
|
||||
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
|
||||
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
|
||||
@@ -7363,7 +7355,13 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
|
||||
lockdep_assert_held(&env->src_rq->lock);
|
||||
|
||||
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
|
||||
lockdep_off();
|
||||
double_lock_balance(env->src_rq, env->dst_rq);
|
||||
if (!(env->src_rq->clock_update_flags & RQCF_UPDATED))
|
||||
update_rq_clock(env->src_rq);
|
||||
set_task_cpu(p, env->dst_cpu);
|
||||
double_unlock_balance(env->src_rq, env->dst_rq);
|
||||
lockdep_on();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -8891,8 +8889,6 @@ static int need_active_balance(struct lb_env *env)
|
||||
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
||||
}
|
||||
|
||||
static int active_load_balance_cpu_stop(void *data);
|
||||
|
||||
static int should_we_balance(struct lb_env *env)
|
||||
{
|
||||
struct sched_group *sg = env->sd->groups;
|
||||
@@ -9244,7 +9240,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
|
||||
* least 1 task to be running on each physical CPU where possible, and
|
||||
* avoids physical / logical imbalances.
|
||||
*/
|
||||
static int active_load_balance_cpu_stop(void *data)
|
||||
int active_load_balance_cpu_stop(void *data)
|
||||
{
|
||||
struct rq *busiest_rq = data;
|
||||
int busiest_cpu = cpu_of(busiest_rq);
|
||||
@@ -10615,6 +10611,10 @@ const struct sched_class fair_sched_class = {
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
.uclamp_enabled = 1,
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
.fixup_walt_sched_stats = walt_fixup_sched_stats_fair,
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
|
@@ -61,7 +61,8 @@ static noinline int __cpuidle cpu_idle_poll(void)
|
||||
stop_critical_timings();
|
||||
|
||||
while (!tif_need_resched() &&
|
||||
(cpu_idle_force_poll || tick_check_broadcast_expired()))
|
||||
(cpu_idle_force_poll || tick_check_broadcast_expired() ||
|
||||
is_reserved(smp_processor_id())))
|
||||
cpu_relax();
|
||||
start_critical_timings();
|
||||
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
|
||||
@@ -257,7 +258,8 @@ static void do_idle(void)
|
||||
* broadcast device expired for us, we don't want to go deep
|
||||
* idle as we know that the IPI is going to arrive right away.
|
||||
*/
|
||||
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
|
||||
if (cpu_idle_force_poll || tick_check_broadcast_expired() ||
|
||||
is_reserved(smp_processor_id())) {
|
||||
tick_nohz_idle_restart_tick();
|
||||
cpu_idle_poll();
|
||||
} else {
|
||||
|
@@ -6,6 +6,7 @@
|
||||
#include "sched.h"
|
||||
|
||||
#include "pelt.h"
|
||||
#include "walt.h"
|
||||
|
||||
int sched_rr_timeslice = RR_TIMESLICE;
|
||||
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
|
||||
@@ -2388,6 +2389,10 @@ const struct sched_class rt_sched_class = {
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
.uclamp_enabled = 1,
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
.fixup_walt_sched_stats = fixup_walt_sched_stats_common,
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
|
@@ -84,6 +84,73 @@
|
||||
struct rq;
|
||||
struct cpuidle_state;
|
||||
|
||||
extern __read_mostly bool sched_predl;
|
||||
|
||||
struct sched_walt_cpu_load {
|
||||
unsigned long prev_window_util;
|
||||
unsigned long nl;
|
||||
unsigned long pl;
|
||||
bool rtgb_active;
|
||||
u64 ws;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
#define DECLARE_BITMAP_ARRAY(name, nr, bits) \
|
||||
unsigned long name[nr][BITS_TO_LONGS(bits)]
|
||||
|
||||
extern unsigned int __weak sched_ravg_window;
|
||||
|
||||
struct walt_sched_stats {
|
||||
int nr_big_tasks;
|
||||
u64 cumulative_runnable_avg_scaled;
|
||||
u64 pred_demands_sum_scaled;
|
||||
};
|
||||
|
||||
struct cpu_cycle {
|
||||
u64 cycles;
|
||||
u64 time;
|
||||
};
|
||||
|
||||
struct group_cpu_time {
|
||||
u64 curr_runnable_sum;
|
||||
u64 prev_runnable_sum;
|
||||
u64 nt_curr_runnable_sum;
|
||||
u64 nt_prev_runnable_sum;
|
||||
};
|
||||
|
||||
struct load_subtractions {
|
||||
u64 window_start;
|
||||
u64 subs;
|
||||
u64 new_subs;
|
||||
};
|
||||
|
||||
#define NUM_TRACKED_WINDOWS 2
|
||||
#define NUM_LOAD_INDICES 1000
|
||||
|
||||
struct sched_cluster {
|
||||
raw_spinlock_t load_lock;
|
||||
struct list_head list;
|
||||
struct cpumask cpus;
|
||||
int id;
|
||||
int max_power_cost;
|
||||
int min_power_cost;
|
||||
int max_possible_capacity;
|
||||
int efficiency; /* Differentiate cpus with different IPC capability */
|
||||
unsigned int exec_scale_factor;
|
||||
/*
|
||||
* max_freq = user maximum
|
||||
* max_mitigated_freq = thermal defined maximum
|
||||
* max_possible_freq = maximum supported by hardware
|
||||
*/
|
||||
unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq;
|
||||
unsigned int max_possible_freq;
|
||||
bool freq_init_done;
|
||||
u64 aggr_grp_load;
|
||||
};
|
||||
|
||||
extern __weak cpumask_t asym_cap_sibling_cpus;
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
/* task_struct::on_rq states: */
|
||||
#define TASK_ON_RQ_QUEUED 1
|
||||
#define TASK_ON_RQ_MIGRATING 2
|
||||
@@ -401,7 +468,24 @@ struct task_group {
|
||||
struct uclamp_se uclamp[UCLAMP_CNT];
|
||||
/* Latency-sensitive flag used for a task group */
|
||||
unsigned int latency_sensitive;
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
/* Toggle ability to override sched boost enabled */
|
||||
bool sched_boost_no_override;
|
||||
/*
|
||||
* Controls whether a cgroup is eligible for sched boost or not. This
|
||||
* can temporariliy be disabled by the kernel based on the no_override
|
||||
* flag above.
|
||||
*/
|
||||
bool sched_boost_enabled;
|
||||
/*
|
||||
* Controls whether tasks of this cgroup should be colocated with each
|
||||
* other and tasks of other cgroups that have the same flag turned on.
|
||||
*/
|
||||
bool colocate;
|
||||
/* Controls whether further updates are allowed to the colocate flag */
|
||||
bool colocate_update_disabled;
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
#endif /* CONFIG_UCLAMP_TASK_GROUP */
|
||||
|
||||
};
|
||||
|
||||
@@ -565,6 +649,10 @@ struct cfs_rq {
|
||||
struct list_head leaf_cfs_rq_list;
|
||||
struct task_group *tg; /* group that "owns" this runqueue */
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
struct walt_sched_stats walt_stats;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
int runtime_enabled;
|
||||
s64 runtime_remaining;
|
||||
@@ -961,6 +1049,41 @@ struct rq {
|
||||
u64 max_idle_balance_cost;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
struct task_struct *push_task;
|
||||
struct sched_cluster *cluster;
|
||||
struct cpumask freq_domain_cpumask;
|
||||
struct walt_sched_stats walt_stats;
|
||||
|
||||
u64 window_start;
|
||||
u32 prev_window_size;
|
||||
unsigned long walt_flags;
|
||||
|
||||
u64 cur_irqload;
|
||||
u64 avg_irqload;
|
||||
u64 irqload_ts;
|
||||
struct task_struct *ed_task;
|
||||
struct cpu_cycle cc;
|
||||
u64 old_busy_time, old_busy_time_group;
|
||||
u64 old_estimated_time;
|
||||
u64 curr_runnable_sum;
|
||||
u64 prev_runnable_sum;
|
||||
u64 nt_curr_runnable_sum;
|
||||
u64 nt_prev_runnable_sum;
|
||||
u64 cum_window_demand_scaled;
|
||||
struct group_cpu_time grp_time;
|
||||
struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
|
||||
DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
|
||||
NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
|
||||
u8 *top_tasks[NUM_TRACKED_WINDOWS];
|
||||
u8 curr_table;
|
||||
int prev_top;
|
||||
int curr_top;
|
||||
bool notif_pending;
|
||||
u64 last_cc_update;
|
||||
u64 cycles;
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
u64 prev_irq_time;
|
||||
#endif
|
||||
@@ -1306,8 +1429,6 @@ enum numa_faults_stats {
|
||||
};
|
||||
extern void sched_setnuma(struct task_struct *p, int node);
|
||||
extern int migrate_task_to(struct task_struct *p, int cpu);
|
||||
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
|
||||
int cpu, int scpu);
|
||||
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
|
||||
#else
|
||||
static inline void
|
||||
@@ -1316,6 +1437,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
|
||||
int cpu, int scpu);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
static inline void
|
||||
@@ -1782,8 +1906,15 @@ struct sched_class {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
void (*task_change_group)(struct task_struct *p, int type);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p,
|
||||
u16 updated_demand_scaled,
|
||||
u16 updated_pred_demand_scaled);
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
WARN_ON_ONCE(rq->curr != prev);
|
||||
@@ -1960,6 +2091,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
unsigned prev_nr = rq->nr_running;
|
||||
|
||||
sched_update_nr_prod(cpu_of(rq), count, true);
|
||||
rq->nr_running = prev_nr + count;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
@@ -1974,6 +2106,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
|
||||
|
||||
static inline void sub_nr_running(struct rq *rq, unsigned count)
|
||||
{
|
||||
sched_update_nr_prod(cpu_of(rq), count, false);
|
||||
rq->nr_running -= count;
|
||||
/* Check if we still need preemption */
|
||||
sched_update_tick_dependency(rq);
|
||||
@@ -2014,6 +2147,18 @@ static inline int hrtick_enabled(struct rq *rq)
|
||||
|
||||
#endif /* CONFIG_SCHED_HRTICK */
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
u64 __weak sched_ktime_clock(void);
|
||||
unsigned long __weak
|
||||
cpu_util_freq_walt(int cpu, struct sched_walt_cpu_load *walt_load);
|
||||
#else
|
||||
#define sched_ravg_window TICK_NSEC
|
||||
static inline u64 sched_ktime_clock(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_scale_freq_capacity
|
||||
static __always_inline
|
||||
unsigned long arch_scale_freq_capacity(int cpu)
|
||||
@@ -2031,8 +2176,127 @@ unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
|
||||
}
|
||||
#endif
|
||||
|
||||
unsigned long capacity_curr_of(int cpu);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#ifdef CONFIG_PREEMPTION
|
||||
static inline unsigned long capacity_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity;
|
||||
}
|
||||
|
||||
static inline unsigned long capacity_orig_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
|
||||
static inline unsigned long task_util(struct task_struct *p)
|
||||
{
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
return p->ravg.demand_scaled;
|
||||
#endif
|
||||
return READ_ONCE(p->se.avg.util_avg);
|
||||
}
|
||||
|
||||
/**
|
||||
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
|
||||
* @cpu: the CPU to get the utilization of
|
||||
*
|
||||
* The unit of the return value must be the one of capacity so we can compare
|
||||
* the utilization with the capacity of the CPU that is available for CFS task
|
||||
* (ie cpu_capacity).
|
||||
*
|
||||
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
|
||||
* recent utilization of currently non-runnable tasks on a CPU. It represents
|
||||
* the amount of utilization of a CPU in the range [0..capacity_orig] where
|
||||
* capacity_orig is the cpu_capacity available at the highest frequency
|
||||
* (arch_scale_freq_capacity()).
|
||||
* The utilization of a CPU converges towards a sum equal to or less than the
|
||||
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
|
||||
* the running time on this CPU scaled by capacity_curr.
|
||||
*
|
||||
* The estimated utilization of a CPU is defined to be the maximum between its
|
||||
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
|
||||
* currently RUNNABLE on that CPU.
|
||||
* This allows to properly represent the expected utilization of a CPU which
|
||||
* has just got a big task running since a long sleep period. At the same time
|
||||
* however it preserves the benefits of the "blocked utilization" in
|
||||
* describing the potential for other tasks waking up on the same CPU.
|
||||
*
|
||||
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
|
||||
* higher than capacity_orig because of unfortunate rounding in
|
||||
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
|
||||
* the average stabilizes with the new running time. We need to check that the
|
||||
* utilization stays within the range of [0..capacity_orig] and cap it if
|
||||
* necessary. Without utilization capping, a group could be seen as overloaded
|
||||
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
|
||||
* available capacity. We allow utilization to overshoot capacity_curr (but not
|
||||
* capacity_orig) as it useful for predicting the capacity required after task
|
||||
* migrations (scheduler-driven DVFS).
|
||||
*
|
||||
* Return: the (estimated) utilization for the specified CPU
|
||||
*/
|
||||
static inline unsigned long cpu_util(int cpu)
|
||||
{
|
||||
struct cfs_rq *cfs_rq;
|
||||
unsigned int util;
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
u64 walt_cpu_util =
|
||||
cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled;
|
||||
|
||||
return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu));
|
||||
#endif
|
||||
|
||||
cfs_rq = &cpu_rq(cpu)->cfs;
|
||||
util = READ_ONCE(cfs_rq->avg.util_avg);
|
||||
|
||||
if (sched_feat(UTIL_EST))
|
||||
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
|
||||
|
||||
return min_t(unsigned long, util, capacity_orig_of(cpu));
|
||||
}
|
||||
|
||||
static inline unsigned long cpu_util_cum(int cpu, int delta)
|
||||
{
|
||||
u64 util = cpu_rq(cpu)->cfs.avg.util_avg;
|
||||
unsigned long capacity = capacity_orig_of(cpu);
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
util = cpu_rq(cpu)->cum_window_demand_scaled;
|
||||
#endif
|
||||
delta += util;
|
||||
if (delta < 0)
|
||||
return 0;
|
||||
|
||||
return (delta >= capacity) ? capacity : delta;
|
||||
}
|
||||
|
||||
static inline unsigned long
|
||||
cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load)
|
||||
{
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
return cpu_util_freq_walt(cpu, walt_load);
|
||||
#else
|
||||
return cpu_util(cpu);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern unsigned int capacity_margin_freq;
|
||||
|
||||
static inline unsigned long
|
||||
add_capacity_margin(unsigned long cpu_capacity, int cpu)
|
||||
{
|
||||
cpu_capacity = cpu_capacity * capacity_margin_freq *
|
||||
(100 + per_cpu(sched_load_boost, cpu));
|
||||
cpu_capacity /= 100;
|
||||
cpu_capacity /= SCHED_CAPACITY_SCALE;
|
||||
return cpu_capacity;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#ifdef CONFIG_PREEMPT
|
||||
|
||||
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
|
||||
|
||||
@@ -2345,6 +2609,11 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
|
||||
{
|
||||
struct update_util_data *data;
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
if (!(flags & SCHED_CPUFREQ_WALT))
|
||||
return;
|
||||
#endif
|
||||
|
||||
data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
|
||||
cpu_of(rq)));
|
||||
if (data)
|
||||
@@ -2432,13 +2701,6 @@ static inline bool uclamp_latency_sensitive(struct task_struct *p)
|
||||
# define arch_scale_freq_invariant() false
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long capacity_orig_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* enum schedutil_type - CPU utilization type
|
||||
* @FREQUENCY_UTIL: Utilization used to select frequency
|
||||
@@ -2570,3 +2832,499 @@ static inline void membarrier_switch_mm(struct rq *rq,
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
enum sched_boost_policy {
|
||||
SCHED_BOOST_NONE,
|
||||
SCHED_BOOST_ON_BIG,
|
||||
SCHED_BOOST_ON_ALL,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
|
||||
static inline int cluster_first_cpu(struct sched_cluster *cluster)
|
||||
{
|
||||
return cpumask_first(&cluster->cpus);
|
||||
}
|
||||
|
||||
struct related_thread_group {
|
||||
int id;
|
||||
raw_spinlock_t lock;
|
||||
struct list_head tasks;
|
||||
struct list_head list;
|
||||
bool skip_min;
|
||||
struct rcu_head rcu;
|
||||
u64 last_update;
|
||||
u64 downmigrate_ts;
|
||||
u64 start_ts;
|
||||
};
|
||||
|
||||
extern struct sched_cluster *sched_cluster[NR_CPUS];
|
||||
|
||||
extern unsigned int __weak sched_disable_window_stats;
|
||||
extern unsigned int max_possible_freq;
|
||||
extern unsigned int min_max_freq;
|
||||
extern unsigned int max_possible_efficiency;
|
||||
extern unsigned int min_possible_efficiency;
|
||||
extern unsigned int max_possible_capacity;
|
||||
extern unsigned int __weak min_max_possible_capacity;
|
||||
extern unsigned int max_power_cost;
|
||||
extern unsigned int __read_mostly __weak sched_init_task_load_windows;
|
||||
extern unsigned int __read_mostly __weak sched_load_granule;
|
||||
|
||||
extern int __weak update_preferred_cluster(struct related_thread_group *grp,
|
||||
struct task_struct *p, u32 old_load, bool from_tick);
|
||||
extern void __weak set_preferred_cluster(struct related_thread_group *grp);
|
||||
extern void __weak add_new_task_to_grp(struct task_struct *new);
|
||||
|
||||
#define NO_BOOST 0
|
||||
#define FULL_THROTTLE_BOOST 1
|
||||
#define CONSERVATIVE_BOOST 2
|
||||
#define RESTRAINED_BOOST 3
|
||||
#define FULL_THROTTLE_BOOST_DISABLE -1
|
||||
#define CONSERVATIVE_BOOST_DISABLE -2
|
||||
#define RESTRAINED_BOOST_DISABLE -3
|
||||
#define MAX_NUM_BOOST_TYPE (RESTRAINED_BOOST+1)
|
||||
|
||||
static inline int asym_cap_siblings(int cpu1, int cpu2)
|
||||
{
|
||||
return (cpumask_test_cpu(cpu1, &asym_cap_sibling_cpus) &&
|
||||
cpumask_test_cpu(cpu2, &asym_cap_sibling_cpus));
|
||||
}
|
||||
|
||||
static inline int cpu_max_possible_capacity(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cluster->max_possible_capacity;
|
||||
}
|
||||
|
||||
static inline unsigned int cluster_max_freq(struct sched_cluster *cluster)
|
||||
{
|
||||
/*
|
||||
* Governor and thermal driver don't know the other party's mitigation
|
||||
* voting. So struct cluster saves both and return min() for current
|
||||
* cluster fmax.
|
||||
*/
|
||||
return min(cluster->max_mitigated_freq, cluster->max_freq);
|
||||
}
|
||||
|
||||
static inline unsigned int cpu_max_freq(int cpu)
|
||||
{
|
||||
return cluster_max_freq(cpu_rq(cpu)->cluster);
|
||||
}
|
||||
|
||||
static inline unsigned int cpu_max_possible_freq(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cluster->max_possible_freq;
|
||||
}
|
||||
|
||||
static inline bool hmp_capable(void)
|
||||
{
|
||||
return max_possible_capacity != min_max_possible_capacity;
|
||||
}
|
||||
|
||||
static inline bool is_max_capacity_cpu(int cpu)
|
||||
{
|
||||
return cpu_max_possible_capacity(cpu) == max_possible_capacity;
|
||||
}
|
||||
|
||||
static inline bool is_min_capacity_cpu(int cpu)
|
||||
{
|
||||
return cpu_max_possible_capacity(cpu) == min_max_possible_capacity;
|
||||
}
|
||||
|
||||
static inline unsigned int task_load(struct task_struct *p)
|
||||
{
|
||||
return p->ravg.demand;
|
||||
}
|
||||
|
||||
static inline unsigned int task_pl(struct task_struct *p)
|
||||
{
|
||||
return p->ravg.pred_demand;
|
||||
}
|
||||
|
||||
static inline bool task_in_related_thread_group(struct task_struct *p)
|
||||
{
|
||||
return !!(rcu_access_pointer(p->grp) != NULL);
|
||||
}
|
||||
|
||||
static inline
|
||||
struct related_thread_group *task_related_thread_group(struct task_struct *p)
|
||||
{
|
||||
return rcu_dereference(p->grp);
|
||||
}
|
||||
|
||||
/* Is frequency of two cpus synchronized with each other? */
|
||||
static inline int same_freq_domain(int src_cpu, int dst_cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(src_cpu);
|
||||
|
||||
if (src_cpu == dst_cpu)
|
||||
return 1;
|
||||
|
||||
if (asym_cap_siblings(src_cpu, dst_cpu))
|
||||
return 1;
|
||||
|
||||
return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
|
||||
}
|
||||
|
||||
#define CPU_RESERVED 1
|
||||
|
||||
extern enum sched_boost_policy __weak boost_policy;
|
||||
extern unsigned int __weak sched_task_filter_util;
|
||||
static inline enum sched_boost_policy sched_boost_policy(void)
|
||||
{
|
||||
return boost_policy;
|
||||
}
|
||||
|
||||
extern unsigned int __weak sched_boost_type;
|
||||
static inline int sched_boost(void)
|
||||
{
|
||||
return sched_boost_type;
|
||||
}
|
||||
|
||||
static inline bool rt_boost_on_big(void)
|
||||
{
|
||||
return sched_boost() == FULL_THROTTLE_BOOST ?
|
||||
(sched_boost_policy() == SCHED_BOOST_ON_BIG) : false;
|
||||
}
|
||||
|
||||
static inline bool is_full_throttle_boost(void)
|
||||
{
|
||||
return sched_boost() == FULL_THROTTLE_BOOST;
|
||||
}
|
||||
|
||||
extern int __weak preferred_cluster(struct sched_cluster *cluster,
|
||||
struct task_struct *p);
|
||||
extern struct sched_cluster *rq_cluster(struct rq *rq);
|
||||
|
||||
#ifdef CONFIG_UCLAMP_TASK_GROUP
|
||||
static inline bool task_sched_boost(struct task_struct *p)
|
||||
{
|
||||
struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
|
||||
struct task_group *tg;
|
||||
|
||||
if (!css)
|
||||
return false;
|
||||
tg = container_of(css, struct task_group, css);
|
||||
|
||||
return tg->sched_boost_enabled;
|
||||
}
|
||||
|
||||
extern int __weak sync_cgroup_colocation(struct task_struct *p, bool insert);
|
||||
extern void update_cgroup_boost_settings(void);
|
||||
extern void restore_cgroup_boost_settings(void);
|
||||
#else
|
||||
static inline bool
|
||||
same_schedtg(struct task_struct *tsk1, struct task_struct *tsk2)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool task_sched_boost(struct task_struct *p)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void update_cgroup_boost_settings(void) { }
|
||||
static inline void restore_cgroup_boost_settings(void) { }
|
||||
#endif
|
||||
|
||||
extern int __weak alloc_related_thread_groups(void);
|
||||
|
||||
extern void __weak check_for_migration(struct rq *rq, struct task_struct *p);
|
||||
|
||||
static inline int is_reserved(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
return test_bit(CPU_RESERVED, &rq->walt_flags);
|
||||
}
|
||||
|
||||
static inline int mark_reserved(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
return test_and_set_bit(CPU_RESERVED, &rq->walt_flags);
|
||||
}
|
||||
|
||||
static inline void clear_reserved(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
clear_bit(CPU_RESERVED, &rq->walt_flags);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
return cpu_of(rq) == task_cpu(p) && (p->on_rq || p->last_sleep_ts >=
|
||||
rq->window_start);
|
||||
}
|
||||
|
||||
static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta)
|
||||
{
|
||||
rq->cum_window_demand_scaled += scaled_delta;
|
||||
if (unlikely((s64)rq->cum_window_demand_scaled < 0))
|
||||
rq->cum_window_demand_scaled = 0;
|
||||
}
|
||||
|
||||
extern unsigned long __weak thermal_cap(int cpu);
|
||||
|
||||
extern void __weak clear_walt_request(int cpu);
|
||||
|
||||
extern enum sched_boost_policy sched_boost_policy(void);
|
||||
extern void sched_boost_parse_dt(void);
|
||||
extern void __weak clear_ed_task(struct task_struct *p, struct rq *rq);
|
||||
extern bool __weak early_detection_notify(struct rq *rq, u64 wallclock);
|
||||
|
||||
static inline unsigned int power_cost(int cpu, u64 demand)
|
||||
{
|
||||
return cpu_max_possible_capacity(cpu);
|
||||
}
|
||||
|
||||
void __weak note_task_waking(struct task_struct *p, u64 wallclock);
|
||||
|
||||
static inline bool task_placement_boost_enabled(struct task_struct *p)
|
||||
{
|
||||
if (task_sched_boost(p))
|
||||
return sched_boost_policy() != SCHED_BOOST_NONE;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline enum sched_boost_policy task_boost_policy(struct task_struct *p)
|
||||
{
|
||||
enum sched_boost_policy policy = task_sched_boost(p) ?
|
||||
sched_boost_policy() :
|
||||
SCHED_BOOST_NONE;
|
||||
if (policy == SCHED_BOOST_ON_BIG) {
|
||||
/*
|
||||
* Filter out tasks less than min task util threshold
|
||||
* under conservative boost.
|
||||
*/
|
||||
if (sched_boost() == CONSERVATIVE_BOOST &&
|
||||
task_util(p) <= sched_task_filter_util)
|
||||
policy = SCHED_BOOST_NONE;
|
||||
}
|
||||
|
||||
return policy;
|
||||
}
|
||||
|
||||
static inline bool is_min_capacity_cluster(struct sched_cluster *cluster)
|
||||
{
|
||||
return is_min_capacity_cpu(cluster_first_cpu(cluster));
|
||||
}
|
||||
|
||||
extern void __weak walt_fixup_sched_stats_fair(struct rq *rq,
|
||||
struct task_struct *p,
|
||||
u16 updated_demand_scaled,
|
||||
u16 updated_pred_demand_scaled);
|
||||
extern void __weak walt_fixup_nr_big_tasks(struct rq *rq, struct task_struct *p,
|
||||
int delta, bool inc);
|
||||
#else /* CONFIG_SCHED_WALT */
|
||||
|
||||
struct walt_sched_stats;
|
||||
struct related_thread_group;
|
||||
struct sched_cluster;
|
||||
|
||||
static inline bool task_sched_boost(struct task_struct *p)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool task_placement_boost_enabled(struct task_struct *p)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
|
||||
|
||||
static inline int sched_boost(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool rt_boost_on_big(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool is_full_throttle_boost(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline enum sched_boost_policy task_boost_policy(struct task_struct *p)
|
||||
{
|
||||
return SCHED_BOOST_NONE;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool hmp_capable(void) { return false; }
|
||||
static inline bool is_max_capacity_cpu(int cpu) { return true; }
|
||||
static inline bool is_min_capacity_cpu(int cpu) { return true; }
|
||||
|
||||
static inline int
|
||||
preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
static inline struct sched_cluster *rq_cluster(struct rq *rq)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline int asym_cap_siblings(int cpu1, int cpu2) { return 0; }
|
||||
|
||||
static inline void set_preferred_cluster(struct related_thread_group *grp) { }
|
||||
|
||||
static inline bool task_in_related_thread_group(struct task_struct *p)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline
|
||||
struct related_thread_group *task_related_thread_group(struct task_struct *p)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline u32 task_load(struct task_struct *p) { return 0; }
|
||||
static inline u32 task_pl(struct task_struct *p) { return 0; }
|
||||
|
||||
static inline int update_preferred_cluster(struct related_thread_group *grp,
|
||||
struct task_struct *p, u32 old_load, bool from_tick)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void add_new_task_to_grp(struct task_struct *new) {}
|
||||
|
||||
static inline int same_freq_domain(int src_cpu, int dst_cpu)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int mark_reserved(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void clear_reserved(int cpu) { }
|
||||
static inline int alloc_related_thread_groups(void) { return 0; }
|
||||
|
||||
static inline void walt_fixup_cum_window_demand(struct rq *rq,
|
||||
s64 scaled_delta) { }
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long thermal_cap(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void clear_walt_request(int cpu) { }
|
||||
|
||||
static inline int is_reserved(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline enum sched_boost_policy sched_boost_policy(void)
|
||||
{
|
||||
return SCHED_BOOST_NONE;
|
||||
}
|
||||
|
||||
static inline void sched_boost_parse_dt(void) { }
|
||||
|
||||
static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { }
|
||||
|
||||
static inline bool early_detection_notify(struct rq *rq, u64 wallclock)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned int power_cost(int cpu, u64 demand)
|
||||
{
|
||||
return SCHED_CAPACITY_SCALE;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void note_task_waking(struct task_struct *p, u64 wallclock) { }
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
struct sched_avg_stats {
|
||||
int nr;
|
||||
int nr_misfit;
|
||||
int nr_max;
|
||||
int nr_scaled;
|
||||
};
|
||||
extern void sched_get_nr_running_avg(struct sched_avg_stats *stats);
|
||||
|
||||
#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH)
|
||||
|
||||
extern void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq);
|
||||
extern void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p);
|
||||
extern void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p);
|
||||
extern void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
|
||||
struct cfs_rq *cfs_rq);
|
||||
extern void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
|
||||
struct cfs_rq *cfs_rq);
|
||||
#else
|
||||
static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {}
|
||||
static inline void
|
||||
walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {}
|
||||
static inline void
|
||||
walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {}
|
||||
|
||||
#define walt_inc_throttled_cfs_rq_stats(...)
|
||||
#define walt_dec_throttled_cfs_rq_stats(...)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern int __weak group_balance_cpu_not_isolated(struct sched_group *sg);
|
||||
#else
|
||||
static inline int group_balance_cpu_not_isolated(struct sched_group *sg)
|
||||
{
|
||||
return group_balance_cpu(sg);
|
||||
}
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
extern int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
|
||||
int sync);
|
||||
extern int active_load_balance_cpu_stop(void *data);
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
extern void set_rq_online(struct rq *rq);
|
||||
extern void set_rq_offline(struct rq *rq);
|
||||
extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf);
|
||||
extern void calc_load_migrate(struct rq *rq);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
extern void __weak
|
||||
detach_one_task_core(struct task_struct *p, struct rq *rq,
|
||||
struct list_head *tasks);
|
||||
extern void __weak attach_tasks_core(struct list_head *tasks, struct rq *rq);
|
||||
#else
|
||||
static inline void
|
||||
detach_one_task_core(struct task_struct *p, struct rq *rq,
|
||||
struct list_head *tasks)
|
||||
{
|
||||
}
|
||||
static inline void attach_tasks_core(struct list_head *tasks, struct rq *rq) {}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
extern struct task_struct *find_process_by_pid(pid_t pid);
|
||||
|
||||
extern void enqueue_task_core(struct rq *rq, struct task_struct *p, int flags);
|
||||
extern void dequeue_task_core(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
@@ -8,6 +8,7 @@
|
||||
* See kernel/stop_machine.c
|
||||
*/
|
||||
#include "sched.h"
|
||||
#include "walt.h"
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static int
|
||||
@@ -50,12 +51,14 @@ static void
|
||||
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
add_nr_running(rq, 1);
|
||||
walt_inc_cumulative_runnable_avg(rq, p);
|
||||
}
|
||||
|
||||
static void
|
||||
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
sub_nr_running(rq, 1);
|
||||
walt_dec_cumulative_runnable_avg(rq, p);
|
||||
}
|
||||
|
||||
static void yield_task_stop(struct rq *rq)
|
||||
@@ -144,4 +147,7 @@ const struct sched_class stop_sched_class = {
|
||||
.prio_changed = prio_changed_stop,
|
||||
.switched_to = switched_to_stop,
|
||||
.update_curr = update_curr_stop,
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
.fixup_walt_sched_stats = fixup_walt_sched_stats_common,
|
||||
#endif
|
||||
};
|
||||
|
265
kernel/sched/walt.h
Normal file
265
kernel/sched/walt.h
Normal file
@@ -0,0 +1,265 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef __WALT_H
|
||||
#define __WALT_H
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
|
||||
#include <linux/sched/sysctl.h>
|
||||
#include <linux/sched/core_ctl.h>
|
||||
|
||||
#define EXITING_TASK_MARKER 0xdeaddead
|
||||
|
||||
extern void __weak
|
||||
walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
|
||||
u64 wallclock, u64 irqtime);
|
||||
|
||||
static inline void
|
||||
fixup_cumulative_runnable_avg(struct walt_sched_stats *stats,
|
||||
s64 demand_scaled_delta,
|
||||
s64 pred_demand_scaled_delta)
|
||||
{
|
||||
if (sched_disable_window_stats)
|
||||
return;
|
||||
|
||||
stats->cumulative_runnable_avg_scaled += demand_scaled_delta;
|
||||
BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0);
|
||||
|
||||
stats->pred_demands_sum_scaled += pred_demand_scaled_delta;
|
||||
BUG_ON((s64)stats->pred_demands_sum_scaled < 0);
|
||||
}
|
||||
|
||||
static inline void
|
||||
walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (sched_disable_window_stats)
|
||||
return;
|
||||
|
||||
fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled,
|
||||
p->ravg.pred_demand_scaled);
|
||||
|
||||
/*
|
||||
* Add a task's contribution to the cumulative window demand when
|
||||
*
|
||||
* (1) task is enqueued with on_rq = 1 i.e migration,
|
||||
* prio/cgroup/class change.
|
||||
* (2) task is waking for the first time in this window.
|
||||
*/
|
||||
if (p->on_rq || (p->last_sleep_ts < rq->window_start))
|
||||
walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled);
|
||||
}
|
||||
|
||||
static inline void
|
||||
walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
if (sched_disable_window_stats)
|
||||
return;
|
||||
|
||||
fixup_cumulative_runnable_avg(&rq->walt_stats,
|
||||
-(s64)p->ravg.demand_scaled,
|
||||
-(s64)p->ravg.pred_demand_scaled);
|
||||
|
||||
/*
|
||||
* on_rq will be 1 for sleeping tasks. So check if the task
|
||||
* is migrating or dequeuing in RUNNING state to change the
|
||||
* prio/cgroup/class.
|
||||
*/
|
||||
if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
|
||||
walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled);
|
||||
}
|
||||
|
||||
extern void __weak
|
||||
fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
|
||||
u16 updated_demand_scaled,
|
||||
u16 updated_pred_demand_scaled);
|
||||
extern void __weak inc_rq_walt_stats(struct rq *rq, struct task_struct *p);
|
||||
extern void __weak dec_rq_walt_stats(struct rq *rq, struct task_struct *p);
|
||||
extern void __weak fixup_busy_time(struct task_struct *p, int new_cpu);
|
||||
extern void __weak init_new_task_load(struct task_struct *p);
|
||||
extern void __weak mark_task_starting(struct task_struct *p);
|
||||
extern void __weak set_window_start(struct rq *rq);
|
||||
extern bool __weak do_pl_notif(struct rq *rq);
|
||||
|
||||
#define SCHED_HIGH_IRQ_TIMEOUT 3
|
||||
static inline u64 sched_irqload(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
s64 delta;
|
||||
|
||||
delta = get_jiffies_64() - rq->irqload_ts;
|
||||
/*
|
||||
* Current context can be preempted by irq and rq->irqload_ts can be
|
||||
* updated by irq context so that delta can be negative.
|
||||
* But this is okay and we can safely return as this means there
|
||||
* was recent irq occurrence.
|
||||
*/
|
||||
|
||||
if (delta < SCHED_HIGH_IRQ_TIMEOUT)
|
||||
return rq->avg_irqload;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int sched_cpu_high_irqload(int cpu)
|
||||
{
|
||||
return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
|
||||
}
|
||||
|
||||
static inline int exiting_task(struct task_struct *p)
|
||||
{
|
||||
return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
|
||||
}
|
||||
|
||||
static inline u64
|
||||
scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
|
||||
{
|
||||
return div64_u64(load * (u64)src_freq, (u64)dst_freq);
|
||||
}
|
||||
|
||||
extern void __weak sched_account_irqstart(int cpu, struct task_struct *curr,
|
||||
u64 wallclock);
|
||||
|
||||
static inline unsigned int max_task_load(void)
|
||||
{
|
||||
return sched_ravg_window;
|
||||
}
|
||||
|
||||
extern void __weak update_cluster_topology(void);
|
||||
|
||||
extern void __weak init_clusters(void);
|
||||
|
||||
extern void sched_account_irqtime(int cpu, struct task_struct *curr,
|
||||
u64 delta, u64 wallclock);
|
||||
|
||||
static inline int same_cluster(int src_cpu, int dst_cpu)
|
||||
{
|
||||
return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster;
|
||||
}
|
||||
|
||||
void __weak walt_sched_init_rq(struct rq *rq);
|
||||
|
||||
static inline void walt_update_last_enqueue(struct task_struct *p)
|
||||
{
|
||||
p->last_enqueued_ts = sched_ktime_clock();
|
||||
}
|
||||
|
||||
static inline bool is_suh_max(void)
|
||||
{
|
||||
return sysctl_sched_user_hint == sched_user_hint_max;
|
||||
}
|
||||
|
||||
#define DEFAULT_CGROUP_COLOC_ID 1
|
||||
static inline bool walt_should_kick_upmigrate(struct task_struct *p, int cpu)
|
||||
{
|
||||
struct related_thread_group *rtg = p->grp;
|
||||
|
||||
if (is_suh_max() && rtg && rtg->id == DEFAULT_CGROUP_COLOC_ID &&
|
||||
rtg->skip_min && p->unfilter)
|
||||
return is_min_capacity_cpu(cpu);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
extern bool is_rtgb_active(void);
|
||||
extern u64 get_rtgb_active_time(void);
|
||||
|
||||
/* utility function to update walt signals at wakeup */
|
||||
static inline void walt_try_to_wake_up(struct task_struct *p)
|
||||
{
|
||||
struct rq *rq = cpu_rq(task_cpu(p));
|
||||
struct rq_flags rf;
|
||||
u64 wallclock;
|
||||
unsigned int old_load;
|
||||
struct related_thread_group *grp = NULL;
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
old_load = task_load(p);
|
||||
wallclock = sched_ktime_clock();
|
||||
walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
|
||||
walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
|
||||
note_task_waking(p, wallclock);
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
|
||||
rcu_read_lock();
|
||||
grp = task_related_thread_group(p);
|
||||
if (update_preferred_cluster(grp, p, old_load, false))
|
||||
set_preferred_cluster(grp);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
#else /* CONFIG_SCHED_WALT */
|
||||
|
||||
static inline void walt_sched_init_rq(struct rq *rq) { }
|
||||
static inline void walt_update_last_enqueue(struct task_struct *p) { }
|
||||
static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
|
||||
int event, u64 wallclock, u64 irqtime) { }
|
||||
static inline void walt_inc_cumulative_runnable_avg(struct rq *rq,
|
||||
struct task_struct *p)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void walt_dec_cumulative_runnable_avg(struct rq *rq,
|
||||
struct task_struct *p)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
|
||||
static inline void init_new_task_load(struct task_struct *p)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mark_task_starting(struct task_struct *p) { }
|
||||
static inline void set_window_start(struct rq *rq) { }
|
||||
static inline int sched_cpu_high_irqload(int cpu) { return 0; }
|
||||
|
||||
static inline void sched_account_irqstart(int cpu, struct task_struct *curr,
|
||||
u64 wallclock)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void update_cluster_topology(void) { }
|
||||
static inline void init_clusters(void) {}
|
||||
static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
|
||||
u64 delta, u64 wallclock)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int same_cluster(int src_cpu, int dst_cpu) { return 1; }
|
||||
static inline bool do_pl_notif(struct rq *rq) { return false; }
|
||||
|
||||
static inline void
|
||||
inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { }
|
||||
|
||||
static inline void
|
||||
dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { }
|
||||
|
||||
static inline void
|
||||
fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
|
||||
u16 updated_demand_scaled,
|
||||
u16 updated_pred_demand_scaled)
|
||||
{
|
||||
}
|
||||
|
||||
static inline u64 sched_irqload(int cpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool walt_should_kick_upmigrate(struct task_struct *p, int cpu)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline u64 get_rtgb_active_time(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define walt_try_to_wake_up(a) {}
|
||||
|
||||
#endif /* CONFIG_SCHED_WALT */
|
||||
|
||||
#endif
|
219
kernel/sysctl.c
219
kernel/sysctl.c
@@ -64,6 +64,7 @@
|
||||
#include <linux/binfmts.h>
|
||||
#include <linux/sched/sysctl.h>
|
||||
#include <linux/sched/coredump.h>
|
||||
#include <linux/sched/stat.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/mount.h>
|
||||
@@ -126,6 +127,7 @@ static int sixty = 60;
|
||||
#endif
|
||||
|
||||
static int __maybe_unused neg_one = -1;
|
||||
|
||||
static int __maybe_unused two = 2;
|
||||
static int __maybe_unused four = 4;
|
||||
static unsigned long zero_ul;
|
||||
@@ -140,7 +142,12 @@ static int ten_thousand = 10000;
|
||||
static int six_hundred_forty_kb = 640 * 1024;
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
static int neg_three = -3;
|
||||
static int three = 3;
|
||||
static int two_hundred_fifty_five = 255;
|
||||
const int sched_user_hint_max = 1000;
|
||||
static unsigned int ns_per_sec = NSEC_PER_SEC;
|
||||
static unsigned int one_hundred_thousand = 100000;
|
||||
#endif
|
||||
|
||||
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
|
||||
@@ -231,6 +238,10 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
|
||||
#endif
|
||||
static int proc_dopipe_max_size(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos);
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
static int proc_douintvec_minmax_schedhyst(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MAGIC_SYSRQ
|
||||
/* Note: sysrq code uses its own private copy */
|
||||
@@ -328,6 +339,172 @@ static struct ctl_table kern_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
{
|
||||
.procname = "sched_user_hint",
|
||||
.data = &sysctl_sched_user_hint,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = walt_proc_user_hint_handler,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = (void *)&sched_user_hint_max,
|
||||
},
|
||||
{
|
||||
.procname = "sched_window_stats_policy",
|
||||
.data = &sysctl_sched_window_stats_policy,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &four,
|
||||
},
|
||||
{
|
||||
.procname = "sched_cpu_high_irqload",
|
||||
.data = &sysctl_sched_cpu_high_irqload,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_group_upmigrate",
|
||||
.data = &sysctl_sched_group_upmigrate_pct,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = walt_proc_group_thresholds_handler,
|
||||
.extra1 = &sysctl_sched_group_downmigrate_pct,
|
||||
},
|
||||
{
|
||||
.procname = "sched_group_downmigrate",
|
||||
.data = &sysctl_sched_group_downmigrate_pct,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = walt_proc_group_thresholds_handler,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &sysctl_sched_group_upmigrate_pct,
|
||||
},
|
||||
{
|
||||
.procname = "sched_boost",
|
||||
.data = &sysctl_sched_boost,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = sched_boost_handler,
|
||||
.extra1 = &neg_three,
|
||||
.extra2 = &three,
|
||||
},
|
||||
{
|
||||
.procname = "sched_conservative_pl",
|
||||
.data = &sysctl_sched_conservative_pl,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "sched_walt_rotate_big_tasks",
|
||||
.data = &sysctl_sched_walt_rotate_big_tasks,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "sched_min_task_util_for_boost",
|
||||
.data = &sysctl_sched_min_task_util_for_boost,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &one_thousand,
|
||||
},
|
||||
{
|
||||
.procname = "sched_min_task_util_for_colocation",
|
||||
.data = &sysctl_sched_min_task_util_for_colocation,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &one_thousand,
|
||||
},
|
||||
{
|
||||
.procname = "sched_asym_cap_sibling_freq_match_pct",
|
||||
.data = &sysctl_sched_asym_cap_sibling_freq_match_pct,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ONE,
|
||||
.extra2 = &one_hundred,
|
||||
},
|
||||
{
|
||||
.procname = "sched_coloc_downmigrate_ns",
|
||||
.data = &sysctl_sched_coloc_downmigrate_ns,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax,
|
||||
},
|
||||
{
|
||||
.procname = "sched_task_unfilter_nr_windows",
|
||||
.data = &sysctl_sched_task_unfilter_nr_windows,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ONE,
|
||||
.extra2 = &two_hundred_fifty_five,
|
||||
},
|
||||
{
|
||||
.procname = "sched_busy_hysteresis_enable_cpus",
|
||||
.data = &sysctl_sched_busy_hyst_enable_cpus,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax_schedhyst,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &two_hundred_fifty_five,
|
||||
},
|
||||
{
|
||||
.procname = "sched_busy_hyst_ns",
|
||||
.data = &sysctl_sched_busy_hyst,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax_schedhyst,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &ns_per_sec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_coloc_busy_hysteresis_enable_cpus",
|
||||
.data = &sysctl_sched_coloc_busy_hyst_enable_cpus,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax_schedhyst,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &two_hundred_fifty_five,
|
||||
},
|
||||
{
|
||||
.procname = "sched_coloc_busy_hyst_ns",
|
||||
.data = &sysctl_sched_coloc_busy_hyst,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax_schedhyst,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &ns_per_sec,
|
||||
},
|
||||
{
|
||||
.procname = "sched_coloc_busy_hyst_max_ms",
|
||||
.data = &sysctl_sched_coloc_busy_hyst_max_ms,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax_schedhyst,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = &one_hundred_thousand,
|
||||
},
|
||||
{
|
||||
.procname = "sched_ravg_window_nr_ticks",
|
||||
.data = &sysctl_sched_ravg_window_nr_ticks,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = sched_ravg_window_handler,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
{
|
||||
.procname = "sched_min_granularity_ns",
|
||||
@@ -2874,6 +3051,19 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SCHED_WALT
|
||||
static int proc_douintvec_minmax_schedhyst(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
|
||||
|
||||
if (!ret && write)
|
||||
sched_update_hyst_times();
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos,
|
||||
@@ -3341,6 +3531,29 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
|
||||
return err;
|
||||
}
|
||||
|
||||
static int do_proc_douintvec_rwin(bool *negp, unsigned long *lvalp,
|
||||
int *valp, int write, void *data)
|
||||
{
|
||||
if (write) {
|
||||
if (*lvalp == 0 || *lvalp == 2 || *lvalp == 5)
|
||||
*valp = *lvalp;
|
||||
else
|
||||
return -EINVAL;
|
||||
} else {
|
||||
*negp = false;
|
||||
*lvalp = *valp;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int proc_douintvec_ravg_window(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return do_proc_dointvec(table, write, buffer, lenp, ppos,
|
||||
do_proc_douintvec_rwin, NULL);
|
||||
}
|
||||
|
||||
#else /* CONFIG_PROC_SYSCTL */
|
||||
|
||||
int proc_dostring(struct ctl_table *table, int write,
|
||||
@@ -3410,6 +3623,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
int proc_douintvec_ravg_window(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_PROC_SYSCTL */
|
||||
|
||||
#if defined(CONFIG_SYSCTL)
|
||||
|
Reference in New Issue
Block a user