
sched/walt: Introduce WALT as a module

WALT (Window Assisted Load Tracking) improves scheduler behavior from a
power, performance, and thermal perspective. Bring it in as a module.

Change-Id: Ibeb6c0480796e8d8fcd81e1bdda7a117ae02c980
Signed-off-by: Shaleen Agrawal <[email protected]>
Shaleen Agrawal · 4 years ago · parent · commit d3b261dbd2

+ 2 - 0
Kconfig

@@ -30,3 +30,5 @@ source "lib/Kconfig"
 source "lib/Kconfig.debug"
 
 source "Documentation/Kconfig"
+
+source "kernel/sched/walt/Kconfig"

+ 1 - 0
arch/arm64/configs/vendor/lahaina_GKI.config

@@ -230,3 +230,4 @@ CONFIG_EDAC_QCOM=m
 CONFIG_EDAC_QCOM_LLCC_PANIC_ON_UE=y
 # CONFIG_EDAC_QCOM_LLCC_PANIC_ON_CE is not set
 CONFIG_MSM_BOOT_STATS=m
+CONFIG_ARM_QCOM_CPUFREQ_HW=m

+ 0 - 3
include/trace/events/preemptirq.h

@@ -3,9 +3,6 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM preemptirq
 
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH trace/events
-
 #if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_PREEMPTIRQ_H
 

+ 0 - 56
include/trace/hooks/restricted_preemptirq.h

@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (c) 2020, The Linux Foundation. All rights reserved.
- */
-#if !defined(_TRACE_RESTRICTED_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_RESTRICTED_PREEMPTIRQ_H
-
-#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM restricted_preemptirq
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH trace/hooks
-
-#include <linux/tracepoint.h>
-#include <trace/hooks/vendor_hooks.h>
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-DECLARE_RESTRICTED_HOOK(restricted_irq_disable,
-			TP_PROTO(unsigned long ip, unsigned long parent_ip),
-			TP_ARGS(ip, parent_ip), 1);
-
-DECLARE_RESTRICTED_HOOK(restricted_irq_enable,
-		       TP_PROTO(unsigned long ip, unsigned long parent_ip),
-		       TP_ARGS(ip, parent_ip), 1);
-
-#else
-#define trace_restricted_irq_enable(ip, parent_ip)
-#define trace_restricted_irq_disable(ip, parent_ip)
-#endif /* CONFIG_TRACE_IRQFLAGS */
-
-#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
-DECLARE_RESTRICTED_HOOK(restricted_preempt_disable,
-		       TP_PROTO(unsigned long ip, unsigned long parent_ip),
-		       TP_ARGS(ip, parent_ip), 1);
-
-DECLARE_RESTRICTED_HOOK(restricted_preempt_enable,
-			TP_PROTO(unsigned long ip, unsigned long parent_ip),
-			TP_ARGS(ip, parent_ip), 1);
-
-#else
-#define trace_restricted_preempt_enable(ip, parent_ip)
-#define trace_restricted_preempt_disable(ip, parent_ip)
-#endif /* CONFIG_TRACE_PREEMPT_TOGGLE */
-
-#include <trace/define_trace.h>
-
-#else  /* ! CONFIG_PREEMPTIRQ_TRACEPOINTS */
-#define trace_restricted_irq_enable(...)
-#define trace_restricted_irq_disable(...)
-#define trace_restricted_preempt_enable(...)
-#define trace_restricted_preempt_disable(...)
-#endif /* ! CONFIG_PREEMPTIRQ_TRACEPOINTS */
-
-#endif /* TRACE_RESTRICTED_PREEMPTIRQ_H || TRACE_HEADER_MULTI_READ */

+ 1 - 1
kernel/sched/Makefile

@@ -26,7 +26,6 @@ obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o
 
-obj-$(CONFIG_SCHED_WALT) += walt.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
@@ -37,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
 obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
+obj-$(CONFIG_SCHED_WALT) += walt/

+ 1 - 11
kernel/sched/core.c

@@ -4333,10 +4333,6 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	if (panic_on_warn)
 		panic("scheduling while atomic\n");
 
-#if defined(CONFIG_PANIC_ON_SCHED_BUG) && defined(CONFIG_SCHED_WALT)
-	BUG();
-#endif
-
 	trace_android_rvh_schedule_bug(NULL);
 
 	dump_stack();
@@ -7199,9 +7195,6 @@ void __init sched_init_smp(void)
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
 		BUG();
-#ifdef CONFIG_SCHED_WALT
-	cpumask_copy(&current->wts.cpus_requested, cpu_possible_mask);
-#endif
 
 	sched_init_granularity();
 
@@ -7490,9 +7483,6 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 		pr_err("Preemption disabled at:");
 		print_ip_sym(KERN_ERR, preempt_disable_ip);
 	}
-#ifdef CONFIG_PANIC_ON_SCHED_BUG
-	BUG();
-#endif
 
 	trace_android_rvh_schedule_bug(NULL);
 
@@ -8648,7 +8638,7 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_uclamp_ls_read_u64,
 		.write_u64 = cpu_uclamp_ls_write_u64,
 	},
-#endif /* CONFIG_UCLAMP_TASK_GROUP */
+#endif
 	{ }	/* terminate */
 };
 

+ 1 - 0
kernel/sched/debug.c

@@ -83,6 +83,7 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 #include "features.h"
 };
 EXPORT_SYMBOL_GPL(sched_feat_keys);
+
 #undef SCHED_FEAT
 
 static void sched_feat_disable(int i)

+ 31 - 0
kernel/sched/walt/Kconfig

@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# QTI WALT based scheduler
+#
+menu "QTI WALT based scheduler features"
+
+config SCHED_WALT
+	tristate "Support window based load tracking"
+	depends on SMP
+	help
+	  This feature allows the scheduler to maintain a tunable,
+	  window-based set of metrics for tasks and runqueues. These
+	  metrics can be used to guide task placement as well as task
+	  frequency requirements for cpufreq governors.
+
+config SCHED_WALT_DEBUG
+	tristate "WALT debug module"
+	select TRACE_PREEMPT_TOGGLE
+	select TRACE_IRQFLAGS
+	help
+	  This module provides a means of debugging long preempt and
+	  irq disable sections, which helps in identifying scheduling
+	  latencies. The module relies on the preemptirq trace hooks and
+	  prints a stacktrace to ftrace upon long preempt and irq disable
+	  events. Sysctl knobs are available for the user to configure
+	  the thresholds.
+
+	  This module can also be used to crash the system to catch
+	  issues in scenarios such as RT throttling and sleeping in
+	  atomic context.
+endmenu
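
Since both new options are tristate, a vendor defconfig fragment (such as the GKI fragment touched above) could request the modular build with the sketch below; the exact fragment and whether the debug module is enabled are assumptions, not part of this change:

    CONFIG_SCHED_WALT=m
    CONFIG_SCHED_WALT_DEBUG=m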

+ 10 - 0
kernel/sched/walt/Makefile

@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+KCOV_INSTRUMENT := n
+KCSAN_SANITIZE := n
+
+obj-$(CONFIG_SCHED_WALT) += sched-walt.o
+sched-walt-$(CONFIG_SCHED_WALT) := walt.o boost.o sched_avg.o qc_vas.o core_ctl.o trace.o input-boost.o sysctl.o cpufreq_walt.o fixup.o walt_lb.o walt_rt.o walt_cfs.o
+
+obj-$(CONFIG_SCHED_WALT_DEBUG) += sched-walt-debug.o
+sched-walt-debug-$(CONFIG_SCHED_WALT_DEBUG) := walt_debug.o preemptirq_long.o

+ 301 - 0
kernel/sched/walt/boost.c

@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/of.h>
+#include <linux/sched/core_ctl.h>
+
+#include "walt.h"
+#include "trace.h"
+
+/*
+ * Scheduler boost is a mechanism to temporarily place tasks on CPUs
+ * with a higher capacity than the ones where the tasks would have
+ * normally ended up given their load characteristics. Any entity
+ * enabling boost is responsible for disabling it as well.
+ */
+
+static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
+static DEFINE_MUTEX(boost_mutex);
+
+struct task_group *task_group_topapp;
+struct task_group *task_group_foreground;
+
+void walt_init_tg(struct task_group *tg)
+{
+	struct walt_task_group *wtg;
+
+	wtg = (struct walt_task_group *) tg->android_vendor_data1;
+
+	wtg->colocate = false;
+	wtg->sched_boost_enable[NO_BOOST] = false;
+	wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true;
+	wtg->sched_boost_enable[CONSERVATIVE_BOOST] = false;
+	wtg->sched_boost_enable[RESTRAINED_BOOST] = false;
+}
+
+void walt_init_topapp_tg(struct task_group *tg)
+{
+	struct walt_task_group *wtg;
+
+	wtg = (struct walt_task_group *) tg->android_vendor_data1;
+
+	wtg->colocate = true;
+	wtg->sched_boost_enable[NO_BOOST] = false;
+	wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true;
+	wtg->sched_boost_enable[CONSERVATIVE_BOOST] = true;
+	wtg->sched_boost_enable[RESTRAINED_BOOST] = false;
+}
+
+void walt_init_foreground_tg(struct task_group *tg)
+{
+	struct walt_task_group *wtg;
+
+	wtg = (struct walt_task_group *) tg->android_vendor_data1;
+
+	wtg->colocate = false;
+	wtg->sched_boost_enable[NO_BOOST] = false;
+	wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true;
+	wtg->sched_boost_enable[CONSERVATIVE_BOOST] = true;
+	wtg->sched_boost_enable[RESTRAINED_BOOST] = false;
+}
+
+/*
+ * Scheduler boost type and boost policy might at first seem unrelated,
+ * however, there exists a connection between them that will allow us
+ * to use them interchangeably during placement decisions. We'll explain
+ * the connection here in one possible way so that the implications are
+ * clear when looking at placement policies.
+ *
+ * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
+ * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
+ * neither be none nor RESTRAINED.
+ */
+static void set_boost_policy(int type)
+{
+	if (type == NO_BOOST || type == RESTRAINED_BOOST) {
+		boost_policy = SCHED_BOOST_NONE;
+		return;
+	}
+
+	if (boost_policy_dt) {
+		boost_policy = boost_policy_dt;
+		return;
+	}
+
+	if (hmp_capable()) {
+		boost_policy = SCHED_BOOST_ON_BIG;
+		return;
+	}
+
+	boost_policy = SCHED_BOOST_ON_ALL;
+}
+
+static bool verify_boost_params(int type)
+{
+	return type >= RESTRAINED_BOOST_DISABLE && type <= RESTRAINED_BOOST;
+}
+
+static void sched_no_boost_nop(void)
+{
+}
+
+static void sched_full_throttle_boost_enter(void)
+{
+	core_ctl_set_boost(true);
+	walt_enable_frequency_aggregation(true);
+}
+
+static void sched_full_throttle_boost_exit(void)
+{
+	core_ctl_set_boost(false);
+	walt_enable_frequency_aggregation(false);
+}
+
+static void sched_conservative_boost_enter(void)
+{
+}
+
+static void sched_conservative_boost_exit(void)
+{
+}
+
+static void sched_restrained_boost_enter(void)
+{
+	walt_enable_frequency_aggregation(true);
+}
+
+static void sched_restrained_boost_exit(void)
+{
+	walt_enable_frequency_aggregation(false);
+}
+
+struct sched_boost_data {
+	int	refcount;
+	void	(*enter)(void);
+	void	(*exit)(void);
+};
+
+static struct sched_boost_data sched_boosts[] = {
+	[NO_BOOST] = {
+		.refcount	= 0,
+		.enter		= sched_no_boost_nop,
+		.exit		= sched_no_boost_nop,
+	},
+	[FULL_THROTTLE_BOOST] = {
+		.refcount	= 0,
+		.enter		= sched_full_throttle_boost_enter,
+		.exit		= sched_full_throttle_boost_exit,
+	},
+	[CONSERVATIVE_BOOST] = {
+		.refcount	= 0,
+		.enter		= sched_conservative_boost_enter,
+		.exit		= sched_conservative_boost_exit,
+	},
+	[RESTRAINED_BOOST] = {
+		.refcount	= 0,
+		.enter		= sched_restrained_boost_enter,
+		.exit		= sched_restrained_boost_exit,
+	},
+};
+
+#define SCHED_BOOST_START FULL_THROTTLE_BOOST
+#define SCHED_BOOST_END (RESTRAINED_BOOST + 1)
+
+static int sched_effective_boost(void)
+{
+	int i;
+
+	/*
+	 * The boosts are sorted in descending order by
+	 * priority.
+	 */
+	for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) {
+		if (sched_boosts[i].refcount >= 1)
+			return i;
+	}
+
+	return NO_BOOST;
+}
+
+static void sched_boost_disable(int type)
+{
+	struct sched_boost_data *sb = &sched_boosts[type];
+	int next_boost;
+
+	if (sb->refcount <= 0)
+		return;
+
+	sb->refcount--;
+
+	if (sb->refcount)
+		return;
+
+	/*
+	 * This boost's refcount becomes zero, so it must
+	 * be disabled. Disable it first and then apply
+	 * the next boost.
+	 */
+	sb->exit();
+
+	next_boost = sched_effective_boost();
+	sched_boosts[next_boost].enter();
+}
+
+static void sched_boost_enable(int type)
+{
+	struct sched_boost_data *sb = &sched_boosts[type];
+	int next_boost, prev_boost = sched_boost_type;
+
+	sb->refcount++;
+
+	if (sb->refcount != 1)
+		return;
+
+	/*
+	 * This is the first enable request for this boost type.
+	 * Take this new request and find the next boost
+	 * by aggregating all the enabled boosts. If there
+	 * is a change, disable the previous boost and enable
+	 * the next boost.
+	 */
+
+	next_boost = sched_effective_boost();
+	if (next_boost == prev_boost)
+		return;
+
+	sched_boosts[prev_boost].exit();
+	sched_boosts[next_boost].enter();
+}
+
+static void sched_boost_disable_all(void)
+{
+	int i;
+
+	for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) {
+		if (sched_boosts[i].refcount > 0) {
+			sched_boosts[i].exit();
+			sched_boosts[i].refcount = 0;
+		}
+	}
+}
+
+static void _sched_set_boost(int type)
+{
+	if (type == 0)
+		sched_boost_disable_all();
+	else if (type > 0)
+		sched_boost_enable(type);
+	else
+		sched_boost_disable(-type);
+
+	/*
+	 * sysctl_sched_boost holds the boost request from
+	 * user space which could be different from the
+	 * effectively enabled boost. Update the effective
+	 * boost here.
+	 */
+
+	sched_boost_type = sched_effective_boost();
+	sysctl_sched_boost = sched_boost_type;
+	set_boost_policy(sysctl_sched_boost);
+	trace_sched_set_boost(sysctl_sched_boost);
+}
+
+int sched_set_boost(int type)
+{
+	int ret = 0;
+
+	mutex_lock(&boost_mutex);
+	if (verify_boost_params(type))
+		_sched_set_boost(type);
+	else
+		ret = -EINVAL;
+	mutex_unlock(&boost_mutex);
+	return ret;
+}
+
+int sched_boost_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	unsigned int *data = (unsigned int *)table->data;
+
+	mutex_lock(&boost_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto done;
+
+	if (verify_boost_params(*data))
+		_sched_set_boost(*data);
+	else
+		ret = -EINVAL;
+
+done:
+	mutex_unlock(&boost_mutex);
+	return ret;
+}
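
The boost interface above is reference counted: a positive type takes a reference, the matching negative value drops it, and 0 clears all boosts. A minimal, hypothetical in-kernel caller might look like the following sketch; the header that exports sched_set_boost() and the boost enum values is assumed to come from the WALT headers, and none of this is part of the change itself:

/*
 * Sketch only: exercises the refcounted sched_set_boost() API shown
 * above. Enum values such as FULL_THROTTLE_BOOST come from the WALT
 * headers; the exact include is an assumption.
 */
static void example_boosted_section(void)
{
	/* Take one reference on the full-throttle boost. */
	if (sched_set_boost(FULL_THROTTLE_BOOST))
		return;

	/* ... latency-critical work runs with the boost held ... */

	/* Drop the reference; a negative value disables that boost type. */
	sched_set_boost(-FULL_THROTTLE_BOOST);
}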

+ 1307 - 0
kernel/sched/walt/core_ctl.c

@@ -0,0 +1,1307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2014-2021, The Linux Foundation. All rights reserved.
+ */
+
+#define pr_fmt(fmt)	"core_ctl: " fmt
+
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/syscore_ops.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched/core_ctl.h>
+
+#include "walt.h"
+#include "trace.h"
+
+struct cluster_data {
+	bool			inited;
+	unsigned int		min_cpus;
+	unsigned int		max_cpus;
+	unsigned int		offline_delay_ms;
+	unsigned int		busy_up_thres[MAX_CPUS_PER_CLUSTER];
+	unsigned int		busy_down_thres[MAX_CPUS_PER_CLUSTER];
+	unsigned int		active_cpus;
+	unsigned int		num_cpus;
+	unsigned int		nr_paused_cpus;
+	unsigned int		nr_not_preferred_cpus;
+	cpumask_t		cpu_mask;
+	unsigned int		need_cpus;
+	unsigned int		task_thres;
+	unsigned int		max_nr;
+	unsigned int		nr_prev_assist;
+	unsigned int		nr_prev_assist_thresh;
+	s64			need_ts;
+	struct list_head	lru;
+	bool			enable;
+	int			nrrun;
+	unsigned int		first_cpu;
+	unsigned int		boost;
+	struct kobject		kobj;
+	unsigned int		strict_nrrun;
+};
+
+struct cpu_data {
+	bool			is_busy;
+	unsigned int		busy;
+	unsigned int		cpu;
+	bool			not_preferred;
+	struct cluster_data	*cluster;
+	struct list_head	sib;
+	bool			paused_by_us;
+};
+
+static DEFINE_PER_CPU(struct cpu_data, cpu_state);
+static struct cluster_data cluster_state[MAX_CLUSTERS];
+static unsigned int num_clusters;
+
+#define for_each_cluster(cluster, idx) \
+	for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\
+		idx++)
+
+/* single core_ctl thread for all pause/unpause core_ctl operations */
+struct task_struct *core_ctl_thread;
+
+/*
+ * Single lock for the single core_ctl thread;
+ * protects the core_ctl_pending flag.
+ */
+spinlock_t core_ctl_pending_lock;
+bool core_ctl_pending;
+
+static DEFINE_SPINLOCK(state_lock);
+static void apply_need(struct cluster_data *state);
+static void wake_up_core_ctl_thread(void);
+static bool initialized;
+
+ATOMIC_NOTIFIER_HEAD(core_ctl_notifier);
+static unsigned int last_nr_big;
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster);
+
+/* ========================= sysfs interface =========================== */
+
+static ssize_t store_min_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->min_cpus = min(val, state->num_cpus);
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_min_cpus(const struct cluster_data *state, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus);
+}
+
+static ssize_t store_max_cpus(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->max_cpus = min(val, state->num_cpus);
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_max_cpus(const struct cluster_data *state, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus);
+}
+
+static ssize_t store_offline_delay_ms(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->offline_delay_ms = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_task_thres(const struct cluster_data *state, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->task_thres);
+}
+
+static ssize_t store_task_thres(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	if (val < state->num_cpus)
+		return -EINVAL;
+
+	state->task_thres = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_nr_prev_assist_thresh(const struct cluster_data *state,
+								char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->nr_prev_assist_thresh);
+}
+
+static ssize_t store_nr_prev_assist_thresh(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->nr_prev_assist_thresh = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_offline_delay_ms(const struct cluster_data *state,
+				     char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms);
+}
+
+static ssize_t store_busy_up_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u %u %u\n",
+			&val[0], &val[1], &val[2], &val[3],
+			&val[4], &val[5]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	if (ret == 1) {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_up_thres[i] = val[0];
+	} else {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_up_thres[i] = val[i];
+	}
+	apply_need(state);
+	return count;
+}
+
+static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%u ",
+				  state->busy_up_thres[i]);
+
+	count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+static ssize_t store_busy_down_thres(struct cluster_data *state,
+					const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u %u %u\n",
+			&val[0], &val[1], &val[2], &val[3],
+			&val[4], &val[5]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	if (ret == 1) {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_down_thres[i] = val[0];
+	} else {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_down_thres[i] = val[i];
+	}
+	apply_need(state);
+	return count;
+}
+
+static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%u ",
+				  state->busy_down_thres[i]);
+
+	count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+static ssize_t store_enable(struct cluster_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+	bool bval;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	bval = !!val;
+	if (bval != state->enable) {
+		state->enable = bval;
+		apply_need(state);
+	}
+
+	return count;
+}
+
+static ssize_t show_enable(const struct cluster_data *state, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable);
+}
+
+static ssize_t show_need_cpus(const struct cluster_data *state, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus);
+}
+
+static ssize_t show_active_cpus(const struct cluster_data *state, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus);
+}
+
+static ssize_t show_global_state(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *c;
+	struct cluster_data *cluster;
+	ssize_t count = 0;
+	unsigned int cpu;
+
+	spin_lock_irq(&state_lock);
+	for_each_possible_cpu(cpu) {
+		c = &per_cpu(cpu_state, cpu);
+		cluster = c->cluster;
+		if (!cluster || !cluster->inited)
+			continue;
+
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"CPU%u\n", cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tCPU: %u\n", c->cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tOnline: %u\n",
+					cpu_online(c->cpu));
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tPaused: %u\n",
+					!cpu_active(c->cpu));
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tFirst CPU: %u\n",
+						cluster->first_cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tBusy%%: %u\n", c->busy);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tIs busy: %u\n", c->is_busy);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tNot preferred: %u\n",
+						c->not_preferred);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+					"\tNr running: %u\n", cluster->nrrun);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+			"\tActive CPUs: %u\n", get_active_cpu_count(cluster));
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				"\tNeed CPUs: %u\n", cluster->need_cpus);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				"\tNr paused CPUs: %u\n",
+						cluster->nr_paused_cpus);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				"\tBoost: %u\n", (unsigned int) cluster->boost);
+	}
+	spin_unlock_irq(&state_lock);
+
+	return count;
+}
+
+static ssize_t store_not_preferred(struct cluster_data *state,
+				   const char *buf, size_t count)
+{
+	struct cpu_data *c;
+	unsigned int i;
+	unsigned int val[MAX_CPUS_PER_CLUSTER];
+	unsigned long flags;
+	int ret;
+	int not_preferred_count = 0;
+
+	ret = sscanf(buf, "%u %u %u %u %u %u\n",
+			&val[0], &val[1], &val[2], &val[3],
+			&val[4], &val[5]);
+	if (ret != state->num_cpus)
+		return -EINVAL;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, i + state->first_cpu);
+		c->not_preferred = val[i];
+		not_preferred_count += !!val[i];
+	}
+	state->nr_not_preferred_cpus = not_preferred_count;
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return count;
+}
+
+static ssize_t show_not_preferred(const struct cluster_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, i + state->first_cpu);
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				"CPU#%d: %u\n", c->cpu, c->not_preferred);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return count;
+}
+
+struct core_ctl_attr {
+	struct attribute	attr;
+	ssize_t			(*show)(const struct cluster_data *cd, char *c);
+	ssize_t			(*store)(struct cluster_data *cd, const char *c,
+							size_t count);
+};
+
+#define core_ctl_attr_ro(_name)		\
+static struct core_ctl_attr _name =	\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name)			\
+static struct core_ctl_attr _name =		\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(nr_prev_assist_thresh);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(active_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+core_ctl_attr_rw(enable);
+
+static struct attribute *default_attrs[] = {
+	&min_cpus.attr,
+	&max_cpus.attr,
+	&offline_delay_ms.attr,
+	&busy_up_thres.attr,
+	&busy_down_thres.attr,
+	&task_thres.attr,
+	&nr_prev_assist_thresh.attr,
+	&enable.attr,
+	&need_cpus.attr,
+	&active_cpus.attr,
+	&global_state.attr,
+	&not_preferred.attr,
+	NULL
+};
+
+#define to_cluster_data(k) container_of(k, struct cluster_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct cluster_data *data = to_cluster_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->show)
+		ret = cattr->show(data, buf);
+
+	return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct cluster_data *data = to_cluster_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->store)
+		ret = cattr->store(data, buf, count);
+
+	return ret;
+}
+
+static const struct sysfs_ops sysfs_ops = {
+	.show	= show,
+	.store	= store,
+};
+
+static struct kobj_type ktype_core_ctl = {
+	.sysfs_ops	= &sysfs_ops,
+	.default_attrs	= default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+static struct sched_avg_stats nr_stats[WALT_NR_CPUS];
+
+/*
+ * nr_need:
+ *   Number of tasks running on this cluster plus
+ *   tasks running on higher capacity clusters.
+ *   Used to find out how many CPUs are needed in this cluster.
+ *
+ * For example:
+ *   On a dual cluster system with 4 min capacity
+ *   CPUs and 4 max capacity CPUs, if there are
+ *   4 small tasks running on min capacity CPUs
+ *   and 2 big tasks running on 2 max capacity
+ *   CPUs, nr_need has to be 6 for min capacity
+ *   cluster and 2 for max capacity cluster.
+ *   This is because the min capacity cluster has to
+ *   account for tasks running on the max capacity
+ *   cluster, so that the min capacity cluster
+ *   can be ready to accommodate tasks running on max
+ *   capacity CPUs if the demand of tasks goes down.
+ */
+static int compute_cluster_nr_need(int index)
+{
+	int cpu;
+	struct cluster_data *cluster;
+	int nr_need = 0;
+
+	for_each_cluster(cluster, index) {
+		for_each_cpu(cpu, &cluster->cpu_mask)
+			nr_need += nr_stats[cpu].nr;
+	}
+
+	return nr_need;
+}
+
+/*
+ * prev_misfit_need:
+ *   Tasks running on a smaller capacity cluster which
+ *   need to be migrated to a higher capacity cluster.
+ *   Used to find out how many tasks need higher capacity CPUs.
+ *
+ * For example:
+ *   On a dual cluster system with 4 min capacity
+ *   CPUs and 4 max capacity CPUs, if there are
+ *   2 small tasks and 2 big tasks running on
+ *   min capacity CPUs and no tasks running on
+ *   max capacity, prev_misfit_need of min capacity
+ *   cluster will be 0 and prev_misfit_need of
+ *   max capacity cluster will be 2.
+ */
+static int compute_prev_cluster_misfit_need(int index)
+{
+	int cpu;
+	struct cluster_data *prev_cluster;
+	int prev_misfit_need = 0;
+
+	/*
+	 * Lowest capacity cluster does not have to
+	 * accommodate any misfit tasks.
+	 */
+	if (index == 0)
+		return 0;
+
+	prev_cluster = &cluster_state[index - 1];
+
+	for_each_cpu(cpu, &prev_cluster->cpu_mask)
+		prev_misfit_need += nr_stats[cpu].nr_misfit;
+
+	return prev_misfit_need;
+}
+
+static int compute_cluster_max_nr(int index)
+{
+	int cpu;
+	struct cluster_data *cluster = &cluster_state[index];
+	int max_nr = 0;
+
+	for_each_cpu(cpu, &cluster->cpu_mask)
+		max_nr = max(max_nr, nr_stats[cpu].nr_max);
+
+	return max_nr;
+}
+
+static int cluster_real_big_tasks(int index)
+{
+	int nr_big = 0;
+	int cpu;
+	struct cluster_data *cluster = &cluster_state[index];
+
+	if (index == 0) {
+		for_each_cpu(cpu, &cluster->cpu_mask)
+			nr_big += nr_stats[cpu].nr_misfit;
+	} else {
+		for_each_cpu(cpu, &cluster->cpu_mask)
+			nr_big += nr_stats[cpu].nr;
+	}
+
+	return nr_big;
+}
+
+/*
+ * prev_nr_need_assist:
+ *   Tasks that are eligible to run on the previous
+ *   cluster but cannot run because of insufficient
+ *   CPUs there. prev_nr_need_assist is indicative
+ *   of the number of CPUs in this cluster that should
+ *   assist its previous cluster to make up for
+ *   insufficient CPUs there.
+ *
+ * For example:
+ *   On a tri-cluster system with 4 min capacity
+ *   CPUs, 3 intermediate capacity CPUs and 1
+ *   max capacity CPU, if there are 4 small
+ *   tasks running on min capacity CPUs, 4 big
+ *   tasks running on intermediate capacity CPUs
+ *   and no tasks running on max capacity CPU,
+ *   prev_nr_need_assist for min & max capacity
+ *   clusters will be 0, but for the intermediate
+ *   capacity cluster prev_nr_need_assist will
+ *   be 1, as it has only 3 CPUs while there are 4 big
+ *   tasks to be served.
+ */
+static int prev_cluster_nr_need_assist(int index)
+{
+	int need = 0;
+	int cpu;
+	struct cluster_data *prev_cluster;
+
+	if (index == 0)
+		return 0;
+
+	index--;
+	prev_cluster = &cluster_state[index];
+
+	/*
+	 * The next cluster should not assist while there are paused
+	 * CPUs in this cluster.
+	 */
+	if (prev_cluster->nr_paused_cpus)
+		return 0;
+
+	for_each_cpu(cpu, &prev_cluster->cpu_mask)
+		need += nr_stats[cpu].nr;
+
+	need += compute_prev_cluster_misfit_need(index);
+
+	if (need > prev_cluster->active_cpus)
+		need = need - prev_cluster->active_cpus;
+	else
+		need = 0;
+
+	return need;
+}
+
+/*
+ * This is only implemented for the min capacity cluster.
+ *
+ * Bringing a little CPU out of pause and using it more does not
+ * hurt power as much as bringing big CPUs out of pause.
+ *
+ * The little cluster provides the help needed by the other clusters:
+ * we take nr_scaled (which gives better resolution) and find the
+ * total nr in the system. Then we take out the active higher
+ * capacity CPUs from that nr and treat the remaining nr as the
+ * strict requirement, i.e. that many little CPUs are needed.
+ */
+static int compute_cluster_nr_strict_need(int index)
+{
+	int cpu;
+	struct cluster_data *cluster;
+	int nr_strict_need = 0;
+
+	if (index != 0)
+		return 0;
+
+	for_each_cluster(cluster, index) {
+		int nr_scaled = 0;
+		int active_cpus = cluster->active_cpus;
+
+		for_each_cpu(cpu, &cluster->cpu_mask)
+			nr_scaled += nr_stats[cpu].nr_scaled;
+
+		nr_scaled /= 100;
+
+		/*
+		 * For the little cluster, nr_scaled becomes the nr_strict;
+		 * for the other clusters, the overflow is counted towards
+		 * the little cluster's need.
+		 */
+		if (index == 0)
+			nr_strict_need += nr_scaled;
+		else
+			nr_strict_need += max(0, nr_scaled - active_cpus);
+	}
+
+	return nr_strict_need;
+}
+
+static void update_running_avg(void)
+{
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+	unsigned long flags;
+	int big_avg = 0;
+
+	sched_get_nr_running_avg(nr_stats);
+
+	spin_lock_irqsave(&state_lock, flags);
+	for_each_cluster(cluster, index) {
+		int nr_need, prev_misfit_need;
+
+		if (!cluster->inited)
+			continue;
+
+		nr_need = compute_cluster_nr_need(index);
+		prev_misfit_need = compute_prev_cluster_misfit_need(index);
+
+		cluster->nrrun = nr_need + prev_misfit_need;
+		cluster->max_nr = compute_cluster_max_nr(index);
+		cluster->nr_prev_assist = prev_cluster_nr_need_assist(index);
+
+		cluster->strict_nrrun = compute_cluster_nr_strict_need(index);
+
+		trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need,
+					prev_misfit_need,
+					cluster->nrrun, cluster->max_nr,
+					cluster->nr_prev_assist);
+
+		big_avg += cluster_real_big_tasks(index);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	last_nr_big = big_avg;
+	walt_rotation_checkpoint(big_avg);
+}
+
+#define MAX_NR_THRESHOLD	4
+/* adjust needed CPUs based on current runqueue information */
+static unsigned int apply_task_need(const struct cluster_data *cluster,
+				    unsigned int new_need)
+{
+	/* resume all cores if there are enough tasks */
+	if (cluster->nrrun >= cluster->task_thres)
+		return cluster->num_cpus;
+
+	/*
+	 * resume as many cores as the previous cluster
+	 * needs assistance with.
+	 */
+	if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh)
+		new_need = new_need + cluster->nr_prev_assist;
+
+	/* only resume more cores if there are tasks to run */
+	if (cluster->nrrun > new_need)
+		new_need = new_need + 1;
+
+	/*
+	 * We don't want tasks to be overcrowded in a cluster.
+	 * If any CPU has more than MAX_NR_THRESHOLD in the last
+	 * window, bring another CPU to help out.
+	 */
+	if (cluster->max_nr > MAX_NR_THRESHOLD)
+		new_need = new_need + 1;
+
+	/*
+	 * For the little cluster, take a somewhat more relaxed approach
+	 * and impose the strict nr condition, because all tasks can
+	 * spill onto little CPUs if the big cluster is crowded.
+	 */
+	if (new_need < cluster->strict_nrrun)
+		new_need = cluster->strict_nrrun;
+
+	return new_need;
+}
+
+/* ======================= load based core count  ====================== */
+
+static unsigned int apply_limits(const struct cluster_data *cluster,
+				 unsigned int need_cpus)
+{
+	return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus);
+}
+
+static unsigned int get_active_cpu_count(const struct cluster_data *cluster)
+{
+	return cluster->num_cpus -
+				 sched_pause_count(&cluster->cpu_mask, true);
+}
+
+static bool is_active(const struct cpu_data *state)
+{
+	return cpu_online(state->cpu) && cpu_active(state->cpu);
+}
+
+static bool adjustment_possible(const struct cluster_data *cluster,
+							unsigned int need)
+{
+	return (need < cluster->active_cpus || (need > cluster->active_cpus &&
+						cluster->nr_paused_cpus));
+}
+
+static bool need_all_cpus(const struct cluster_data *cluster)
+{
+	return (is_min_capacity_cpu(cluster->first_cpu) &&
+		sched_ravg_window < DEFAULT_SCHED_RAVG_WINDOW);
+}
+
+static bool eval_need(struct cluster_data *cluster)
+{
+	unsigned long flags;
+	struct cpu_data *c;
+	unsigned int need_cpus = 0, last_need, thres_idx;
+	int ret = 0;
+	bool need_flag = false;
+	unsigned int new_need;
+	s64 now, elapsed;
+
+	if (unlikely(!cluster->inited))
+		return false;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	if (cluster->boost || !cluster->enable || need_all_cpus(cluster)) {
+		need_cpus = cluster->max_cpus;
+	} else {
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0;
+		list_for_each_entry(c, &cluster->lru, sib) {
+			bool old_is_busy = c->is_busy;
+
+			if (c->busy >= cluster->busy_up_thres[thres_idx] ||
+			    sched_cpu_high_irqload(c->cpu))
+				c->is_busy = true;
+			else if (c->busy < cluster->busy_down_thres[thres_idx])
+				c->is_busy = false;
+
+			trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy,
+						c->is_busy);
+			need_cpus += c->is_busy;
+		}
+		need_cpus = apply_task_need(cluster, need_cpus);
+	}
+	new_need = apply_limits(cluster, need_cpus);
+	need_flag = adjustment_possible(cluster, new_need);
+
+	last_need = cluster->need_cpus;
+	now = ktime_to_ms(ktime_get());
+
+	if (new_need > cluster->active_cpus) {
+		ret = 1;
+	} else {
+		/*
+		 * When there is no change in need and there are no more
+		 * active CPUs than currently needed, just update the
+		 * need time stamp and return.
+		 */
+		if (new_need == last_need && new_need == cluster->active_cpus) {
+			cluster->need_ts = now;
+			spin_unlock_irqrestore(&state_lock, flags);
+			return false;
+		}
+
+		elapsed = now - cluster->need_ts;
+		ret = elapsed >= cluster->offline_delay_ms;
+	}
+
+	if (ret) {
+		cluster->need_ts = now;
+		cluster->need_cpus = new_need;
+	}
+	trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need,
+				 ret && need_flag);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret && need_flag;
+}
+
+static void apply_need(struct cluster_data *cluster)
+{
+	if (eval_need(cluster))
+		wake_up_core_ctl_thread();
+}
+
+/* ========================= core count enforcement ==================== */
+
+static void wake_up_core_ctl_thread(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&core_ctl_pending_lock, flags);
+	core_ctl_pending = true;
+	spin_unlock_irqrestore(&core_ctl_pending_lock, flags);
+
+	wake_up_process(core_ctl_thread);
+}
+
+static u64 core_ctl_check_timestamp;
+
+int core_ctl_set_boost(bool boost)
+{
+	unsigned int index = 0;
+	struct cluster_data *cluster = NULL;
+	unsigned long flags;
+	int ret = 0;
+	bool boost_state_changed = false;
+
+	if (unlikely(!initialized))
+		return 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for_each_cluster(cluster, index) {
+		if (boost) {
+			boost_state_changed = !cluster->boost;
+			++cluster->boost;
+		} else {
+			if (!cluster->boost) {
+				ret = -EINVAL;
+				break;
+			}
+			--cluster->boost;
+			boost_state_changed = !cluster->boost;
+		}
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	if (boost_state_changed) {
+		index = 0;
+		for_each_cluster(cluster, index)
+			apply_need(cluster);
+	}
+
+	if (cluster)
+		trace_core_ctl_set_boost(cluster->boost, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL(core_ctl_set_boost);
+
+void core_ctl_notifier_register(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&core_ctl_notifier, n);
+}
+
+void core_ctl_notifier_unregister(struct notifier_block *n)
+{
+	atomic_notifier_chain_unregister(&core_ctl_notifier, n);
+}
+
+static void core_ctl_call_notifier(void)
+{
+	struct core_ctl_notif_data ndata = {0};
+	struct notifier_block *nb;
+
+	/*
+	 * Don't bother querying the stats when the notifier
+	 * chain is empty.
+	 */
+	rcu_read_lock();
+	nb = rcu_dereference_raw(core_ctl_notifier.head);
+	rcu_read_unlock();
+
+	if (!nb)
+		return;
+
+	ndata.nr_big = last_nr_big;
+	walt_fill_ta_data(&ndata);
+	trace_core_ctl_notif_data(ndata.nr_big, ndata.coloc_load_pct,
+			ndata.ta_util_pct, ndata.cur_cap_pct);
+
+	atomic_notifier_call_chain(&core_ctl_notifier, 0, &ndata);
+}
+
+void core_ctl_check(u64 window_start)
+{
+	int cpu;
+	struct cpu_data *c;
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+	unsigned long flags;
+
+	if (unlikely(!initialized))
+		return;
+
+	if (window_start == core_ctl_check_timestamp)
+		return;
+
+	core_ctl_check_timestamp = window_start;
+
+	spin_lock_irqsave(&state_lock, flags);
+	for_each_possible_cpu(cpu) {
+
+		c = &per_cpu(cpu_state, cpu);
+		cluster = c->cluster;
+
+		if (!cluster || !cluster->inited)
+			continue;
+
+		c->busy = sched_get_cpu_util(cpu);
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	update_running_avg();
+
+	for_each_cluster(cluster, index) {
+		if (eval_need(cluster))
+			wake_up_core_ctl_thread();
+	}
+
+	core_ctl_call_notifier();
+}
+
+static void move_cpu_lru(struct cpu_data *cpu_data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_del(&cpu_data->sib);
+	list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru);
+	spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static bool should_we_pause(int cpu, struct cluster_data *cluster)
+{
+	return true;
+}
+
+static void try_to_pause(struct cluster_data *cluster, unsigned int need,
+			 struct cpumask *pause_cpus)
+{
+	struct cpu_data *c, *tmp;
+	unsigned long flags;
+	unsigned int num_cpus = cluster->num_cpus;
+	unsigned int nr_paused = 0;
+	bool first_pass = cluster->nr_not_preferred_cpus;
+
+	/*
+	 * Protect against an entry being removed (and added at the tail)
+	 * by another thread (hotplug).
+	 */
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+		if (!num_cpus--)
+			break;
+
+		if (!is_active(c))
+			continue;
+		if (cluster->active_cpus == need)
+			break;
+		/* Don't pause busy CPUs. */
+		if (c->is_busy)
+			continue;
+		/*
+		 * We pause only the not_preferred CPUs. If none
+		 * of the CPUs are selected as not_preferred, then
+		 * all CPUs are eligible for pausing.
+		 */
+		if (cluster->nr_not_preferred_cpus && !c->not_preferred)
+			continue;
+
+		if (!should_we_pause(c->cpu, cluster))
+			continue;
+
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		pr_debug("Trying to pause CPU%u\n", c->cpu);
+
+		cpumask_set_cpu(c->cpu, pause_cpus);
+		sched_pause_pending(c->cpu);
+
+		c->paused_by_us = true;
+		move_cpu_lru(c);
+		nr_paused++;
+
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		spin_lock_irqsave(&state_lock, flags);
+	}
+	cluster->nr_paused_cpus += nr_paused;
+	spin_unlock_irqrestore(&state_lock, flags);
+
+again:
+	/*
+	 * If the number of active CPUs is within the limits, then
+	 * don't force pause of any busy CPUs.
+	 */
+	if (cluster->active_cpus <= cluster->max_cpus)
+		return;
+
+	nr_paused = 0;
+	num_cpus = cluster->num_cpus;
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+		if (!num_cpus--)
+			break;
+
+		if (!is_active(c))
+			continue;
+		if (cluster->active_cpus <= cluster->max_cpus)
+			break;
+
+		if (first_pass && !c->not_preferred)
+			continue;
+
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		cpumask_set_cpu(c->cpu, pause_cpus);
+		sched_pause_pending(c->cpu);
+
+		c->paused_by_us = true;
+		move_cpu_lru(c);
+		nr_paused++;
+
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		spin_lock_irqsave(&state_lock, flags);
+	}
+
+	cluster->nr_paused_cpus += nr_paused;
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	if (first_pass && cluster->active_cpus > cluster->max_cpus) {
+		first_pass = false;
+		goto again;
+	}
+}
+
+static void __try_to_resume(struct cluster_data *cluster,
+			       unsigned int need, bool force, struct cpumask *unpause_cpus)
+{
+	struct cpu_data *c, *tmp;
+	unsigned long flags;
+	unsigned int num_cpus = cluster->num_cpus;
+	unsigned int nr_unpaused = 0;
+
+	/*
+	 * Protect against an entry being removed (and added at the tail)
+	 * by another thread (hotplug).
+	 */
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry_safe(c, tmp, &cluster->lru, sib) {
+		if (!num_cpus--)
+			break;
+
+		if (!c->paused_by_us)
+			continue;
+		if ((cpu_online(c->cpu) && cpu_active(c->cpu)) ||
+			(!force && c->not_preferred))
+			continue;
+		if (cluster->active_cpus == need)
+			break;
+
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		pr_debug("Trying to resume CPU%u\n", c->cpu);
+
+		cpumask_set_cpu(c->cpu, unpause_cpus);
+		sched_unpause_pending(c->cpu);
+
+		c->paused_by_us = false;
+		move_cpu_lru(c);
+		nr_unpaused++;
+
+		cluster->active_cpus = get_active_cpu_count(cluster);
+		spin_lock_irqsave(&state_lock, flags);
+	}
+	cluster->nr_paused_cpus -= nr_unpaused;
+	spin_unlock_irqrestore(&state_lock, flags);
+}
+
+static void try_to_resume(struct cluster_data *cluster, unsigned int need,
+			  struct cpumask *unpause_cpus)
+{
+	bool force_use_non_preferred = false;
+
+	__try_to_resume(cluster, need, force_use_non_preferred, unpause_cpus);
+
+	if (cluster->active_cpus == need)
+		return;
+
+	force_use_non_preferred = true;
+	__try_to_resume(cluster, need, force_use_non_preferred, unpause_cpus);
+}
+
+static void __ref do_core_ctl(void)
+{
+	struct cluster_data *cluster;
+	unsigned int index = 0;
+	unsigned int need;
+	cpumask_t cpus_to_pause = { CPU_BITS_NONE };
+	cpumask_t cpus_to_unpause = { CPU_BITS_NONE };
+
+	for_each_cluster(cluster, index) {
+
+		eval_need(cluster);
+
+		need = apply_limits(cluster, cluster->need_cpus);
+
+		if (adjustment_possible(cluster, need)) {
+			pr_debug("Trying to adjust group %u from %u to %u\n",
+				 cluster->first_cpu, cluster->active_cpus, need);
+
+			if (cluster->active_cpus > need)
+				try_to_pause(cluster, need, &cpus_to_pause);
+
+			else if (cluster->active_cpus < need)
+				try_to_resume(cluster, need, &cpus_to_unpause);
+		}
+	}
+
+	if (cpumask_any(&cpus_to_pause) < nr_cpu_ids)
+		pause_cpus(&cpus_to_pause);
+
+	if (cpumask_any(&cpus_to_unpause) < nr_cpu_ids)
+		resume_cpus(&cpus_to_unpause);
+}
+
+static int __ref try_core_ctl(void *data)
+{
+	unsigned long flags;
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_lock_irqsave(&core_ctl_pending_lock, flags);
+		if (!core_ctl_pending) {
+			spin_unlock_irqrestore(&core_ctl_pending_lock, flags);
+			schedule();
+			if (kthread_should_stop())
+				break;
+			spin_lock_irqsave(&core_ctl_pending_lock, flags);
+		}
+		set_current_state(TASK_RUNNING);
+		core_ctl_pending = false;
+		spin_unlock_irqrestore(&core_ctl_pending_lock, flags);
+
+		do_core_ctl();
+	}
+
+	return 0;
+}
+
+/* ============================ init code ============================== */
+
+static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu)
+{
+	unsigned int i;
+
+	for (i = 0; i < num_clusters; ++i) {
+		if (cluster_state[i].first_cpu == first_cpu)
+			return &cluster_state[i];
+	}
+
+	return NULL;
+}
+
+static int cluster_init(const struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	struct cluster_data *cluster;
+	struct cpu_data *state;
+	unsigned int cpu;
+
+	if (find_cluster_by_first_cpu(first_cpu))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	if (num_clusters == MAX_CLUSTERS) {
+		pr_err("Unsupported number of clusters. Only %u supported\n",
+								MAX_CLUSTERS);
+		return -EINVAL;
+	}
+	cluster = &cluster_state[num_clusters];
+	++num_clusters;
+
+	cpumask_copy(&cluster->cpu_mask, mask);
+	cluster->num_cpus = cpumask_weight(mask);
+	if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+	cluster->first_cpu = first_cpu;
+	cluster->min_cpus = 1;
+	cluster->max_cpus = cluster->num_cpus;
+	cluster->need_cpus = cluster->num_cpus;
+	cluster->offline_delay_ms = 100;
+	cluster->task_thres = UINT_MAX;
+	cluster->nr_prev_assist_thresh = UINT_MAX;
+	cluster->nrrun = cluster->num_cpus;
+	cluster->enable = true;
+	cluster->nr_not_preferred_cpus = 0;
+	cluster->strict_nrrun = 0;
+	INIT_LIST_HEAD(&cluster->lru);
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cluster = cluster;
+		state->cpu = cpu;
+		list_add_tail(&state->sib, &cluster->lru);
+	}
+	cluster->active_cpus = get_active_cpu_count(cluster);
+
+	cluster->inited = true;
+
+	kobject_init(&cluster->kobj, &ktype_core_ctl);
+	return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl");
+}
+
+int core_ctl_init(void)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	struct walt_sched_cluster *cluster;
+	int ret;
+
+	/* initialize our single kthread */
+	core_ctl_thread = kthread_run(try_core_ctl, NULL, "core_ctl");
+
+	if (IS_ERR(core_ctl_thread))
+		return PTR_ERR(core_ctl_thread);
+
+	spin_lock_init(&core_ctl_pending_lock);
+
+	sched_setscheduler_nocheck(core_ctl_thread, SCHED_FIFO, &param);
+
+	for_each_sched_cluster(cluster) {
+		ret = cluster_init(&cluster->cpus);
+		if (ret)
+			pr_warn("unable to create core ctl group: %d\n", ret);
+	}
+
+	initialized = true;
+
+	return 0;
+}
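
Clients consume core_ctl state through the notifier chain driven by core_ctl_call_notifier() above. A hypothetical listener could be registered as in the sketch below; the core_ctl_notif_data field names are taken from the calls above, while the include paths and registration point are assumptions, not part of the change:

/* Sketch only: a listener on the core_ctl notifier chain. */
static int example_core_ctl_notify(struct notifier_block *nb,
				   unsigned long val, void *data)
{
	struct core_ctl_notif_data *nd = data;

	/* nr_big, coloc_load_pct, ta_util_pct and cur_cap_pct are the
	 * fields core_ctl_call_notifier() fills in before calling us.
	 */
	pr_debug("nr_big=%u ta_util=%u%%\n", nd->nr_big, nd->ta_util_pct);
	return NOTIFY_OK;
}

static struct notifier_block example_core_ctl_nb = {
	.notifier_call = example_core_ctl_notify,
};

/*
 * From module init/exit:
 *	core_ctl_notifier_register(&example_core_ctl_nb);
 *	core_ctl_notifier_unregister(&example_core_ctl_nb);
 */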

+ 886 - 0
kernel/sched/walt/cpufreq_walt.c

@@ -0,0 +1,886 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This is based on the schedutil governor, but modified to work
+ * with WALT.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kthread.h>
+#include <trace/events/power.h>
+
+#include "walt.h"
+#include "trace.h"
+
+struct waltgov_tunables {
+	struct gov_attr_set	attr_set;
+	unsigned int		up_rate_limit_us;
+	unsigned int		down_rate_limit_us;
+	unsigned int		hispeed_load;
+	unsigned int		hispeed_freq;
+	unsigned int		rtg_boost_freq;
+	bool			pl;
+};
+
+struct waltgov_policy {
+	struct cpufreq_policy	*policy;
+	u64			last_ws;
+	u64			curr_cycles;
+	u64			last_cyc_update_time;
+	unsigned long		avg_cap;
+	struct waltgov_tunables	*tunables;
+	struct list_head	tunables_hook;
+	unsigned long		hispeed_util;
+	unsigned long		rtg_boost_util;
+	unsigned long		max;
+
+	raw_spinlock_t		update_lock;
+	u64			last_freq_update_time;
+	s64			min_rate_limit_ns;
+	s64			up_rate_delay_ns;
+	s64			down_rate_delay_ns;
+	unsigned int		next_freq;
+	unsigned int		cached_raw_freq;
+
+	/* The next fields are only needed if fast switch cannot be used: */
+	struct	irq_work	irq_work;
+	struct	kthread_work	work;
+	struct	mutex		work_lock;
+	struct	kthread_worker	worker;
+	struct task_struct	*thread;
+
+	bool			limits_changed;
+	bool			need_freq_update;
+};
+
+struct waltgov_cpu {
+	struct waltgov_callback	cb;
+	struct waltgov_policy	*wg_policy;
+	unsigned int		cpu;
+	struct walt_cpu_load	walt_load;
+	unsigned long		util;
+	unsigned long		max;
+	unsigned int		flags;
+};
+
+DEFINE_PER_CPU(struct waltgov_callback *, waltgov_cb_data);
+static DEFINE_PER_CPU(struct waltgov_cpu, waltgov_cpu);
+static DEFINE_PER_CPU(struct waltgov_tunables *, cached_tunables);
+
+/************************ Governor internals ***********************/
+
+static bool waltgov_should_update_freq(struct waltgov_policy *wg_policy, u64 time)
+{
+	s64 delta_ns;
+
+	if (unlikely(wg_policy->limits_changed)) {
+		wg_policy->limits_changed = false;
+		wg_policy->need_freq_update = true;
+		return true;
+	}
+
+	/*
+	 * No need to recalculate the next freq for at least
+	 * min_rate_limit_us. However, we might still decide to further
+	 * rate limit once the frequency change direction is decided,
+	 * according to the separate rate limits.
+	 */
+
+	delta_ns = time - wg_policy->last_freq_update_time;
+	return delta_ns >= wg_policy->min_rate_limit_ns;
+}
+
+static bool waltgov_up_down_rate_limit(struct waltgov_policy *wg_policy, u64 time,
+				     unsigned int next_freq)
+{
+	s64 delta_ns;
+
+	delta_ns = time - wg_policy->last_freq_update_time;
+
+	if (next_freq > wg_policy->next_freq &&
+	    delta_ns < wg_policy->up_rate_delay_ns)
+		return true;
+
+	if (next_freq < wg_policy->next_freq &&
+	    delta_ns < wg_policy->down_rate_delay_ns)
+		return true;
+
+	return false;
+}
+
+static bool waltgov_update_next_freq(struct waltgov_policy *wg_policy, u64 time,
+				   unsigned int next_freq)
+{
+	if (wg_policy->next_freq == next_freq)
+		return false;
+
+	if (waltgov_up_down_rate_limit(wg_policy, time, next_freq))
+		return false;
+
+	wg_policy->next_freq = next_freq;
+	wg_policy->last_freq_update_time = time;
+
+	return true;
+}
+
+static unsigned long freq_to_util(struct waltgov_policy *wg_policy,
+				  unsigned int freq)
+{
+	return mult_frac(wg_policy->max, freq,
+			 wg_policy->policy->cpuinfo.max_freq);
+}
+
+#define KHZ 1000
+static void waltgov_track_cycles(struct waltgov_policy *wg_policy,
+				unsigned int prev_freq,
+				u64 upto)
+{
+	u64 delta_ns, cycles;
+	u64 next_ws = wg_policy->last_ws + sched_ravg_window;
+
+	upto = min(upto, next_ws);
+	/* Track cycles in current window */
+	delta_ns = upto - wg_policy->last_cyc_update_time;
+	delta_ns *= prev_freq;
+	do_div(delta_ns, (NSEC_PER_SEC / KHZ));
+	cycles = delta_ns;
+	wg_policy->curr_cycles += cycles;
+	wg_policy->last_cyc_update_time = upto;
+}
+
+static void waltgov_calc_avg_cap(struct waltgov_policy *wg_policy, u64 curr_ws,
+				unsigned int prev_freq)
+{
+	u64 last_ws = wg_policy->last_ws;
+	unsigned int avg_freq;
+
+	BUG_ON(curr_ws < last_ws);
+	if (curr_ws <= last_ws)
+		return;
+
+	/* If we skipped some windows */
+	if (curr_ws > (last_ws + sched_ravg_window)) {
+		avg_freq = prev_freq;
+		/* Reset tracking history */
+		wg_policy->last_cyc_update_time = curr_ws;
+	} else {
+		waltgov_track_cycles(wg_policy, prev_freq, curr_ws);
+		avg_freq = wg_policy->curr_cycles;
+		avg_freq /= sched_ravg_window / (NSEC_PER_SEC / KHZ);
+	}
+	wg_policy->avg_cap = freq_to_util(wg_policy, avg_freq);
+	wg_policy->curr_cycles = 0;
+	wg_policy->last_ws = curr_ws;
+}
+
+static void waltgov_fast_switch(struct waltgov_policy *wg_policy, u64 time,
+			      unsigned int next_freq)
+{
+	struct cpufreq_policy *policy = wg_policy->policy;
+	unsigned int cpu;
+
+	if (!waltgov_update_next_freq(wg_policy, time, next_freq))
+		return;
+
+	waltgov_track_cycles(wg_policy, wg_policy->policy->cur, time);
+	next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+	if (!next_freq)
+		return;
+
+	policy->cur = next_freq;
+
+	if (trace_cpu_frequency_enabled()) {
+		for_each_cpu(cpu, policy->cpus)
+			trace_cpu_frequency(next_freq, cpu);
+	}
+}
+
+static void waltgov_deferred_update(struct waltgov_policy *wg_policy, u64 time,
+				  unsigned int next_freq)
+{
+	if (!waltgov_update_next_freq(wg_policy, time, next_freq))
+		return;
+
+	walt_irq_work_queue(&wg_policy->irq_work);
+}
+
+#define TARGET_LOAD 80
+static unsigned int get_next_freq(struct waltgov_policy *wg_policy,
+				  unsigned long util, unsigned long max)
+{
+	struct cpufreq_policy *policy = wg_policy->policy;
+	/*
+	 * TODO:
+		unsigned int freq = arch_scale_freq_invariant() ?
+				policy->cpuinfo.max_freq : policy->cur;
+	 */
+	unsigned int freq = policy->cpuinfo.max_freq;
+
+	freq = map_util_freq(util, freq, max);
+	trace_waltgov_next_freq(policy->cpu, util, max, freq);
+
+	if (freq == wg_policy->cached_raw_freq && !wg_policy->need_freq_update)
+		return wg_policy->next_freq;
+
+	wg_policy->need_freq_update = false;
+	wg_policy->cached_raw_freq = freq;
+	return cpufreq_driver_resolve_freq(policy, freq);
+}
+
+static unsigned long waltgov_get_util(struct waltgov_cpu *wg_cpu)
+{
+	struct rq *rq = cpu_rq(wg_cpu->cpu);
+	unsigned long max = arch_scale_cpu_capacity(wg_cpu->cpu);
+	unsigned long util;
+
+	wg_cpu->max = max;
+	util = cpu_util_freq_walt(wg_cpu->cpu, &wg_cpu->walt_load);
+	return uclamp_rq_util_with(rq, util, NULL);
+}
+
+#define NL_RATIO 75
+#define DEFAULT_HISPEED_LOAD 90
+#define DEFAULT_CPU0_RTG_BOOST_FREQ 1000000
+#define DEFAULT_CPU4_RTG_BOOST_FREQ 0
+#define DEFAULT_CPU7_RTG_BOOST_FREQ 0
+static void waltgov_walt_adjust(struct waltgov_cpu *wg_cpu, unsigned long *util,
+			      unsigned long *max)
+{
+	struct waltgov_policy *wg_policy = wg_cpu->wg_policy;
+	bool is_migration = wg_cpu->flags & WALT_CPUFREQ_IC_MIGRATION;
+	bool is_rtg_boost = wg_cpu->walt_load.rtgb_active;
+	unsigned long nl = wg_cpu->walt_load.nl;
+	unsigned long cpu_util = wg_cpu->util;
+	bool is_hiload;
+	unsigned long pl = wg_cpu->walt_load.pl;
+
+	if (is_rtg_boost)
+		*util = max(*util, wg_policy->rtg_boost_util);
+
+	is_hiload = (cpu_util >= mult_frac(wg_policy->avg_cap,
+					   wg_policy->tunables->hispeed_load,
+					   100));
+
+	if (is_hiload && !is_migration)
+		*util = max(*util, wg_policy->hispeed_util);
+
+	if (is_hiload && nl >= mult_frac(cpu_util, NL_RATIO, 100))
+		*util = *max;
+
+	if (wg_policy->tunables->pl) {
+		if (sysctl_sched_conservative_pl)
+			pl = mult_frac(pl, TARGET_LOAD, 100);
+		*util = max(*util, pl);
+	}
+}
+
+static inline unsigned long target_util(struct waltgov_policy *wg_policy,
+				  unsigned int freq)
+{
+	unsigned long util;
+
+	util = freq_to_util(wg_policy, freq);
+	util = mult_frac(util, TARGET_LOAD, 100);
+	return util;
+}
+
+static unsigned int waltgov_next_freq_shared(struct waltgov_cpu *wg_cpu, u64 time)
+{
+	struct waltgov_policy *wg_policy = wg_cpu->wg_policy;
+	struct cpufreq_policy *policy = wg_policy->policy;
+	unsigned long util = 0, max = 1;
+	unsigned int j;
+
+	for_each_cpu(j, policy->cpus) {
+		struct waltgov_cpu *j_wg_cpu = &per_cpu(waltgov_cpu, j);
+		unsigned long j_util, j_max;
+
+		/*
+		 * If the util value for all CPUs in a policy is 0, just using >
+		 * will result in a max value of 1. WALT stats can later update
+		 * the aggregated util value, causing get_next_freq() to compute
+		 * freq = max_freq * 1.25 * (util / max) for nonzero util,
+		 * leading to spurious jumps to fmax.
+		 */
+		j_util = j_wg_cpu->util;
+		j_max = j_wg_cpu->max;
+
+		if (j_util * max >= j_max * util) {
+			util = j_util;
+			max = j_max;
+		}
+
+		waltgov_walt_adjust(j_wg_cpu, &util, &max);
+	}
+
+	return get_next_freq(wg_policy, util, max);
+}
+
+static void waltgov_update_freq(struct waltgov_callback *cb, u64 time,
+				unsigned int flags)
+{
+	struct waltgov_cpu *wg_cpu = container_of(cb, struct waltgov_cpu, cb);
+	struct waltgov_policy *wg_policy = wg_cpu->wg_policy;
+	unsigned long hs_util, boost_util;
+	unsigned int next_f;
+
+	if (!wg_policy->tunables->pl && flags & WALT_CPUFREQ_PL)
+		return;
+
+	wg_cpu->util = waltgov_get_util(wg_cpu);
+	wg_cpu->flags = flags;
+	raw_spin_lock(&wg_policy->update_lock);
+
+	if (wg_policy->max != wg_cpu->max) {
+		wg_policy->max = wg_cpu->max;
+		hs_util = target_util(wg_policy,
+					wg_policy->tunables->hispeed_freq);
+		wg_policy->hispeed_util = hs_util;
+
+		boost_util = target_util(wg_policy,
+				    wg_policy->tunables->rtg_boost_freq);
+		wg_policy->rtg_boost_util = boost_util;
+	}
+
+	waltgov_calc_avg_cap(wg_policy, wg_cpu->walt_load.ws,
+			   wg_policy->policy->cur);
+
+	trace_waltgov_util_update(wg_cpu->cpu, wg_cpu->util, wg_policy->avg_cap,
+				wg_cpu->max, wg_cpu->walt_load.nl,
+				wg_cpu->walt_load.pl,
+				wg_cpu->walt_load.rtgb_active, flags);
+
+	if (waltgov_should_update_freq(wg_policy, time) &&
+	    !(flags & WALT_CPUFREQ_CONTINUE)) {
+		next_f = waltgov_next_freq_shared(wg_cpu, time);
+
+		if (wg_policy->policy->fast_switch_enabled)
+			waltgov_fast_switch(wg_policy, time, next_f);
+		else
+			waltgov_deferred_update(wg_policy, time, next_f);
+	}
+
+	raw_spin_unlock(&wg_policy->update_lock);
+}
+
+static void waltgov_work(struct kthread_work *work)
+{
+	struct waltgov_policy *wg_policy = container_of(work, struct waltgov_policy, work);
+	unsigned int freq;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
+	freq = wg_policy->next_freq;
+	waltgov_track_cycles(wg_policy, wg_policy->policy->cur,
+			   ktime_get_ns());
+	raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
+
+	mutex_lock(&wg_policy->work_lock);
+	__cpufreq_driver_target(wg_policy->policy, freq, CPUFREQ_RELATION_L);
+	mutex_unlock(&wg_policy->work_lock);
+}
+
+static void waltgov_irq_work(struct irq_work *irq_work)
+{
+	struct waltgov_policy *wg_policy;
+
+	wg_policy = container_of(irq_work, struct waltgov_policy, irq_work);
+
+	kthread_queue_work(&wg_policy->worker, &wg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static inline struct waltgov_tunables *to_waltgov_tunables(struct gov_attr_set *attr_set)
+{
+	return container_of(attr_set, struct waltgov_tunables, attr_set);
+}
+
+static DEFINE_MUTEX(min_rate_lock);
+
+static void update_min_rate_limit_ns(struct waltgov_policy *wg_policy)
+{
+	mutex_lock(&min_rate_lock);
+	wg_policy->min_rate_limit_ns = min(wg_policy->up_rate_delay_ns,
+					   wg_policy->down_rate_delay_ns);
+	mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->up_rate_limit_us);
+}
+
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->down_rate_limit_us);
+}
+
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+				      const char *buf, size_t count)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+	struct waltgov_policy *wg_policy;
+	unsigned int rate_limit_us;
+
+	if (kstrtouint(buf, 10, &rate_limit_us))
+		return -EINVAL;
+
+	tunables->up_rate_limit_us = rate_limit_us;
+
+	list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
+		wg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+		update_min_rate_limit_ns(wg_policy);
+	}
+
+	return count;
+}
+
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+					const char *buf, size_t count)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+	struct waltgov_policy *wg_policy;
+	unsigned int rate_limit_us;
+
+	if (kstrtouint(buf, 10, &rate_limit_us))
+		return -EINVAL;
+
+	tunables->down_rate_limit_us = rate_limit_us;
+
+	list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
+		wg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+		update_min_rate_limit_ns(wg_policy);
+	}
+
+	return count;
+}
+
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
+
+static ssize_t hispeed_load_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->hispeed_load);
+}
+
+static ssize_t hispeed_load_store(struct gov_attr_set *attr_set,
+				  const char *buf, size_t count)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	if (kstrtouint(buf, 10, &tunables->hispeed_load))
+		return -EINVAL;
+
+	tunables->hispeed_load = min(100U, tunables->hispeed_load);
+
+	return count;
+}
+
+static ssize_t hispeed_freq_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->hispeed_freq);
+}
+
+static ssize_t hispeed_freq_store(struct gov_attr_set *attr_set,
+					const char *buf, size_t count)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+	unsigned int val;
+	struct waltgov_policy *wg_policy;
+	unsigned long hs_util;
+	unsigned long flags;
+
+	if (kstrtouint(buf, 10, &val))
+		return -EINVAL;
+
+	tunables->hispeed_freq = val;
+	list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
+		raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
+		hs_util = target_util(wg_policy,
+					wg_policy->tunables->hispeed_freq);
+		wg_policy->hispeed_util = hs_util;
+		raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
+	}
+
+	return count;
+}
+
+static ssize_t rtg_boost_freq_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->rtg_boost_freq);
+}
+
+static ssize_t rtg_boost_freq_store(struct gov_attr_set *attr_set,
+				    const char *buf, size_t count)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+	unsigned int val;
+	struct waltgov_policy *wg_policy;
+	unsigned long boost_util;
+	unsigned long flags;
+
+	if (kstrtouint(buf, 10, &val))
+		return -EINVAL;
+
+	tunables->rtg_boost_freq = val;
+	list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
+		raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
+		boost_util = target_util(wg_policy,
+					  wg_policy->tunables->rtg_boost_freq);
+		wg_policy->rtg_boost_util = boost_util;
+		raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
+	}
+
+	return count;
+}
+
+static ssize_t pl_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->pl);
+}
+
+static ssize_t pl_store(struct gov_attr_set *attr_set, const char *buf,
+				   size_t count)
+{
+	struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
+
+	if (kstrtobool(buf, &tunables->pl))
+		return -EINVAL;
+
+	return count;
+}
+
+static struct governor_attr hispeed_load = __ATTR_RW(hispeed_load);
+static struct governor_attr hispeed_freq = __ATTR_RW(hispeed_freq);
+static struct governor_attr rtg_boost_freq = __ATTR_RW(rtg_boost_freq);
+static struct governor_attr pl = __ATTR_RW(pl);
+
+static struct attribute *waltgov_attributes[] = {
+	&up_rate_limit_us.attr,
+	&down_rate_limit_us.attr,
+	&hispeed_load.attr,
+	&hispeed_freq.attr,
+	&rtg_boost_freq.attr,
+	&pl.attr,
+	NULL
+};
+
+static struct kobj_type waltgov_tunables_ktype = {
+	.default_attrs	= waltgov_attributes,
+	.sysfs_ops	= &governor_sysfs_ops,
+};
+
+/********************** cpufreq governor interface *********************/
+
+static struct cpufreq_governor walt_gov;
+
+static struct waltgov_policy *waltgov_policy_alloc(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy;
+
+	wg_policy = kzalloc(sizeof(*wg_policy), GFP_KERNEL);
+	if (!wg_policy)
+		return NULL;
+
+	wg_policy->policy = policy;
+	raw_spin_lock_init(&wg_policy->update_lock);
+	return wg_policy;
+}
+
+static void waltgov_policy_free(struct waltgov_policy *wg_policy)
+{
+	kfree(wg_policy);
+}
+
+static int waltgov_kthread_create(struct waltgov_policy *wg_policy)
+{
+	struct task_struct *thread;
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct cpufreq_policy *policy = wg_policy->policy;
+	int ret;
+
+	/* kthread only required for slow path */
+	if (policy->fast_switch_enabled)
+		return 0;
+
+	kthread_init_work(&wg_policy->work, waltgov_work);
+	kthread_init_worker(&wg_policy->worker);
+	thread = kthread_create(kthread_worker_fn, &wg_policy->worker,
+				"waltgov:%d",
+				cpumask_first(policy->related_cpus));
+	if (IS_ERR(thread)) {
+		pr_err("failed to create waltgov thread: %ld\n", PTR_ERR(thread));
+		return PTR_ERR(thread);
+	}
+
+	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+	if (ret) {
+		kthread_stop(thread);
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		return ret;
+	}
+
+	wg_policy->thread = thread;
+	kthread_bind_mask(thread, policy->related_cpus);
+	init_irq_work(&wg_policy->irq_work, waltgov_irq_work);
+	mutex_init(&wg_policy->work_lock);
+
+	wake_up_process(thread);
+
+	return 0;
+}
+
+static void waltgov_kthread_stop(struct waltgov_policy *wg_policy)
+{
+	/* kthread only required for slow path */
+	if (wg_policy->policy->fast_switch_enabled)
+		return;
+
+	kthread_flush_worker(&wg_policy->worker);
+	kthread_stop(wg_policy->thread);
+	mutex_destroy(&wg_policy->work_lock);
+}
+
+static void waltgov_tunables_save(struct cpufreq_policy *policy,
+		struct waltgov_tunables *tunables)
+{
+	int cpu;
+	struct waltgov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
+
+	if (!cached) {
+		cached = kzalloc(sizeof(*tunables), GFP_KERNEL);
+		if (!cached)
+			return;
+
+		for_each_cpu(cpu, policy->related_cpus)
+			per_cpu(cached_tunables, cpu) = cached;
+	}
+
+	cached->pl = tunables->pl;
+	cached->hispeed_load = tunables->hispeed_load;
+	cached->rtg_boost_freq = tunables->rtg_boost_freq;
+	cached->hispeed_freq = tunables->hispeed_freq;
+	cached->up_rate_limit_us = tunables->up_rate_limit_us;
+	cached->down_rate_limit_us = tunables->down_rate_limit_us;
+}
+
+static void waltgov_tunables_restore(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy = policy->governor_data;
+	struct waltgov_tunables *tunables = wg_policy->tunables;
+	struct waltgov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
+
+	if (!cached)
+		return;
+
+	tunables->pl = cached->pl;
+	tunables->hispeed_load = cached->hispeed_load;
+	tunables->rtg_boost_freq = cached->rtg_boost_freq;
+	tunables->hispeed_freq = cached->hispeed_freq;
+	tunables->up_rate_limit_us = cached->up_rate_limit_us;
+	tunables->down_rate_limit_us = cached->down_rate_limit_us;
+}
+
+static int waltgov_init(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy;
+	struct waltgov_tunables *tunables;
+	int ret = 0;
+
+	/* State should be equivalent to EXIT */
+	if (policy->governor_data)
+		return -EBUSY;
+
+	cpufreq_enable_fast_switch(policy);
+
+	BUG_ON(policy->fast_switch_possible && !policy->fast_switch_enabled);
+
+	wg_policy = waltgov_policy_alloc(policy);
+	if (!wg_policy) {
+		ret = -ENOMEM;
+		goto disable_fast_switch;
+	}
+
+	ret = waltgov_kthread_create(wg_policy);
+	if (ret)
+		goto free_wg_policy;
+
+	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+	if (!tunables) {
+		ret = -ENOMEM;
+		goto stop_kthread;
+	}
+
+	gov_attr_set_init(&tunables->attr_set, &wg_policy->tunables_hook);
+	tunables->hispeed_load = DEFAULT_HISPEED_LOAD;
+
+	switch (policy->cpu) {
+	default:
+	case 0:
+		tunables->rtg_boost_freq = DEFAULT_CPU0_RTG_BOOST_FREQ;
+		break;
+	case 4:
+		tunables->rtg_boost_freq = DEFAULT_CPU4_RTG_BOOST_FREQ;
+		break;
+	case 7:
+		tunables->rtg_boost_freq = DEFAULT_CPU7_RTG_BOOST_FREQ;
+		break;
+	}
+
+	policy->governor_data = wg_policy;
+	wg_policy->tunables = tunables;
+	waltgov_tunables_restore(policy);
+
+	ret = kobject_init_and_add(&tunables->attr_set.kobj, &waltgov_tunables_ktype,
+				   get_governor_parent_kobj(policy), "%s",
+				   walt_gov.name);
+	if (ret)
+		goto fail;
+
+	return 0;
+
+fail:
+	kobject_put(&tunables->attr_set.kobj);
+	policy->governor_data = NULL;
+	kfree(tunables);
+stop_kthread:
+	waltgov_kthread_stop(wg_policy);
+free_wg_policy:
+	waltgov_policy_free(wg_policy);
+disable_fast_switch:
+	cpufreq_disable_fast_switch(policy);
+
+	pr_err("initialization failed (error %d)\n", ret);
+	return ret;
+}
+
+static void waltgov_exit(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy = policy->governor_data;
+	struct waltgov_tunables *tunables = wg_policy->tunables;
+	unsigned int count;
+
+	count = gov_attr_set_put(&tunables->attr_set, &wg_policy->tunables_hook);
+	policy->governor_data = NULL;
+	if (!count) {
+		waltgov_tunables_save(policy, tunables);
+		kfree(tunables);
+	}
+
+	waltgov_kthread_stop(wg_policy);
+	waltgov_policy_free(wg_policy);
+	cpufreq_disable_fast_switch(policy);
+}
+
+static int waltgov_start(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy = policy->governor_data;
+	unsigned int cpu;
+
+	wg_policy->up_rate_delay_ns =
+		wg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+	wg_policy->down_rate_delay_ns =
+		wg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+	update_min_rate_limit_ns(wg_policy);
+	wg_policy->last_freq_update_time	= 0;
+	wg_policy->next_freq			= 0;
+	wg_policy->limits_changed		= false;
+	wg_policy->need_freq_update		= false;
+	wg_policy->cached_raw_freq		= 0;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct waltgov_cpu *wg_cpu = &per_cpu(waltgov_cpu, cpu);
+
+		memset(wg_cpu, 0, sizeof(*wg_cpu));
+		wg_cpu->cpu			= cpu;
+		wg_cpu->wg_policy		= wg_policy;
+	}
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct waltgov_cpu *wg_cpu = &per_cpu(waltgov_cpu, cpu);
+
+		waltgov_add_callback(cpu, &wg_cpu->cb, waltgov_update_freq);
+	}
+
+	return 0;
+}
+
+static void waltgov_stop(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy = policy->governor_data;
+	unsigned int cpu;
+
+	for_each_cpu(cpu, policy->cpus)
+		waltgov_remove_callback(cpu);
+
+	synchronize_rcu();
+
+	if (!policy->fast_switch_enabled) {
+		irq_work_sync(&wg_policy->irq_work);
+		kthread_cancel_work_sync(&wg_policy->work);
+	}
+}
+
+static void waltgov_limits(struct cpufreq_policy *policy)
+{
+	struct waltgov_policy *wg_policy = policy->governor_data;
+	unsigned long flags, now;
+	unsigned int freq;
+
+	if (!policy->fast_switch_enabled) {
+		mutex_lock(&wg_policy->work_lock);
+		raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
+		waltgov_track_cycles(wg_policy, wg_policy->policy->cur,
+				   ktime_get_ns());
+		raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
+		cpufreq_policy_apply_limits(policy);
+		mutex_unlock(&wg_policy->work_lock);
+	} else {
+		raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
+		freq = policy->cur;
+		now = ktime_get_ns();
+
+		/*
+		 * cpufreq_driver_resolve_freq() has a clamp, so we do not need
+		 * to do any sort of additional validation here.
+		 */
+		freq = cpufreq_driver_resolve_freq(policy, freq);
+		wg_policy->cached_raw_freq = freq;
+		waltgov_fast_switch(wg_policy, now, freq);
+		raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
+	}
+
+	wg_policy->limits_changed = true;
+}
+
+static struct cpufreq_governor walt_gov = {
+	.name			= "walt",
+	.init			= waltgov_init,
+	.exit			= waltgov_exit,
+	.start			= waltgov_start,
+	.stop			= waltgov_stop,
+	.limits			= waltgov_limits,
+	.owner			= THIS_MODULE,
+};
+
+int waltgov_register(void)
+{
+	return cpufreq_register_governor(&walt_gov);
+}

+ 91 - 0
kernel/sched/walt/fixup.c

@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <trace/hooks/cpufreq.h>
+
+#include "walt.h"
+
+unsigned int cpuinfo_max_freq_cached;
+
+char sched_lib_name[LIB_PATH_LENGTH];
+unsigned int sched_lib_mask_force;
+
+bool is_sched_lib_based_app(pid_t pid)
+{
+	const char *name = NULL;
+	char *libname, *lib_list;
+	struct vm_area_struct *vma;
+	char path_buf[LIB_PATH_LENGTH];
+	char *tmp_lib_name;
+	bool found = false;
+	struct task_struct *p;
+	struct mm_struct *mm;
+
+	if (strnlen(sched_lib_name, LIB_PATH_LENGTH) == 0)
+		return false;
+
+	tmp_lib_name = kmalloc(LIB_PATH_LENGTH, GFP_KERNEL);
+	if (!tmp_lib_name)
+		return false;
+
+	rcu_read_lock();
+
+	p = pid ? get_pid_task(find_vpid(pid), PIDTYPE_PID) : current;
+	if (!p) {
+		rcu_read_unlock();
+		kfree(tmp_lib_name);
+		return false;
+	}
+
+	/* Prevent p going away */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	mm = get_task_mm(p);
+	if (!mm)
+		goto put_task_struct;
+
+	down_read(&mm->mmap_lock);
+	for (vma = mm->mmap; vma ; vma = vma->vm_next) {
+		if (vma->vm_file && vma->vm_flags & VM_EXEC) {
+			name = d_path(&vma->vm_file->f_path,
+					path_buf, LIB_PATH_LENGTH);
+			if (IS_ERR(name))
+				goto release_sem;
+
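+			/* sched_lib_name may hold a comma-separated list of library names */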
+			strlcpy(tmp_lib_name, sched_lib_name, LIB_PATH_LENGTH);
+			lib_list = tmp_lib_name;
+			while ((libname = strsep(&lib_list, ","))) {
+				libname = skip_spaces(libname);
+				if (strnstr(name, libname,
+					strnlen(name, LIB_PATH_LENGTH))) {
+					found = true;
+					goto release_sem;
+				}
+			}
+		}
+	}
+
+release_sem:
+	up_read(&mm->mmap_lock);
+	mmput(mm);
+put_task_struct:
+	put_task_struct(p);
+	kfree(tmp_lib_name);
+	return found;
+}
+
+void android_vh_show_max_freq(void *unused, struct cpufreq_policy *policy,
+				unsigned int *max_freq)
+{
+	if (!cpuinfo_max_freq_cached)
+		return;
+
+	if (!(BIT(policy->cpu) & sched_lib_mask_force))
+		return;
+
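+	/* report a doubled max frequency to tasks linked against a listed sched lib */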
+	if (is_sched_lib_based_app(current->pid))
+		*max_freq = cpuinfo_max_freq_cached << 1;
+}

+ 300 - 0
kernel/sched/walt/input-boost.c

@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013-2015,2017,2019-2021, The Linux Foundation. All rights reserved.
+ */
+
+#define pr_fmt(fmt) "input-boost: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/input.h>
+#include <linux/time.h>
+#include <linux/sysfs.h>
+#include <linux/pm_qos.h>
+
+#include "walt.h"
+
+#define input_boost_attr_rw(_name)		\
+static struct kobj_attribute _name##_attr =	\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+#define show_one(file_name)			\
+static ssize_t show_##file_name			\
+(struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
+{								\
+	return scnprintf(buf, PAGE_SIZE, "%u\n", file_name);	\
+}
+
+#define store_one(file_name)					\
+static ssize_t store_##file_name				\
+(struct kobject *kobj, struct kobj_attribute *attr,		\
+const char *buf, size_t count)					\
+{								\
+								\
+	sscanf(buf, "%u", &file_name);				\
+	return count;						\
+}
+
+struct cpu_sync {
+	int		cpu;
+	unsigned int	input_boost_min;
+	unsigned int	input_boost_freq;
+};
+
+static DEFINE_PER_CPU(struct cpu_sync, sync_info);
+static struct workqueue_struct *input_boost_wq;
+
+static struct work_struct input_boost_work;
+
+static bool sched_boost_active;
+
+static struct delayed_work input_boost_rem;
+static u64 last_input_time;
+#define MIN_INPUT_INTERVAL (150 * USEC_PER_MSEC)
+
+static DEFINE_PER_CPU(struct freq_qos_request, qos_req);
+
+static void boost_adjust_notify(struct cpufreq_policy *policy)
+{
+	unsigned int cpu = policy->cpu;
+	struct cpu_sync *s = &per_cpu(sync_info, cpu);
+	unsigned int ib_min = s->input_boost_min;
+	struct freq_qos_request *req = &per_cpu(qos_req, cpu);
+	int ret;
+
+	pr_debug("CPU%u policy min before boost: %u kHz\n",
+			 cpu, policy->min);
+	pr_debug("CPU%u boost min: %u kHz\n", cpu, ib_min);
+
+	ret = freq_qos_update_request(req, ib_min);
+	if (ret < 0)
+		pr_err("Failed to update freq constraint to %u kHz: %d\n",
+								ib_min, ret);
+
+	pr_debug("CPU%u policy min after boost: %u kHz\n", cpu, policy->min);
+}
+
+static void update_policy_online(void)
+{
+	unsigned int i;
+	struct cpufreq_policy *policy;
+	struct cpumask online_cpus;
+
+	/* Re-evaluate policy to trigger adjust notifier for online CPUs */
+	get_online_cpus();
+	online_cpus = *cpu_online_mask;
+	for_each_cpu(i, &online_cpus) {
+		policy = cpufreq_cpu_get(i);
+		if (!policy) {
+			pr_err("%s: cpufreq policy not found for cpu%d\n",
+							__func__, i);
+			break;
+		}
+
+		cpumask_andnot(&online_cpus, &online_cpus,
+						policy->related_cpus);
+		boost_adjust_notify(policy);
+		cpufreq_cpu_put(policy);
+	}
+	put_online_cpus();
+}
+
+static void do_input_boost_rem(struct work_struct *work)
+{
+	unsigned int i, ret;
+	struct cpu_sync *i_sync_info;
+
+	/* Reset the input_boost_min for all CPUs in the system */
+	pr_debug("Resetting input boost min for all CPUs\n");
+	for_each_possible_cpu(i) {
+		i_sync_info = &per_cpu(sync_info, i);
+		i_sync_info->input_boost_min = 0;
+	}
+
+	/* Update policies for all online CPUs */
+	update_policy_online();
+
+	if (sched_boost_active) {
+		ret = sched_set_boost(0);
+		if (ret)
+			pr_err("input-boost: sched boost disable failed\n");
+		sched_boost_active = false;
+	}
+}
+
+static void do_input_boost(struct work_struct *work)
+{
+	unsigned int i, ret;
+	struct cpu_sync *i_sync_info;
+
+	cancel_delayed_work_sync(&input_boost_rem);
+	if (sched_boost_active) {
+		sched_set_boost(0);
+		sched_boost_active = false;
+	}
+
+	/* Set the input_boost_min for all CPUs in the system */
+	pr_debug("Setting input boost min for all CPUs\n");
+	for (i = 0; i < 8; i++) {
+		i_sync_info = &per_cpu(sync_info, i);
+		i_sync_info->input_boost_min = sysctl_input_boost_freq[i];
+	}
+
+	/* Update policies for all online CPUs */
+	update_policy_online();
+
+	/* Enable scheduler boost to migrate tasks to big cluster */
+	if (sysctl_sched_boost_on_input > 0) {
+		ret = sched_set_boost(sysctl_sched_boost_on_input);
+		if (ret)
+			pr_err("input-boost: sched boost enable failed\n");
+		else
+			sched_boost_active = true;
+	}
+
+	queue_delayed_work(input_boost_wq, &input_boost_rem,
+					msecs_to_jiffies(sysctl_input_boost_ms));
+}
+
+static void inputboost_input_event(struct input_handle *handle,
+		unsigned int type, unsigned int code, int value)
+{
+	u64 now;
+	int cpu;
+	int enabled = 0;
+
+	for_each_possible_cpu(cpu) {
+		if (sysctl_input_boost_freq[cpu] > 0) {
+			enabled = 1;
+			break;
+		}
+	}
+	if (!enabled)
+		return;
+
+	now = ktime_to_us(ktime_get());
+	if (now - last_input_time < MIN_INPUT_INTERVAL)
+		return;
+
+	if (work_pending(&input_boost_work))
+		return;
+
+	queue_work(input_boost_wq, &input_boost_work);
+	last_input_time = ktime_to_us(ktime_get());
+}
+
+static int inputboost_input_connect(struct input_handler *handler,
+		struct input_dev *dev, const struct input_device_id *id)
+{
+	struct input_handle *handle;
+	int error;
+
+	handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	handle->dev = dev;
+	handle->handler = handler;
+	handle->name = "cpufreq";
+
+	error = input_register_handle(handle);
+	if (error)
+		goto err2;
+
+	error = input_open_device(handle);
+	if (error)
+		goto err1;
+
+	return 0;
+err1:
+	input_unregister_handle(handle);
+err2:
+	kfree(handle);
+	return error;
+}
+
+static void inputboost_input_disconnect(struct input_handle *handle)
+{
+	input_close_device(handle);
+	input_unregister_handle(handle);
+	kfree(handle);
+}
+
+static const struct input_device_id inputboost_ids[] = {
+	/* multi-touch touchscreen */
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT |
+			INPUT_DEVICE_ID_MATCH_ABSBIT,
+		.evbit = { BIT_MASK(EV_ABS) },
+		.absbit = { [BIT_WORD(ABS_MT_POSITION_X)] =
+			BIT_MASK(ABS_MT_POSITION_X) |
+			BIT_MASK(ABS_MT_POSITION_Y)
+		},
+	},
+	/* touchpad */
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_KEYBIT |
+			INPUT_DEVICE_ID_MATCH_ABSBIT,
+		.keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) },
+		.absbit = { [BIT_WORD(ABS_X)] =
+			BIT_MASK(ABS_X) | BIT_MASK(ABS_Y)
+		},
+	},
+	/* Keypad */
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+	},
+	{ },
+};
+
+static struct input_handler inputboost_input_handler = {
+	.event		= inputboost_input_event,
+	.connect	= inputboost_input_connect,
+	.disconnect	= inputboost_input_disconnect,
+	.name		= "input-boost",
+	.id_table	= inputboost_ids,
+};
+
+struct kobject *input_boost_kobj;
+int input_boost_init(void)
+{
+	int cpu, ret;
+	struct cpu_sync *s;
+	struct cpufreq_policy *policy;
+	struct freq_qos_request *req;
+
+	input_boost_wq = alloc_workqueue("inputboost_wq", WQ_HIGHPRI, 0);
+	if (!input_boost_wq)
+		return -EFAULT;
+
+	INIT_WORK(&input_boost_work, do_input_boost);
+	INIT_DELAYED_WORK(&input_boost_rem, do_input_boost_rem);
+
+	for_each_possible_cpu(cpu) {
+		s = &per_cpu(sync_info, cpu);
+		s->cpu = cpu;
+		req = &per_cpu(qos_req, cpu);
+		policy = cpufreq_cpu_get(cpu);
+		if (!policy) {
+			pr_err("%s: cpufreq policy not found for cpu%d\n",
+							__func__, cpu);
+			return -ESRCH;
+		}
+
+		ret = freq_qos_add_request(&policy->constraints, req,
+						FREQ_QOS_MIN, policy->min);
+		if (ret < 0) {
+			pr_err("%s: Failed to add freq constraint (%d)\n",
+							__func__, ret);
+			return ret;
+		}
+	}
+
+	ret = input_register_handler(&inputboost_input_handler);
+	return ret;
+}

+ 177 - 0
kernel/sched/walt/preemptirq_long.c

@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021 The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/ftrace.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <trace/hooks/preemptirq.h>
+#define CREATE_TRACE_POINTS
+#include "preemptirq_long.h"
+
+#define IRQSOFF_SENTINEL 0x0fffDEAD
+
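+/* default thresholds: preempt-off 1 ms, irqs-off tracing 5 ms, irqs-off crash 10 ms */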
+static unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000;
+static unsigned int sysctl_irqsoff_tracing_threshold_ns = 5000000;
+static unsigned int sysctl_irqsoff_dmesg_output_enabled;
+static unsigned int sysctl_irqsoff_crash_sentinel_value;
+static unsigned int sysctl_irqsoff_crash_threshold_ns = 10000000;
+
+static unsigned int half_million = 500000;
+static unsigned int one_hundred_million = 100000000;
+static unsigned int one_million = 1000000;
+
+static DEFINE_PER_CPU(u64, irq_disabled_ts);
+
+/*
+ * Preemption disable tracking requires additional context
+ * to rule out false positives. See the comment in
+ * test_preempt_disable_long() for more details.
+ */
+struct preempt_store {
+	u64		ts;
+	int		pid;
+	unsigned long	ncsw;
+};
+static DEFINE_PER_CPU(struct preempt_store, the_ps);
+
+static void note_irq_disable(void *u1, unsigned long u2, unsigned long u3)
+{
+	if (is_idle_task(current))
+		return;
+
+	/*
+	 * We just have to note down the time stamp here. We
+	 * use stacktrace trigger feature to print the stacktrace.
+	 */
+	this_cpu_write(irq_disabled_ts, sched_clock());
+}
+
+static void test_irq_disable_long(void *u1, unsigned long u2, unsigned long u3)
+{
+	u64 ts = this_cpu_read(irq_disabled_ts);
+
+	if (!ts)
+		return;
+
+	this_cpu_write(irq_disabled_ts, 0);
+	ts = sched_clock() - ts;
+
+	if (ts > sysctl_irqsoff_tracing_threshold_ns) {
+		trace_irq_disable_long(ts);
+
+		if (sysctl_irqsoff_dmesg_output_enabled == IRQSOFF_SENTINEL)
+			printk_deferred("D=%llu C:(%ps<-%ps<-%ps<-%ps)\n",
+					ts, (void *)CALLER_ADDR2,
+					(void *)CALLER_ADDR3,
+					(void *)CALLER_ADDR4,
+					(void *)CALLER_ADDR5);
+	}
+
+	if (sysctl_irqsoff_crash_sentinel_value == IRQSOFF_SENTINEL &&
+			ts > sysctl_irqsoff_crash_threshold_ns) {
+		printk_deferred("delta=%llu(ns) > crash_threshold=%u(ns) Task=%s\n",
+				ts, sysctl_irqsoff_crash_threshold_ns,
+				current->comm);
+		BUG_ON(1);
+	}
+}
+
+static void note_preempt_disable(void *u1, unsigned long u2, unsigned long u3)
+{
+	struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
+
+	ps->ts = sched_clock();
+	ps->pid = current->pid;
+	ps->ncsw = current->nvcsw + current->nivcsw;
+}
+
+static void test_preempt_disable_long(void *u1, unsigned long u2,
+				      unsigned long u3)
+{
+	struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
+	u64 delta = 0;
+
+	if (!ps->ts)
+		return;
+
+	/*
+	 * schedule() calls __schedule() with preemption disabled.
+	 * If we entered idle and are exiting it now, preemption
+	 * appears to have been disabled the whole time. Detect this
+	 * by checking whether preemption was disabled and re-enabled
+	 * by the same task. The same task could also be rescheduled
+	 * right after idle, so compare the context switch count as
+	 * well to rule that out.
+	 */
+	if (ps->pid == current->pid && (ps->ncsw == current->nvcsw +
+				current->nivcsw))
+		delta = sched_clock() - ps->ts;
+
+	ps->ts = 0;
+	if (delta > sysctl_preemptoff_tracing_threshold_ns)
+		trace_preempt_disable_long(delta);
+}
+
+static struct ctl_table preemptirq_long_table[] = {
+	{
+		.procname	= "preemptoff_tracing_threshold_ns",
+		.data		= &sysctl_preemptoff_tracing_threshold_ns,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "irqsoff_tracing_threshold_ns",
+		.data		= &sysctl_irqsoff_tracing_threshold_ns,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= &half_million,
+		.extra2		= &one_hundred_million,
+	},
+	{
+		.procname	= "irqsoff_dmesg_output_enabled",
+		.data		= &sysctl_irqsoff_dmesg_output_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "irqsoff_crash_sentinel_value",
+		.data		= &sysctl_irqsoff_crash_sentinel_value,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "irqsoff_crash_threshold_ns",
+		.data		= &sysctl_irqsoff_crash_threshold_ns,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= &one_million,
+		.extra2		= &one_hundred_million,
+	},
+	{ }
+};
+
+int preemptirq_long_init(void)
+{
+	if (!register_sysctl("preemptirq", preemptirq_long_table)) {
+		pr_err("Fail to register sysctl table\n");
+		return -EPERM;
+	}
+
+	register_trace_android_rvh_irqs_disable(note_irq_disable, NULL);
+	register_trace_android_rvh_irqs_enable(test_irq_disable_long, NULL);
+	register_trace_android_rvh_preempt_disable(note_preempt_disable, NULL);
+	register_trace_android_rvh_preempt_enable(test_preempt_disable_long,
+						 NULL);
+
+	return 0;
+}

+ 2 - 2
include/trace/events/preemptirq_long.h → kernel/sched/walt/preemptirq_long.h

@@ -1,13 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2020 The Linux Foundation. All rights reserved.
+ * Copyright (c) 2021 The Linux Foundation. All rights reserved.
  */
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM preemptirq_long
 
 #undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH trace/events
+#define TRACE_INCLUDE_PATH .
 
 #if !defined(_TRACE_PREEMPTIRQ_LONG_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_PREEMPTIRQ_LONG_H

+ 52 - 0
kernel/sched/walt/qc_vas.c

@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include "walt.h"
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+cpumask_t pending_active_mask = CPU_MASK_NONE;
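+/*
+ * Count the CPUs in @mask that are pause-pending (cleared from
+ * pending_active_mask), optionally including offline CPUs as well.
+ */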
+int sched_pause_count(const cpumask_t *mask, bool include_offline)
+{
+	cpumask_t count_mask = CPU_MASK_NONE;
+	cpumask_t pause_mask = CPU_MASK_NONE;
+
+	if (cpumask_any(&pending_active_mask) >= nr_cpu_ids) {
+		/* initialize pending_active_state */
+		cpumask_copy(&pending_active_mask, cpu_active_mask);
+	}
+
+	if (include_offline) {
+
+		/* get all offline or paused cpus */
+		cpumask_complement(&pause_mask, &pending_active_mask);
+		cpumask_complement(&count_mask, cpu_online_mask);
+		cpumask_or(&count_mask, &count_mask, &pause_mask);
+
+		/* get all offline or paused cpus in this cluster */
+		cpumask_and(&count_mask, &count_mask, mask);
+	} else {
+		cpumask_andnot(&count_mask, mask, &pending_active_mask);
+	}
+
+	return cpumask_weight(&count_mask);
+}
+
+void sched_pause_pending(int cpu)
+{
+	cpumask_clear_cpu(cpu, &pending_active_mask);
+}
+
+void sched_unpause_pending(int cpu)
+{
+	cpumask_set_cpu(cpu, &pending_active_mask);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */

+ 250 - 0
kernel/sched/walt/sched_avg.c

@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved.
+ */
+
+/*
+ * Scheduler hook for average runqueue determination
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+
+#include "walt.h"
+#include "trace.h"
+
+static DEFINE_PER_CPU(u64, nr_prod_sum);
+static DEFINE_PER_CPU(u64, last_time);
+static DEFINE_PER_CPU(u64, nr_big_prod_sum);
+static DEFINE_PER_CPU(u64, nr);
+static DEFINE_PER_CPU(u64, nr_max);
+
+static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
+static s64 last_get_time;
+
+static DEFINE_PER_CPU(atomic64_t, busy_hyst_end_time) = ATOMIC64_INIT(0);
+
+static DEFINE_PER_CPU(u64, hyst_time);
+static DEFINE_PER_CPU(u64, coloc_hyst_busy);
+static DEFINE_PER_CPU(u64, coloc_hyst_time);
+
+#define NR_THRESHOLD_PCT		15
+#define MAX_RTGB_TIME (sysctl_sched_coloc_busy_hyst_max_ms * NSEC_PER_MSEC)
+
+/**
+ * sched_get_nr_running_avg
+ * @stats: per-CPU array filled with the average nr_running, nr_misfit and
+ *	    nr_max values since the last poll. Averages are scaled by 100
+ *	    to retain two decimal points of accuracy.
+ *
+ * Obtains the average nr_running value since the last poll.
+ * This function may not be called concurrently with itself.
+ */
+void sched_get_nr_running_avg(struct sched_avg_stats *stats)
+{
+	int cpu;
+	u64 curr_time = sched_clock();
+	u64 period = curr_time - last_get_time;
+	u64 tmp_nr, tmp_misfit;
+	bool any_hyst_time = false;
+
+	if (!period)
+		return;
+
+	/* read and reset nr_running counts */
+	for_each_possible_cpu(cpu) {
+		unsigned long flags;
+		u64 diff;
+
+		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+		curr_time = sched_clock();
+		diff = curr_time - per_cpu(last_time, cpu);
+		BUG_ON((s64)diff < 0);
+
+		tmp_nr = per_cpu(nr_prod_sum, cpu);
+		tmp_nr += per_cpu(nr, cpu) * diff;
+		tmp_nr = div64_u64((tmp_nr * 100), period);
+
+		tmp_misfit = per_cpu(nr_big_prod_sum, cpu);
+		tmp_misfit += walt_big_tasks(cpu) * diff;
+		tmp_misfit = div64_u64((tmp_misfit * 100), period);
+
+		/*
+		 * NR_THRESHOLD_PCT rounds a fractional task count up only
+		 * when it is at least 0.85 (e.g. 1.84 stays at 1, while
+		 * 1.85 becomes 2), compensating for any overestimation.
+		 */
+		stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT),
+								100);
+		stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit +
+						NR_THRESHOLD_PCT), 100);
+		stats[cpu].nr_max = per_cpu(nr_max, cpu);
+		stats[cpu].nr_scaled = tmp_nr;
+
+		trace_sched_get_nr_running_avg(cpu, stats[cpu].nr,
+				stats[cpu].nr_misfit, stats[cpu].nr_max,
+				stats[cpu].nr_scaled);
+
+		per_cpu(last_time, cpu) = curr_time;
+		per_cpu(nr_prod_sum, cpu) = 0;
+		per_cpu(nr_big_prod_sum, cpu) = 0;
+		per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+
+		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+	}
+
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(coloc_hyst_time, cpu)) {
+			any_hyst_time = true;
+			break;
+		}
+	}
+	if (any_hyst_time && get_rtgb_active_time() >= MAX_RTGB_TIME)
+		sched_update_hyst_times();
+
+	last_get_time = curr_time;
+
+}
+EXPORT_SYMBOL(sched_get_nr_running_avg);
+
+void sched_update_hyst_times(void)
+{
+	bool rtgb_active;
+	int cpu;
+	unsigned long cpu_cap, coloc_busy_pct;
+
+	rtgb_active = is_rtgb_active() && (sched_boost() != CONSERVATIVE_BOOST)
+			&& (get_rtgb_active_time() < MAX_RTGB_TIME);
+
+	for_each_possible_cpu(cpu) {
+		cpu_cap = arch_scale_cpu_capacity(cpu);
+		coloc_busy_pct = sysctl_sched_coloc_busy_hyst_cpu_busy_pct[cpu];
+		per_cpu(hyst_time, cpu) = (BIT(cpu)
+			     & sysctl_sched_busy_hyst_enable_cpus) ?
+			     sysctl_sched_busy_hyst : 0;
+		per_cpu(coloc_hyst_time, cpu) = ((BIT(cpu)
+			     & sysctl_sched_coloc_busy_hyst_enable_cpus)
+			     && rtgb_active) ?
+			     sysctl_sched_coloc_busy_hyst_cpu[cpu] : 0;
+		per_cpu(coloc_hyst_busy, cpu) = mult_frac(cpu_cap,
+							coloc_busy_pct, 100);
+	}
+}
+
+#define BUSY_NR_RUN		3
+#define BUSY_LOAD_FACTOR	10
+static inline void update_busy_hyst_end_time(int cpu, bool dequeue,
+				unsigned long prev_nr_run, u64 curr_time)
+{
+	bool nr_run_trigger = false;
+	bool load_trigger = false, coloc_load_trigger = false;
+	u64 agg_hyst_time;
+
+	if (!per_cpu(hyst_time, cpu) && !per_cpu(coloc_hyst_time, cpu))
+		return;
+
+	if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN)
+		nr_run_trigger = true;
+
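+	/*
+	 * With BUSY_LOAD_FACTOR of 10, the CPU counts as loaded when its
+	 * util exceeds one tenth of the original capacity.
+	 */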
+	if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) >
+			capacity_orig_of(cpu))
+		load_trigger = true;
+
+	if (dequeue && cpu_util(cpu) > per_cpu(coloc_hyst_busy, cpu))
+		coloc_load_trigger = true;
+
+	agg_hyst_time = max((nr_run_trigger || load_trigger) ?
+				per_cpu(hyst_time, cpu) : 0,
+				(nr_run_trigger || coloc_load_trigger) ?
+				per_cpu(coloc_hyst_time, cpu) : 0);
+
+	if (agg_hyst_time)
+		atomic64_set(&per_cpu(busy_hyst_end_time, cpu),
+				curr_time + agg_hyst_time);
+}
+
+int sched_busy_hyst_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	if (table->maxlen > (sizeof(unsigned int) * num_possible_cpus()))
+		table->maxlen = sizeof(unsigned int) * num_possible_cpus();
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (!ret && write)
+		sched_update_hyst_times();
+
+	return ret;
+}
+
+/**
+ * sched_update_nr_prod
+ * @cpu: The CPU whose nr_running average is being updated.
+ * @enq: true for an enqueue, false for a dequeue on this CPU.
+ *
+ * Update the running average with the latest nr_running value for the CPU.
+ */
+void sched_update_nr_prod(int cpu, bool enq)
+{
+	u64 diff;
+	u64 curr_time;
+	unsigned long flags, nr_running;
+
+	spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
+	nr_running = per_cpu(nr, cpu);
+	curr_time = sched_clock();
+	diff = curr_time - per_cpu(last_time, cpu);
+	BUG_ON((s64)diff < 0);
+	per_cpu(last_time, cpu) = curr_time;
+	per_cpu(nr, cpu) = cpu_rq(cpu)->nr_running;
+
+	if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
+		per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+
+	update_busy_hyst_end_time(cpu, !enq, nr_running, curr_time);
+
+	per_cpu(nr_prod_sum, cpu) += nr_running * diff;
+	per_cpu(nr_big_prod_sum, cpu) += walt_big_tasks(cpu) * diff;
+	spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
+}
+
+/*
+ * Returns the CPU utilization % in the last window.
+ */
+unsigned int sched_get_cpu_util(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 util;
+	unsigned long capacity, flags;
+	unsigned int busy;
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	capacity = capacity_orig_of(cpu);
+
+	util = wrq->prev_runnable_sum + wrq->grp_time.prev_runnable_sum;
+	util = div64_u64(util, sched_ravg_window >> SCHED_CAPACITY_SHIFT);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	util = (util >= capacity) ? capacity : util;
+	busy = div64_ul((util * 100), capacity);
+	return busy;
+}
+
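+/*
+ * Return how much longer (in ns) low power modes should be disallowed
+ * on @cpu, based on its busy hysteresis end time.
+ */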
+u64 sched_lpm_disallowed_time(int cpu)
+{
+	u64 now = sched_clock();
+	u64 bias_end_time = atomic64_read(&per_cpu(busy_hyst_end_time, cpu));
+
+	if (now < bias_end_time)
+		return bias_end_time - now;
+
+	return 0;
+}
+EXPORT_SYMBOL(sched_lpm_disallowed_time);

+ 900 - 0
kernel/sched/walt/sysctl.c

@@ -0,0 +1,900 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include "walt.h"
+
+static int neg_three = -3;
+static int three = 3;
+static int two_hundred_fifty_five = 255;
+static unsigned int ns_per_sec = NSEC_PER_SEC;
+static unsigned int one_hundred_thousand = 100000;
+static unsigned int two_hundred_million = 200000000;
+static int __maybe_unused two = 2;
+static int __maybe_unused four = 4;
+static int one_hundred = 100;
+static int one_thousand = 1000;
+
+/*
+ * CFS task prio range is [100 ... 139]
+ * 120 is the default prio.
+ * RTG boost range is [100 ... 119] because giving
+ * boost for [120 .. 139] does not make sense.
+ * 99 means disabled and it is the default value.
+ */
+static unsigned int min_cfs_boost_prio = 99;
+static unsigned int max_cfs_boost_prio = 119;
+
+unsigned int sysctl_sched_capacity_margin_up_pct[MAX_MARGIN_LEVELS];
+unsigned int sysctl_sched_capacity_margin_dn_pct[MAX_MARGIN_LEVELS];
+unsigned int sysctl_sched_busy_hyst_enable_cpus;
+unsigned int sysctl_sched_busy_hyst;
+unsigned int sysctl_sched_coloc_busy_hyst_enable_cpus;
+unsigned int sysctl_sched_coloc_busy_hyst_cpu[WALT_NR_CPUS];
+unsigned int sysctl_sched_coloc_busy_hyst_max_ms;
+unsigned int sysctl_sched_coloc_busy_hyst_cpu_busy_pct[WALT_NR_CPUS];
+unsigned int sysctl_sched_boost;
+unsigned int sysctl_sched_wake_up_idle[2];
+unsigned int sysctl_input_boost_ms;
+unsigned int sysctl_input_boost_freq[8];
+unsigned int sysctl_sched_boost_on_input;
+unsigned int sysctl_sched_init_stage;
+unsigned int sysctl_sched_load_boost[WALT_NR_CPUS];
+
+/* sysctl nodes accessed by other files */
+unsigned int __read_mostly sysctl_sched_coloc_downmigrate_ns;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct;
+unsigned int __read_mostly sysctl_sched_window_stats_policy;
+unsigned int sysctl_sched_ravg_window_nr_ticks;
+unsigned int sysctl_sched_dynamic_ravg_window_enable;
+unsigned int sysctl_sched_walt_rotate_big_tasks;
+unsigned int sysctl_sched_task_unfilter_period;
+unsigned int __read_mostly sysctl_sched_asym_cap_sibling_freq_match_pct;
+unsigned int sysctl_walt_low_latency_task_threshold; /* disabled by default */
+unsigned int sysctl_task_read_pid;
+unsigned int sysctl_sched_conservative_pl;
+unsigned int sysctl_sched_min_task_util_for_boost = 51;
+unsigned int sysctl_sched_min_task_util_for_colocation = 35;
+unsigned int sysctl_sched_many_wakeup_threshold = WALT_MANY_WAKEUP_DEFAULT;
+const int sched_user_hint_max = 1000;
+
+static void init_tg_pointers(void)
+{
+	struct cgroup_subsys_state *css = &root_task_group.css;
+	struct cgroup_subsys_state *top_css = css;
+
+	/* ptrs are already initialized */
+	if (task_group_topapp)
+		return;
+
+	css_for_each_child(css, top_css) {
+		if (!strcmp(css->cgroup->kn->name, "top-app")) {
+			task_group_topapp = css_tg(css);
+			walt_init_topapp_tg(task_group_topapp);
+		} else if (!strcmp(css->cgroup->kn->name, "foreground")) {
+			task_group_foreground = css_tg(css);
+			walt_init_foreground_tg(task_group_foreground);
+		} else {
+			walt_init_tg(css_tg(css));
+		}
+	}
+}
+
+static int walt_init_stage_handler(struct ctl_table *table,
+				int write, void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret;
+	static DEFINE_MUTEX(mutex);
+	int old_value = sysctl_sched_init_stage;
+
+	mutex_lock(&mutex);
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto unlock;
+
+	if (sysctl_sched_init_stage == 1 &&
+			old_value != sysctl_sched_init_stage) {
+		init_tg_pointers();
+	}
+
+unlock:
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static int walt_proc_group_thresholds_handler(struct ctl_table *table, int write,
+				       void __user *buffer, size_t *lenp,
+				       loff_t *ppos)
+{
+	int ret;
+	static DEFINE_MUTEX(mutex);
+	struct rq *rq = cpu_rq(cpumask_first(cpu_possible_mask));
+	unsigned long flags;
+
+	if (unlikely(num_sched_clusters <= 0))
+		return -EPERM;
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write) {
+		mutex_unlock(&mutex);
+		return ret;
+	}
+
+	/*
+	 * The load scale factor update happens with all
+	 * rqs locked. so acquiring 1 CPU rq lock and
+	 * updating the thresholds is sufficient for
+	 * an atomic update.
+	 */
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	walt_update_group_thresholds();
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+static int walt_proc_user_hint_handler(struct ctl_table *table,
+				int write, void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret;
+	unsigned int old_value;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+
+	sched_user_hint_reset_time = jiffies + HZ;
+	old_value = sysctl_sched_user_hint;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write || (old_value == sysctl_sched_user_hint))
+		goto unlock;
+
+	walt_irq_work_queue(&walt_migration_irq_work);
+
+unlock:
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static int sched_ravg_window_handler(struct ctl_table *table,
+				int write, void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret = -EPERM;
+	static DEFINE_MUTEX(mutex);
+	int val = sysctl_sched_ravg_window_nr_ticks;
+
+	struct ctl_table tmp = {
+		.data	= &val,
+		.maxlen	= sizeof(val),
+		.mode	= table->mode,
+	};
+
+	mutex_lock(&mutex);
+
+	if (write && (HZ != 250 || !sysctl_sched_dynamic_ravg_window_enable))
+		goto unlock;
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (ret || !write || (val == sysctl_sched_ravg_window_nr_ticks))
+		goto unlock;
+
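+	/* with HZ=250 a tick is 4 ms, so the allowed values map to 8/12/16/20/32 ms windows */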
+	if (val != 2 && val != 3 && val != 4 && val != 5 && val != 8) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	sysctl_sched_ravg_window_nr_ticks = val;
+	sched_window_nr_ticks_change();
+
+unlock:
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+enum {
+	TASK_BEGIN = 0,
+	WAKE_UP_IDLE,
+	INIT_TASK_LOAD,
+	GROUP_ID,
+	PER_TASK_BOOST,
+	PER_TASK_BOOST_PERIOD_MS,
+	LOW_LATENCY,
+};
+
+static int sched_task_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret, param;
+	struct task_struct *task;
+	int pid_and_val[2] = {-1, -1};
+	int val;
+	struct walt_task_struct *wts;
+
+	struct ctl_table tmp = {
+		.data	= &pid_and_val,
+		.maxlen	= sizeof(pid_and_val),
+		.mode	= table->mode,
+	};
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+
+	if (!write) {
+		if (sysctl_task_read_pid <= 0) {
+			ret = -ENOENT;
+			goto unlock_mutex;
+		}
+		task = get_pid_task(find_vpid(sysctl_task_read_pid),
+				PIDTYPE_PID);
+		if (!task) {
+			ret = -ENOENT;
+			goto unlock_mutex;
+		}
+		wts = (struct walt_task_struct *) task->android_vendor_data1;
+		pid_and_val[0] = sysctl_task_read_pid;
+		param = (unsigned long)table->data;
+		switch (param) {
+		case WAKE_UP_IDLE:
+			pid_and_val[1] = wts->wake_up_idle;
+			break;
+		case INIT_TASK_LOAD:
+			pid_and_val[1] = wts->init_load_pct;
+			break;
+		case GROUP_ID:
+			pid_and_val[1] = sched_get_group_id(task);
+			break;
+		case PER_TASK_BOOST:
+			pid_and_val[1] = wts->boost;
+			break;
+		case PER_TASK_BOOST_PERIOD_MS:
+			pid_and_val[1] =
+				div64_ul(wts->boost_period,
+					 1000000UL);
+			break;
+		case LOW_LATENCY:
+			pid_and_val[1] = wts->low_latency;
+			break;
+		default:
+			ret = -EINVAL;
+			goto put_task;
+		}
+		ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+		goto put_task;
+	}
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		goto unlock_mutex;
+
+	if (pid_and_val[0] <= 0 || pid_and_val[1] < 0) {
+		ret = -ENOENT;
+		goto unlock_mutex;
+	}
+
+	/* parsed the values successfully in pid_and_val[] array */
+	task = get_pid_task(find_vpid(pid_and_val[0]), PIDTYPE_PID);
+	if (!task) {
+		ret = -ENOENT;
+		goto unlock_mutex;
+	}
+	wts = (struct walt_task_struct *) task->android_vendor_data1;
+	param = (unsigned long)table->data;
+	val = pid_and_val[1];
+	switch (param) {
+	case WAKE_UP_IDLE:
+		wts->wake_up_idle = val;
+		break;
+	case INIT_TASK_LOAD:
+		if (pid_and_val[1] < 0 || pid_and_val[1] > 100) {
+			ret = -EINVAL;
+			goto put_task;
+		}
+		wts->init_load_pct = val;
+		break;
+	case GROUP_ID:
+		ret = sched_set_group_id(task, val);
+		break;
+	case PER_TASK_BOOST:
+		if (val < TASK_BOOST_NONE || val >= TASK_BOOST_END) {
+			ret = -EINVAL;
+			goto put_task;
+		}
+		wts->boost = val;
+		if (val == 0)
+			wts->boost_period = 0;
+		break;
+	case PER_TASK_BOOST_PERIOD_MS:
+		if (wts->boost == 0 && val) {
+			/* setting boost period w/o boost is invalid */
+			ret = -EINVAL;
+			goto put_task;
+		}
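+		/* value is given in milliseconds; store the boost period in nanoseconds */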
+		wts->boost_period = (u64)val * 1000 * 1000;
+		wts->boost_expires = sched_clock() + wts->boost_period;
+		break;
+	case LOW_LATENCY:
+		wts->low_latency = val;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+put_task:
+	put_task_struct(task);
+unlock_mutex:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+static int sched_load_boost_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret, i;
+	unsigned int *data = (unsigned int *)table->data;
+	int val[WALT_NR_CPUS];
+
+	struct ctl_table tmp = {
+		.data	= &val,
+		.maxlen	= sizeof(val),
+		.mode	= table->mode,
+	};
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+
+	if (!write) {
+		ret = proc_dointvec(table, write, buffer, lenp, ppos);
+		goto unlock_mutex;
+	}
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		goto unlock_mutex;
+
+	for (i = 0; i < WALT_NR_CPUS; i++) {
+		if (val[i] < -100 || val[i] > 1000) {
+			ret = -EINVAL;
+			goto unlock_mutex;
+		}
+	}
+
+	/* all values are valid; update the data */
+	for (i = 0; i < WALT_NR_CPUS; i++)
+		data[i] = val[i];
+
+unlock_mutex:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static void sched_update_updown_migrate_values(bool up)
+{
+	int i = 0, cpu;
+	struct walt_sched_cluster *cluster;
+	int cap_margin_levels = num_sched_clusters - 1;
+
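+	/*
+	 * The per-CPU margin is the inverse of the pct tunable, e.g. an 80%
+	 * threshold becomes SCHED_FIXEDPOINT_SCALE * 100 / 80 = 1280.
+	 */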
+	if (cap_margin_levels > 1) {
+		/*
+		 * No need to worry about CPUs in last cluster
+		 * if there are more than 2 clusters in the system
+		 */
+		for_each_sched_cluster(cluster) {
+			for_each_cpu(cpu, &cluster->cpus) {
+				if (up)
+					sched_capacity_margin_up[cpu] =
+					SCHED_FIXEDPOINT_SCALE * 100 /
+					sysctl_sched_capacity_margin_up_pct[i];
+				else
+					sched_capacity_margin_down[cpu] =
+					SCHED_FIXEDPOINT_SCALE * 100 /
+					sysctl_sched_capacity_margin_dn_pct[i];
+			}
+
+			if (++i >= cap_margin_levels)
+				break;
+		}
+	} else {
+		for_each_possible_cpu(cpu) {
+			if (up)
+				sched_capacity_margin_up[cpu] =
+				SCHED_FIXEDPOINT_SCALE * 100 /
+				sysctl_sched_capacity_margin_up_pct[0];
+			else
+				sched_capacity_margin_down[cpu] =
+				SCHED_FIXEDPOINT_SCALE * 100 /
+				sysctl_sched_capacity_margin_dn_pct[0];
+		}
+	}
+}
+
+int sched_updown_migrate_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret, i;
+	unsigned int *data = (unsigned int *)table->data;
+	static DEFINE_MUTEX(mutex);
+	int cap_margin_levels = num_sched_clusters ? num_sched_clusters - 1 : 0;
+	int val[MAX_MARGIN_LEVELS];
+	struct ctl_table tmp = {
+		.data	= &val,
+		.maxlen	= sizeof(int) * cap_margin_levels,
+		.mode	= table->mode,
+	};
+
+	if (cap_margin_levels <= 0)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+
+	if (!write) {
+		ret = proc_dointvec(table, write, buffer, lenp, ppos);
+		goto unlock_mutex;
+	}
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		goto unlock_mutex;
+
+	/* check if valid pct values are passed in */
+	for (i = 0; i < cap_margin_levels; i++) {
+		if (val[i] <= 0 || val[i] > 100) {
+			ret = -EINVAL;
+			goto unlock_mutex;
+		}
+	}
+
+	/* check up pct is greater than dn pct */
+	if (data == &sysctl_sched_capacity_margin_up_pct[0]) {
+		for (i = 0; i < cap_margin_levels; i++) {
+			if (val[i] < sysctl_sched_capacity_margin_dn_pct[i]) {
+				ret = -EINVAL;
+				goto unlock_mutex;
+			}
+		}
+	} else {
+		for (i = 0; i < cap_margin_levels; i++) {
+			if (sysctl_sched_capacity_margin_up_pct[i] < val[i]) {
+				ret = -EINVAL;
+				goto unlock_mutex;
+			}
+		}
+	}
+
+	/* all values are valid; update the data */
+	for (i = 0; i < cap_margin_levels; i++)
+		data[i] = val[i];
+
+	/* update individual cpu thresholds */
+	sched_update_updown_migrate_values(data == &sysctl_sched_capacity_margin_up_pct[0]);
+
+unlock_mutex:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+
+struct ctl_table input_boost_sysctls[] = {
+	{
+		.procname	= "input_boost_ms",
+		.data		= &sysctl_input_boost_ms,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_hundred_thousand,
+	},
+	{
+		.procname	= "input_boost_freq",
+		.data		= &sysctl_input_boost_freq,
+		.maxlen		= sizeof(unsigned int) * 8,
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
+	{
+		.procname	= "sched_boost_on_input",
+		.data		= &sysctl_sched_boost_on_input,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
+	{ }
+};
+
+struct ctl_table walt_table[] = {
+	{
+		.procname	= "sched_init_stage",
+		.data		= &sysctl_sched_init_stage,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= walt_init_stage_handler,
+	},
+	{
+		.procname	= "sched_user_hint",
+		.data		= &sysctl_sched_user_hint,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= walt_proc_user_hint_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= (void *)&sched_user_hint_max,
+	},
+	{
+		.procname	= "sched_window_stats_policy",
+		.data		= &sysctl_sched_window_stats_policy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &four,
+	},
+	{
+		.procname	= "sched_group_upmigrate",
+		.data		= &sysctl_sched_group_upmigrate_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= walt_proc_group_thresholds_handler,
+		.extra1		= &sysctl_sched_group_downmigrate_pct,
+	},
+	{
+		.procname	= "sched_group_downmigrate",
+		.data		= &sysctl_sched_group_downmigrate_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= walt_proc_group_thresholds_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &sysctl_sched_group_upmigrate_pct,
+	},
+	{
+		.procname	= "sched_boost",
+		.data		= &sysctl_sched_boost,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_boost_handler,
+		.extra1		= &neg_three,
+		.extra2		= &three,
+	},
+	{
+		.procname	= "sched_conservative_pl",
+		.data		= &sysctl_sched_conservative_pl,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "sched_many_wakeup_threshold",
+		.data		= &sysctl_sched_many_wakeup_threshold,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &two,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "sched_walt_rotate_big_tasks",
+		.data		= &sysctl_sched_walt_rotate_big_tasks,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "sched_min_task_util_for_boost",
+		.data		= &sysctl_sched_min_task_util_for_boost,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "sched_min_task_util_for_colocation",
+		.data		= &sysctl_sched_min_task_util_for_colocation,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "sched_asym_cap_sibling_freq_match_pct",
+		.data		= &sysctl_sched_asym_cap_sibling_freq_match_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &one_hundred,
+	},
+	{
+		.procname	= "sched_coloc_downmigrate_ns",
+		.data		= &sysctl_sched_coloc_downmigrate_ns,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+	},
+	{
+		.procname	= "sched_task_unfilter_period",
+		.data		= &sysctl_sched_task_unfilter_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &two_hundred_million,
+	},
+	{
+		.procname	= "sched_busy_hysteresis_enable_cpus",
+		.data		= &sysctl_sched_busy_hyst_enable_cpus,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_busy_hyst_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &two_hundred_fifty_five,
+	},
+	{
+		.procname	= "sched_busy_hyst_ns",
+		.data		= &sysctl_sched_busy_hyst,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_busy_hyst_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &ns_per_sec,
+	},
+	{
+		.procname	= "sched_coloc_busy_hysteresis_enable_cpus",
+		.data		= &sysctl_sched_coloc_busy_hyst_enable_cpus,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_busy_hyst_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &two_hundred_fifty_five,
+	},
+	{
+		.procname	= "sched_coloc_busy_hyst_cpu_ns",
+		.data		= &sysctl_sched_coloc_busy_hyst_cpu,
+		.maxlen		= sizeof(unsigned int) * WALT_NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= sched_busy_hyst_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &ns_per_sec,
+	},
+	{
+		.procname	= "sched_coloc_busy_hyst_max_ms",
+		.data		= &sysctl_sched_coloc_busy_hyst_max_ms,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_busy_hyst_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_hundred_thousand,
+	},
+	{
+		.procname	= "sched_coloc_busy_hyst_cpu_busy_pct",
+		.data		= &sysctl_sched_coloc_busy_hyst_cpu_busy_pct,
+		.maxlen		= sizeof(unsigned int) * WALT_NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= sched_busy_hyst_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_hundred,
+	},
+	{
+		.procname	= "sched_ravg_window_nr_ticks",
+		.data		= &sysctl_sched_ravg_window_nr_ticks,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_ravg_window_handler,
+	},
+	{
+		.procname	= "sched_dynamic_ravg_window_enable",
+		.data		= &sysctl_sched_dynamic_ravg_window_enable,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "sched_upmigrate",
+		.data		= &sysctl_sched_capacity_margin_up_pct,
+		.maxlen		= sizeof(unsigned int) * MAX_MARGIN_LEVELS,
+		.mode		= 0644,
+		.proc_handler	= sched_updown_migrate_handler,
+	},
+	{
+		.procname	= "sched_downmigrate",
+		.data		= &sysctl_sched_capacity_margin_dn_pct,
+		.maxlen		= sizeof(unsigned int) * MAX_MARGIN_LEVELS,
+		.mode		= 0644,
+		.proc_handler	= sched_updown_migrate_handler,
+	},
+	{
+		.procname	= "sched_prefer_spread",
+		.data		= &sysctl_sched_prefer_spread,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &four,
+	},
+	{
+		.procname	= "walt_rtg_cfs_boost_prio",
+		.data		= &sysctl_walt_rtg_cfs_boost_prio,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_cfs_boost_prio,
+		.extra2		= &max_cfs_boost_prio,
+	},
+	{
+		.procname	= "walt_low_latency_task_threshold",
+		.data		= &sysctl_walt_low_latency_task_threshold,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "sched_force_lb_enable",
+		.data		= &sysctl_sched_force_lb_enable,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "sched_lib_name",
+		.data		= sched_lib_name,
+		.maxlen		= LIB_PATH_LENGTH,
+		.mode		= 0644,
+		.proc_handler	= proc_dostring,
+	},
+	{
+		.procname	= "sched_lib_mask_force",
+		.data		= &sched_lib_mask_force,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &two_hundred_fifty_five,
+	},
+	{
+		.procname	= "input_boost",
+		.mode		= 0555,
+		.child		= input_boost_sysctls,
+	},
+	{
+		.procname	= "sched_wake_up_idle",
+		.data		= (int *) WAKE_UP_IDLE,
+		.maxlen		= sizeof(unsigned int) * 2,
+		.mode		= 0644,
+		.proc_handler	= sched_task_handler,
+	},
+	{
+		.procname	= "sched_init_task_load",
+		.data		= (int *) INIT_TASK_LOAD,
+		.maxlen		= sizeof(unsigned int) * 2,
+		.mode		= 0644,
+		.proc_handler	= sched_task_handler,
+	},
+	{
+		.procname	= "sched_group_id",
+		.data		= (int *) GROUP_ID,
+		.maxlen		= sizeof(unsigned int) * 2,
+		.mode		= 0644,
+		.proc_handler	= sched_task_handler,
+	},
+	{
+		.procname	= "sched_per_task_boost",
+		.data		= (int *) PER_TASK_BOOST,
+		.maxlen		= sizeof(unsigned int) * 2,
+		.mode		= 0644,
+		.proc_handler	= sched_task_handler,
+	},
+	{
+		.procname	= "sched_per_task_boost_period_ms",
+		.data		= (int *) PER_TASK_BOOST_PERIOD_MS,
+		.maxlen		= sizeof(unsigned int) * 2,
+		.mode		= 0644,
+		.proc_handler	= sched_task_handler,
+	},
+	{
+		.procname	= "sched_low_latency",
+		.data		= (int *) LOW_LATENCY,
+		.maxlen		= sizeof(unsigned int) * 2,
+		.mode		= 0644,
+		.proc_handler	= sched_task_handler,
+	},
+	{
+		.procname	= "sched_task_read_pid",
+		.data		= &sysctl_task_read_pid,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_load_boost",
+		.data		= &sysctl_sched_load_boost,
+		.maxlen		= sizeof(unsigned int) * 8,
+		.mode		= 0644,
+		.proc_handler	= sched_load_boost_handler,
+	},
+	{ }
+};
+
+struct ctl_table walt_base_table[] = {
+	{
+		.procname	= "walt",
+		.mode		= 0555,
+		.child		= walt_table,
+	},
+	{ },
+};
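+
+/*
+ * A rough sketch of how these tables are expected to surface, assuming the
+ * base table is registered at the sysctl root (the registration call itself
+ * lives elsewhere in this series):
+ *
+ *	register_sysctl_table(walt_base_table);
+ *
+ * which exposes the entries above as /proc/sys/walt/<procname>, e.g.
+ * /proc/sys/walt/sched_boost, with the nested input_boost_sysctls table
+ * showing up under /proc/sys/walt/input_boost/.
+ */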
+
+void walt_tunables(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_MARGIN_LEVELS; i++) {
+		sysctl_sched_capacity_margin_up_pct[i] = 95; /* ~5% margin */
+		sysctl_sched_capacity_margin_dn_pct[i] = 85; /* ~15% margin */
+	}
+
+	sysctl_sched_group_upmigrate_pct = 100;
+
+	sysctl_sched_group_downmigrate_pct = 95;
+
+	sysctl_sched_asym_cap_sibling_freq_match_pct = 100;
+
+	sysctl_sched_task_unfilter_period = 100000000;
+
+	sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG;
+
+	sysctl_sched_ravg_window_nr_ticks = (HZ / NR_WINDOWS_PER_SEC);
+
+	sysctl_sched_dynamic_ravg_window_enable = (HZ == 250);
+
+	sched_load_granule = DEFAULT_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
+
+	sysctl_sched_min_task_util_for_boost = 51;
+
+	sysctl_sched_min_task_util_for_colocation = 35;
+
+	for (i = 0; i < WALT_NR_CPUS; i++) {
+		sysctl_sched_coloc_busy_hyst_cpu[i] = 39000000;
+		sysctl_sched_coloc_busy_hyst_cpu_busy_pct[i] = 10;
+	}
+
+	sysctl_sched_coloc_busy_hyst_enable_cpus = 112;
+
+	sysctl_sched_coloc_busy_hyst_max_ms = 5000;
+
+	sysctl_walt_rtg_cfs_boost_prio = 99; /* disabled by default */
+
+	sched_ravg_window = DEFAULT_SCHED_RAVG_WINDOW;
+
+	sysctl_input_boost_ms = 40;
+
+	for (i = 0; i < 8; i++)
+		sysctl_input_boost_freq[i] = 0;
+}
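+
+/*
+ * Worked example of the window defaults above, assuming
+ * DEFAULT_SCHED_RAVG_WINDOW = 20000000 ns (20 ms) and NR_WINDOWS_PER_SEC = 50:
+ *
+ *	sysctl_sched_ravg_window_nr_ticks = HZ / NR_WINDOWS_PER_SEC;
+ *	// with HZ == 250: 250 / 50 = 5 ticks, i.e. 5 * 4 ms = one 20 ms window
+ *
+ * which is also why sysctl_sched_dynamic_ravg_window_enable defaults to true
+ * only for HZ == 250, where a whole number of ticks fits the window.
+ */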

+ 84 - 0
kernel/sched/walt/trace.c

@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include "walt.h"
+
+static inline void __window_data(u32 *dst, u32 *src)
+{
+	if (src)
+		memcpy(dst, src, nr_cpu_ids * sizeof(u32));
+	else
+		memset(dst, 0, nr_cpu_ids * sizeof(u32));
+}
+
+struct trace_seq;
+const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len)
+{
+	int i;
+	const char *ret = p->buffer + seq_buf_used(&p->seq);
+
+	for (i = 0; i < buf_len; i++)
+		trace_seq_printf(p, "%u ", buf[i]);
+
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
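+
+/*
+ * __window_print() renders the per-cpu sums captured by __window_data()
+ * into the trace seq buffer and hands back a pointer for TP_printk()'s %s,
+ * so a 4-cpu trace line ends up looking roughly like
+ * "cur_window 1200 (300 400 500 0)" (numbers illustrative).
+ */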
+
+static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	if (curr)
+		if (new)
+			return wrq->nt_curr_runnable_sum;
+		else
+			return wrq->curr_runnable_sum;
+	else
+		if (new)
+			return wrq->nt_prev_runnable_sum;
+		else
+			return wrq->prev_runnable_sum;
+}
+
+static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	if (curr)
+		if (new)
+			return wrq->grp_time.nt_curr_runnable_sum;
+		else
+			return wrq->grp_time.curr_runnable_sum;
+	else
+		if (new)
+			return wrq->grp_time.nt_prev_runnable_sum;
+		else
+			return wrq->grp_time.prev_runnable_sum;
+}
+
+static inline s64
+__get_update_sum(struct rq *rq, enum migrate_types migrate_type,
+		 bool src, bool new, bool curr)
+{
+	switch (migrate_type) {
+	case RQ_TO_GROUP:
+		if (src)
+			return __rq_update_sum(rq, curr, new);
+		else
+			return __grp_update_sum(rq, curr, new);
+	case GROUP_TO_RQ:
+		if (src)
+			return __grp_update_sum(rq, curr, new);
+		else
+			return __rq_update_sum(rq, curr, new);
+	default:
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+}
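+
+/*
+ * For reference, the (migrate_type, src) pair above picks which side of the
+ * migration is read:
+ *
+ *	RQ_TO_GROUP,  src == true  -> rq counters    (wrq->*_runnable_sum)
+ *	RQ_TO_GROUP,  src == false -> group counters (wrq->grp_time.*)
+ *	GROUP_TO_RQ,  src == true  -> group counters
+ *	GROUP_TO_RQ,  src == false -> rq counters
+ *
+ * with "new" selecting the nt_ variants and "curr" choosing curr vs. prev
+ * runnable sums.
+ */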
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"

+ 1097 - 0
kernel/sched/walt/trace.h

@@ -0,0 +1,1097 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM schedwalt
+
+#if !defined(_TRACE_WALT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WALT_H
+
+#include <linux/tracepoint.h>
+
+#include "walt.h"
+
+struct rq;
+struct group_cpu_time;
+struct walt_task_struct;
+struct walt_rq;
+struct walt_related_thread_group;
+
+extern const char *task_event_names[];
+
+TRACE_EVENT(sched_update_pred_demand,
+
+	TP_PROTO(struct task_struct *p, u32 runtime, int pct,
+		 unsigned int pred_demand, struct walt_task_struct *wts),
+
+	TP_ARGS(p, runtime, pct, pred_demand, wts),
+
+	TP_STRUCT__entry(
+		__array(char,		comm, TASK_COMM_LEN)
+		__field(pid_t,		pid)
+		__field(unsigned int,	runtime)
+		__field(int,		pct)
+		__field(unsigned int,	pred_demand)
+		__array(u8,		bucket, NUM_BUSY_BUCKETS)
+		__field(int,		cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->runtime	= runtime;
+		__entry->pct		= pct;
+		__entry->pred_demand	= pred_demand;
+		memcpy(__entry->bucket, wts->busy_buckets,
+					NUM_BUSY_BUCKETS * sizeof(u8));
+		__entry->cpu		= task_cpu(p);
+	),
+
+	TP_printk("%d (%s): runtime %u pct %d cpu %d pred_demand %u (buckets: %u %u %u %u %u %u %u %u %u %u)",
+		__entry->pid, __entry->comm,
+		__entry->runtime, __entry->pct, __entry->cpu,
+		__entry->pred_demand, __entry->bucket[0], __entry->bucket[1],
+		__entry->bucket[2], __entry->bucket[3], __entry->bucket[4],
+		__entry->bucket[5], __entry->bucket[6], __entry->bucket[7],
+		__entry->bucket[8], __entry->bucket[9])
+);
+
+TRACE_EVENT(sched_update_history,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+			enum task_event evt, struct walt_rq *wrq, struct walt_task_struct *wts),
+
+	TP_ARGS(rq, p, runtime, samples, evt, wrq, wts),
+
+	TP_STRUCT__entry(
+		__array(char,			comm, TASK_COMM_LEN)
+		__field(pid_t,			pid)
+		__field(unsigned int,		runtime)
+		__field(int,			samples)
+		__field(enum task_event,	evt)
+		__field(unsigned int,		demand)
+		__field(unsigned int,		coloc_demand)
+		__field(unsigned int,		pred_demand)
+		__array(u32,			hist, RAVG_HIST_SIZE_MAX)
+		__field(unsigned int,		nr_big_tasks)
+		__field(int,			cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->runtime	= runtime;
+		__entry->samples	= samples;
+		__entry->evt		= evt;
+		__entry->demand		= wts->demand;
+		__entry->coloc_demand	= wts->coloc_demand;
+		__entry->pred_demand	= wts->pred_demand;
+		memcpy(__entry->hist, wts->sum_history,
+					RAVG_HIST_SIZE_MAX * sizeof(u32));
+		__entry->nr_big_tasks	= wrq->walt_stats.nr_big_tasks;
+		__entry->cpu		= rq->cpu;
+	),
+
+	TP_printk("%d (%s): runtime %u samples %d event %s demand %u coloc_demand %u pred_demand %u (hist: %u %u %u %u %u) cpu %d nr_big %u",
+		__entry->pid, __entry->comm,
+		__entry->runtime, __entry->samples,
+		task_event_names[__entry->evt],
+		__entry->demand, __entry->coloc_demand, __entry->pred_demand,
+		__entry->hist[0], __entry->hist[1],
+		__entry->hist[2], __entry->hist[3],
+		__entry->hist[4], __entry->cpu, __entry->nr_big_tasks)
+);
+
+TRACE_EVENT(sched_get_task_cpu_cycles,
+
+	TP_PROTO(int cpu, int event, u64 cycles,
+			u64 exec_time, struct task_struct *p),
+
+	TP_ARGS(cpu, event, cycles, exec_time, p),
+
+	TP_STRUCT__entry(
+		__field(int,	cpu)
+		__field(int,	event)
+		__field(u64,	cycles)
+		__field(u64,	exec_time)
+		__field(u32,	freq)
+		__field(u32,	legacy_freq)
+		__field(u32,	max_freq)
+		__field(pid_t,	pid)
+		__array(char,	comm, TASK_COMM_LEN)
+	),
+
+	TP_fast_assign(
+		__entry->cpu		= cpu;
+		__entry->event		= event;
+		__entry->cycles		= cycles;
+		__entry->exec_time	= exec_time;
+		__entry->freq		= cpu_cycles_to_freq(cycles, exec_time);
+		__entry->legacy_freq	= sched_cpu_legacy_freq(cpu);
+		__entry->max_freq	= cpu_max_freq(cpu);
+		__entry->pid		= p->pid;
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("cpu=%d event=%d cycles=%llu exec_time=%llu freq=%u legacy_freq=%u max_freq=%u task=%d (%s)",
+		  __entry->cpu, __entry->event, __entry->cycles,
+		  __entry->exec_time, __entry->freq, __entry->legacy_freq,
+		  __entry->max_freq, __entry->pid, __entry->comm)
+);
+
+TRACE_EVENT(sched_update_task_ravg,
+
+	TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
+		 u64 wallclock, u64 irqtime,
+		 struct group_cpu_time *cpu_time, struct walt_rq *wrq,
+		 struct walt_task_struct *wts),
+
+	TP_ARGS(p, rq, evt, wallclock, irqtime, cpu_time, wrq, wts),
+
+	TP_STRUCT__entry(
+		__array(char,			comm, TASK_COMM_LEN)
+		__field(pid_t,			pid)
+		__field(pid_t,			cur_pid)
+		__field(unsigned int,		cur_freq)
+		__field(u64,			wallclock)
+		__field(u64,			mark_start)
+		__field(u64,			delta_m)
+		__field(u64,			win_start)
+		__field(u64,			delta)
+		__field(u64,			irqtime)
+		__field(enum task_event,	evt)
+		__field(unsigned int,		demand)
+		__field(unsigned int,		coloc_demand)
+		__field(unsigned int,		sum)
+		__field(int,			cpu)
+		__field(unsigned int,		pred_demand)
+		__field(u64,			rq_cs)
+		__field(u64,			rq_ps)
+		__field(u64,			grp_cs)
+		__field(u64,			grp_ps)
+		__field(u64,			grp_nt_cs)
+		__field(u64,			grp_nt_ps)
+		__field(u32,			curr_window)
+		__field(u32,			prev_window)
+		__dynamic_array(u32,		curr_sum, nr_cpu_ids)
+		__dynamic_array(u32,		prev_sum, nr_cpu_ids)
+		__field(u64,			nt_cs)
+		__field(u64,			nt_ps)
+		__field(u64,			active_time)
+		__field(u32,			curr_top)
+		__field(u32,			prev_top)
+	),
+
+	TP_fast_assign(
+		__entry->wallclock	= wallclock;
+		__entry->win_start	= wrq->window_start;
+		__entry->delta		= (wallclock - wrq->window_start);
+		__entry->evt		= evt;
+		__entry->cpu		= rq->cpu;
+		__entry->cur_pid	= rq->curr->pid;
+		__entry->cur_freq	= wrq->task_exec_scale;
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->mark_start	= wts->mark_start;
+		__entry->delta_m	= (wallclock - wts->mark_start);
+		__entry->demand		= wts->demand;
+		__entry->coloc_demand	= wts->coloc_demand;
+		__entry->sum		= wts->sum;
+		__entry->irqtime	= irqtime;
+		__entry->pred_demand	= wts->pred_demand;
+		__entry->rq_cs		= wrq->curr_runnable_sum;
+		__entry->rq_ps		= wrq->prev_runnable_sum;
+		__entry->grp_cs		= cpu_time ? cpu_time->curr_runnable_sum : 0;
+		__entry->grp_ps		= cpu_time ? cpu_time->prev_runnable_sum : 0;
+		__entry->grp_nt_cs	= cpu_time ?
+					cpu_time->nt_curr_runnable_sum : 0;
+		__entry->grp_nt_ps	= cpu_time ?
+					cpu_time->nt_prev_runnable_sum : 0;
+		__entry->curr_window	= wts->curr_window;
+		__entry->prev_window	= wts->prev_window;
+		__window_data(__get_dynamic_array(curr_sum),
+						wts->curr_window_cpu);
+		__window_data(__get_dynamic_array(prev_sum),
+						wts->prev_window_cpu);
+		__entry->nt_cs		= wrq->nt_curr_runnable_sum;
+		__entry->nt_ps		= wrq->nt_prev_runnable_sum;
+		__entry->active_time	= wts->active_time;
+		__entry->curr_top	= wrq->curr_top;
+		__entry->prev_top	= wrq->prev_top;
+	),
+
+	TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u coloc_demand: %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_time %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u",
+		__entry->wallclock, __entry->win_start, __entry->delta,
+		task_event_names[__entry->evt], __entry->cpu,
+		__entry->cur_freq, __entry->cur_pid,
+		__entry->pid, __entry->comm, __entry->mark_start,
+		__entry->delta_m, __entry->demand, __entry->coloc_demand,
+		__entry->sum, __entry->irqtime, __entry->pred_demand,
+		__entry->rq_cs, __entry->rq_ps, __entry->curr_window,
+		__window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids),
+		__entry->prev_window,
+		__window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids),
+		__entry->nt_cs, __entry->nt_ps,
+		__entry->active_time, __entry->grp_cs,
+		__entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps,
+		__entry->curr_top, __entry->prev_top)
+);
+
+TRACE_EVENT(sched_update_task_ravg_mini,
+
+	TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt,
+		 u64 wallclock, u64 irqtime,
+		 struct group_cpu_time *cpu_time, struct walt_rq *wrq,
+		 struct walt_task_struct *wts),
+
+	TP_ARGS(p, rq, evt, wallclock, irqtime, cpu_time, wrq, wts),
+
+	TP_STRUCT__entry(
+		__array(char,			comm, TASK_COMM_LEN)
+		__field(pid_t,			pid)
+		__field(u64,			wallclock)
+		__field(u64,			mark_start)
+		__field(u64,			delta_m)
+		__field(u64,			win_start)
+		__field(u64,			delta)
+		__field(enum task_event,	evt)
+		__field(unsigned int,		demand)
+		__field(int,			cpu)
+		__field(u64,			rq_cs)
+		__field(u64,			rq_ps)
+		__field(u64,			grp_cs)
+		__field(u64,			grp_ps)
+		__field(u32,			curr_window)
+		__field(u32,			prev_window)
+	),
+
+	TP_fast_assign(
+		__entry->wallclock	= wallclock;
+		__entry->win_start	= wrq->window_start;
+		__entry->delta		= (wallclock - wrq->window_start);
+		__entry->evt		= evt;
+		__entry->cpu		= rq->cpu;
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->mark_start	= wts->mark_start;
+		__entry->delta_m	= (wallclock - wts->mark_start);
+		__entry->demand		= wts->demand;
+		__entry->rq_cs		= wrq->curr_runnable_sum;
+		__entry->rq_ps		= wrq->prev_runnable_sum;
+		__entry->grp_cs		= cpu_time ? cpu_time->curr_runnable_sum : 0;
+		__entry->grp_ps		= cpu_time ? cpu_time->prev_runnable_sum : 0;
+		__entry->curr_window	= wts->curr_window;
+		__entry->prev_window	= wts->prev_window;
+	),
+
+	TP_printk("wc %llu ws %llu delta %llu event %s cpu %d task %d (%s) ms %llu delta %llu demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u grp_cs %lld grp_ps %lld",
+		__entry->wallclock, __entry->win_start, __entry->delta,
+		task_event_names[__entry->evt], __entry->cpu,
+		__entry->pid, __entry->comm, __entry->mark_start,
+		__entry->delta_m, __entry->demand,
+		__entry->rq_cs, __entry->rq_ps, __entry->curr_window,
+		__entry->prev_window, __entry->grp_cs, __entry->grp_ps)
+);
+
+struct migration_sum_data;
+extern const char *migrate_type_names[];
+
+TRACE_EVENT(sched_set_preferred_cluster,
+
+	TP_PROTO(struct walt_related_thread_group *grp, u64 total_demand),
+
+	TP_ARGS(grp, total_demand),
+
+	TP_STRUCT__entry(
+		__field(int,		id)
+		__field(u64,		total_demand)
+		__field(bool,		skip_min)
+	),
+
+	TP_fast_assign(
+		__entry->id		= grp->id;
+		__entry->total_demand	= total_demand;
+		__entry->skip_min	= grp->skip_min;
+	),
+
+	TP_printk("group_id %d total_demand %llu skip_min %d",
+			__entry->id, __entry->total_demand,
+			__entry->skip_min)
+);
+
+TRACE_EVENT(sched_migration_update_sum,
+
+	TP_PROTO(struct task_struct *p, enum migrate_types migrate_type,
+							struct rq *rq),
+
+	TP_ARGS(p, migrate_type, rq),
+
+	TP_STRUCT__entry(
+		__field(int,			tcpu)
+		__field(int,			pid)
+		__field(enum migrate_types,	migrate_type)
+		__field(s64,			src_cs)
+		__field(s64,			src_ps)
+		__field(s64,			dst_cs)
+		__field(s64,			dst_ps)
+		__field(s64,			src_nt_cs)
+		__field(s64,			src_nt_ps)
+		__field(s64,			dst_nt_cs)
+		__field(s64,			dst_nt_ps)
+	),
+
+	TP_fast_assign(
+		__entry->tcpu		= task_cpu(p);
+		__entry->pid		= p->pid;
+		__entry->migrate_type	= migrate_type;
+		__entry->src_cs		= __get_update_sum(rq, migrate_type,
+							   true, false, true);
+		__entry->src_ps		= __get_update_sum(rq, migrate_type,
+							   true, false, false);
+		__entry->dst_cs		= __get_update_sum(rq, migrate_type,
+							   false, false, true);
+		__entry->dst_ps		= __get_update_sum(rq, migrate_type,
+							   false, false, false);
+		__entry->src_nt_cs	= __get_update_sum(rq, migrate_type,
+							   true, true, true);
+		__entry->src_nt_ps	= __get_update_sum(rq, migrate_type,
+							   true, true, false);
+		__entry->dst_nt_cs	= __get_update_sum(rq, migrate_type,
+							   false, true, true);
+		__entry->dst_nt_ps	= __get_update_sum(rq, migrate_type,
+							   false, true, false);
+	),
+
+	TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld",
+		__entry->pid, __entry->tcpu,
+		migrate_type_names[__entry->migrate_type],
+		__entry->src_cs, __entry->src_ps, __entry->dst_cs,
+		__entry->dst_ps, __entry->src_nt_cs, __entry->src_nt_ps,
+		__entry->dst_nt_cs, __entry->dst_nt_ps)
+);
+
+TRACE_EVENT(sched_set_boost,
+
+	TP_PROTO(int type),
+
+	TP_ARGS(type),
+
+	TP_STRUCT__entry(
+		__field(int, type)
+	),
+
+	TP_fast_assign(
+		__entry->type = type;
+	),
+
+	TP_printk("type %d", __entry->type)
+);
+
+TRACE_EVENT(sched_load_to_gov,
+
+	TP_PROTO(struct rq *rq, u64 aggr_grp_load, u32 tt_load,
+		int freq_aggr, u64 load, int policy,
+		int big_task_rotation,
+		unsigned int user_hint,
+		struct walt_rq *wrq),
+	TP_ARGS(rq, aggr_grp_load, tt_load, freq_aggr, load, policy,
+		big_task_rotation, user_hint, wrq),
+
+	TP_STRUCT__entry(
+		__field(int,	cpu)
+		__field(int,	policy)
+		__field(int,	ed_task_pid)
+		__field(u64,	aggr_grp_load)
+		__field(int,	freq_aggr)
+		__field(u64,	tt_load)
+		__field(u64,	rq_ps)
+		__field(u64,	grp_rq_ps)
+		__field(u64,	nt_ps)
+		__field(u64,	grp_nt_ps)
+		__field(u64,	pl)
+		__field(u64,	load)
+		__field(int,	big_task_rotation)
+		__field(unsigned int, user_hint)
+	),
+
+	TP_fast_assign(
+		__entry->cpu		= cpu_of(rq);
+		__entry->policy		= policy;
+		__entry->ed_task_pid	=
+				wrq->ed_task ? wrq->ed_task->pid : -1;
+		__entry->aggr_grp_load	= aggr_grp_load;
+		__entry->freq_aggr	= freq_aggr;
+		__entry->tt_load	= tt_load;
+		__entry->rq_ps		= wrq->prev_runnable_sum;
+		__entry->grp_rq_ps	= wrq->grp_time.prev_runnable_sum;
+		__entry->nt_ps		= wrq->nt_prev_runnable_sum;
+		__entry->grp_nt_ps	= wrq->grp_time.nt_prev_runnable_sum;
+		__entry->pl		= wrq->walt_stats.pred_demands_sum_scaled;
+		__entry->load		= load;
+		__entry->big_task_rotation	= big_task_rotation;
+		__entry->user_hint	= user_hint;
+	),
+
+	TP_printk("cpu=%d policy=%d ed_task_pid=%d aggr_grp_load=%llu freq_aggr=%d tt_load=%llu rq_ps=%llu grp_rq_ps=%llu nt_ps=%llu grp_nt_ps=%llu pl=%llu load=%llu big_task_rotation=%d user_hint=%u",
+		__entry->cpu, __entry->policy, __entry->ed_task_pid,
+		__entry->aggr_grp_load, __entry->freq_aggr,
+		__entry->tt_load, __entry->rq_ps, __entry->grp_rq_ps,
+		__entry->nt_ps, __entry->grp_nt_ps, __entry->pl, __entry->load,
+		__entry->big_task_rotation, __entry->user_hint)
+);
+
+TRACE_EVENT(core_ctl_eval_need,
+
+	TP_PROTO(unsigned int cpu, unsigned int old_need,
+		unsigned int new_need, unsigned int updated),
+	TP_ARGS(cpu, old_need, new_need, updated),
+	TP_STRUCT__entry(
+		__field(u32, cpu)
+		__field(u32, old_need)
+		__field(u32, new_need)
+		__field(u32, updated)
+	),
+	TP_fast_assign(
+		__entry->cpu		= cpu;
+		__entry->old_need	= old_need;
+		__entry->new_need	= new_need;
+		__entry->updated	= updated;
+	),
+	TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu,
+			__entry->old_need, __entry->new_need, __entry->updated)
+);
+
+TRACE_EVENT(core_ctl_set_busy,
+
+	TP_PROTO(unsigned int cpu, unsigned int busy,
+		unsigned int old_is_busy, unsigned int is_busy),
+	TP_ARGS(cpu, busy, old_is_busy, is_busy),
+	TP_STRUCT__entry(
+		__field(u32, cpu)
+		__field(u32, busy)
+		__field(u32, old_is_busy)
+		__field(u32, is_busy)
+		__field(bool, high_irqload)
+	),
+	TP_fast_assign(
+		__entry->cpu		= cpu;
+		__entry->busy		= busy;
+		__entry->old_is_busy	= old_is_busy;
+		__entry->is_busy	= is_busy;
+		__entry->high_irqload	= sched_cpu_high_irqload(cpu);
+	),
+	TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u high_irqload=%d",
+		__entry->cpu, __entry->busy, __entry->old_is_busy,
+		__entry->is_busy, __entry->high_irqload)
+);
+
+TRACE_EVENT(core_ctl_set_boost,
+
+	TP_PROTO(u32 refcount, s32 ret),
+	TP_ARGS(refcount, ret),
+	TP_STRUCT__entry(
+		__field(u32, refcount)
+		__field(s32, ret)
+	),
+	TP_fast_assign(
+		__entry->refcount	= refcount;
+		__entry->ret		= ret;
+	),
+	TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret)
+);
+
+TRACE_EVENT(core_ctl_update_nr_need,
+
+	TP_PROTO(int cpu, int nr_need, int prev_misfit_need,
+		int nrrun, int max_nr, int nr_prev_assist),
+
+	TP_ARGS(cpu, nr_need, prev_misfit_need, nrrun, max_nr, nr_prev_assist),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(int, nr_need)
+		__field(int, prev_misfit_need)
+		__field(int, nrrun)
+		__field(int, max_nr)
+		__field(int, nr_prev_assist)
+	),
+
+	TP_fast_assign(
+		__entry->cpu			= cpu;
+		__entry->nr_need		= nr_need;
+		__entry->prev_misfit_need	= prev_misfit_need;
+		__entry->nrrun			= nrrun;
+		__entry->max_nr			= max_nr;
+		__entry->nr_prev_assist		= nr_prev_assist;
+	),
+
+	TP_printk("cpu=%d nr_need=%d prev_misfit_need=%d nrrun=%d max_nr=%d nr_prev_assist=%d",
+		__entry->cpu, __entry->nr_need, __entry->prev_misfit_need,
+		__entry->nrrun, __entry->max_nr, __entry->nr_prev_assist)
+);
+
+TRACE_EVENT(core_ctl_notif_data,
+
+	TP_PROTO(u32 nr_big, u32 ta_load, u32 *ta_util, u32 *cur_cap),
+
+	TP_ARGS(nr_big, ta_load, ta_util, cur_cap),
+
+	TP_STRUCT__entry(
+		__field(u32, nr_big)
+		__field(u32, ta_load)
+		__array(u32, ta_util, MAX_CLUSTERS)
+		__array(u32, cur_cap, MAX_CLUSTERS)
+	),
+
+	TP_fast_assign(
+		__entry->nr_big		= nr_big;
+		__entry->ta_load	= ta_load;
+		memcpy(__entry->ta_util, ta_util, MAX_CLUSTERS * sizeof(u32));
+		memcpy(__entry->cur_cap, cur_cap, MAX_CLUSTERS * sizeof(u32));
+	),
+
+	TP_printk("nr_big=%u ta_load=%u ta_util=(%u %u %u) cur_cap=(%u %u %u)",
+		  __entry->nr_big, __entry->ta_load,
+		  __entry->ta_util[0], __entry->ta_util[1],
+		  __entry->ta_util[2], __entry->cur_cap[0],
+		  __entry->cur_cap[1], __entry->cur_cap[2])
+);
+
+/*
+ * Tracepoint for sched_get_nr_running_avg
+ */
+TRACE_EVENT(sched_get_nr_running_avg,
+
+	TP_PROTO(int cpu, int nr, int nr_misfit, int nr_max, int nr_scaled),
+
+	TP_ARGS(cpu, nr, nr_misfit, nr_max, nr_scaled),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(int, nr)
+		__field(int, nr_misfit)
+		__field(int, nr_max)
+		__field(int, nr_scaled)
+	),
+
+	TP_fast_assign(
+		__entry->cpu		= cpu;
+		__entry->nr		= nr;
+		__entry->nr_misfit	= nr_misfit;
+		__entry->nr_max		= nr_max;
+		__entry->nr_scaled	= nr_scaled;
+	),
+
+	TP_printk("cpu=%d nr=%d nr_misfit=%d nr_max=%d nr_scaled=%d",
+		__entry->cpu, __entry->nr, __entry->nr_misfit, __entry->nr_max,
+		__entry->nr_scaled)
+);
+
+/*
+ * sched_pause - called when cores are paused/unpaused
+ *
+ * @start: 1 if start of pause/resume op, 0 otherwise
+ * @requested_cpus: mask of cpus requested in this op
+ * @active_cpus: mask of currently active cpus
+ * @start_time: time of the start of the operation
+ * @pause: 1 if pausing, 0 if resuming
+ */
+TRACE_EVENT(sched_pause,
+
+	TP_PROTO(unsigned int start, unsigned int requested_cpus, unsigned int active_cpus,
+		     u64 start_time, unsigned char pause),
+
+	TP_ARGS(start, requested_cpus, active_cpus, start_time, pause),
+
+	TP_STRUCT__entry(
+		    __field(u32, start)
+		    __field(u32, requested_cpus)
+		    __field(u32, active_cpus)
+		    __field(u32, time)
+		    __field(unsigned char, pause)
+		    ),
+
+	TP_fast_assign(
+		    __entry->start		= start;
+		    __entry->requested_cpus	= requested_cpus;
+		    __entry->active_cpus	= active_cpus;
+		    __entry->time		= div64_u64(sched_clock() - start_time, 1000);
+		    __entry->pause		= pause;
+		    ),
+
+	TP_printk("start=%d req cpus=0x%x act cpus=0x%x time=%u us paused=%d",
+		      __entry->start, __entry->requested_cpus, __entry->active_cpus,
+		      __entry->time, __entry->pause)
+);
+
+TRACE_EVENT(sched_ravg_window_change,
+
+	TP_PROTO(unsigned int sched_ravg_window, unsigned int new_sched_ravg_window
+		, u64 change_time),
+
+	TP_ARGS(sched_ravg_window, new_sched_ravg_window, change_time),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, sched_ravg_window)
+		__field(unsigned int, new_sched_ravg_window)
+		__field(u64, change_time)
+	),
+
+	TP_fast_assign(
+		__entry->sched_ravg_window	= sched_ravg_window;
+		__entry->new_sched_ravg_window	= new_sched_ravg_window;
+		__entry->change_time		= change_time;
+	),
+
+	TP_printk("from=%u to=%u at=%lu",
+		__entry->sched_ravg_window, __entry->new_sched_ravg_window,
+		__entry->change_time)
+);
+
+TRACE_EVENT(waltgov_util_update,
+	    TP_PROTO(int cpu,
+		     unsigned long util, unsigned long avg_cap,
+		     unsigned long max_cap, unsigned long nl, unsigned long pl,
+		     unsigned int rtgb, unsigned int flags),
+	    TP_ARGS(cpu, util, avg_cap, max_cap, nl, pl, rtgb, flags),
+	    TP_STRUCT__entry(
+		    __field(int, cpu)
+		    __field(unsigned long, util)
+		    __field(unsigned long, avg_cap)
+		    __field(unsigned long, max_cap)
+		    __field(unsigned long, nl)
+		    __field(unsigned long, pl)
+		    __field(unsigned int, rtgb)
+		    __field(unsigned int, flags)
+	    ),
+	    TP_fast_assign(
+		    __entry->cpu	= cpu;
+		    __entry->util	= util;
+		    __entry->avg_cap	= avg_cap;
+		    __entry->max_cap	= max_cap;
+		    __entry->nl		= nl;
+		    __entry->pl		= pl;
+		    __entry->rtgb	= rtgb;
+		    __entry->flags	= flags;
+	    ),
+	    TP_printk("cpu=%d util=%lu avg_cap=%lu max_cap=%lu nl=%lu pl=%lu rtgb=%u flags=0x%x",
+		      __entry->cpu, __entry->util, __entry->avg_cap,
+		      __entry->max_cap, __entry->nl,
+		      __entry->pl, __entry->rtgb, __entry->flags)
+);
+
+TRACE_EVENT(waltgov_next_freq,
+	    TP_PROTO(unsigned int cpu, unsigned long util, unsigned long max,
+		     unsigned int freq),
+	    TP_ARGS(cpu, util, max, freq),
+	    TP_STRUCT__entry(
+		    __field(unsigned int, cpu)
+		    __field(unsigned long, util)
+		    __field(unsigned long, max)
+		    __field(unsigned int, freq)
+	    ),
+	    TP_fast_assign(
+		    __entry->cpu	= cpu;
+		    __entry->util	= util;
+		    __entry->max	= max;
+		    __entry->freq	= freq;
+	    ),
+	    TP_printk("cpu=%u util=%lu max=%lu freq=%u",
+		      __entry->cpu,
+		      __entry->util,
+		      __entry->max,
+		      __entry->freq)
+);
+
+TRACE_EVENT(walt_active_load_balance,
+
+	TP_PROTO(struct task_struct *p, int prev_cpu, int new_cpu, struct walt_task_struct *wts),
+
+	TP_ARGS(p, prev_cpu, new_cpu, wts),
+
+	TP_STRUCT__entry(
+		__field(pid_t, pid)
+		__field(bool, misfit)
+		__field(int, prev_cpu)
+		__field(int, new_cpu)
+	),
+
+	TP_fast_assign(
+		__entry->pid		= p->pid;
+		__entry->misfit		= wts->misfit;
+		__entry->prev_cpu	= prev_cpu;
+		__entry->new_cpu	= new_cpu;
+	),
+
+	TP_printk("pid=%d misfit=%d prev_cpu=%d new_cpu=%d\n",
+			__entry->pid, __entry->misfit, __entry->prev_cpu,
+			__entry->new_cpu)
+);
+
+TRACE_EVENT(walt_find_busiest_queue,
+
+	TP_PROTO(int dst_cpu, int busiest_cpu, unsigned long src_mask),
+
+	TP_ARGS(dst_cpu, busiest_cpu, src_mask),
+
+	TP_STRUCT__entry(
+		__field(int, dst_cpu)
+		__field(int, busiest_cpu)
+		__field(unsigned long, src_mask)
+	),
+
+	TP_fast_assign(
+		__entry->dst_cpu	= dst_cpu;
+		__entry->busiest_cpu	= busiest_cpu;
+		__entry->src_mask	= src_mask;
+	),
+
+	TP_printk("dst_cpu=%d busiest_cpu=%d src_mask=%lx\n",
+			__entry->dst_cpu, __entry->busiest_cpu,
+			__entry->src_mask)
+);
+
+TRACE_EVENT(walt_nohz_balance_kick,
+
+	TP_PROTO(struct rq *rq),
+
+	TP_ARGS(rq),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned int, nr_running)
+		__field(unsigned int, nr_cfs_running)
+	),
+
+	TP_fast_assign(
+		__entry->cpu		= rq->cpu;
+		__entry->nr_running	= rq->nr_running;
+		__entry->nr_cfs_running	= rq->cfs.h_nr_running;
+	),
+
+	TP_printk("cpu=%d nr_running=%u nr_cfs_running=%u\n",
+			__entry->cpu, __entry->nr_running,
+			__entry->nr_cfs_running)
+);
+
+TRACE_EVENT(walt_newidle_balance,
+
+	TP_PROTO(int this_cpu, int busy_cpu, int pulled),
+
+	TP_ARGS(this_cpu, busy_cpu, pulled),
+
+	TP_STRUCT__entry(
+		__field(int, this_cpu)
+		__field(int, busy_cpu)
+		__field(int, pulled)
+		__field(unsigned int, this_nr_running)
+	),
+
+	TP_fast_assign(
+		__entry->this_cpu		= this_cpu;
+		__entry->busy_cpu		= busy_cpu;
+		__entry->pulled			= pulled;
+		__entry->this_nr_running	= cpu_rq(this_cpu)->nr_running;
+	),
+
+	TP_printk("this_cpu=%d busy_cpu=%d pulled=%d this_nr_running=%u\n",
+			__entry->this_cpu, __entry->busy_cpu, __entry->pulled,
+			__entry->this_nr_running)
+);
+
+TRACE_EVENT(walt_lb_cpu_util,
+
+	TP_PROTO(int cpu, struct walt_rq *wrq),
+
+	TP_ARGS(cpu, wrq),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned int, nr_running)
+		__field(unsigned int, cfs_nr_running)
+		__field(unsigned int, nr_big)
+		__field(unsigned int, nr_rtg_high_prio_tasks)
+		__field(unsigned int, cpu_util)
+		__field(unsigned int, capacity_orig)
+	),
+
+	TP_fast_assign(
+		__entry->cpu			= cpu;
+		__entry->nr_running		= cpu_rq(cpu)->nr_running;
+		__entry->cfs_nr_running		= cpu_rq(cpu)->cfs.h_nr_running;
+		__entry->nr_big			= wrq->walt_stats.nr_big_tasks;
+		__entry->nr_rtg_high_prio_tasks	= walt_nr_rtg_high_prio(cpu);
+		__entry->cpu_util		= cpu_util(cpu);
+		__entry->capacity_orig		= capacity_orig_of(cpu);
+	),
+
+	TP_printk("cpu=%d nr_running=%u cfs_nr_running=%u nr_big=%u nr_rtg_hp=%u cpu_util=%u capacity_orig=%u",
+		__entry->cpu, __entry->nr_running, __entry->cfs_nr_running,
+		__entry->nr_big, __entry->nr_rtg_high_prio_tasks,
+		__entry->cpu_util, __entry->capacity_orig)
+);
+
+TRACE_EVENT(sched_cpu_util,
+
+	TP_PROTO(int cpu),
+
+	TP_ARGS(cpu),
+
+	TP_STRUCT__entry(
+		__field(unsigned int,	cpu)
+		__field(unsigned int,	nr_running)
+		__field(long,		cpu_util)
+		__field(long,		cpu_util_cum)
+		__field(unsigned int,	capacity_curr)
+		__field(unsigned int,	capacity)
+		__field(unsigned int,	capacity_orig)
+		__field(unsigned int,	idle_exit_latency)
+		__field(u64,		irqload)
+		__field(int,		online)
+		__field(int,		inactive)
+		__field(int,		reserved)
+		__field(int,		high_irq_load)
+		__field(unsigned int,	nr_rtg_high_prio_tasks)
+	),
+
+	TP_fast_assign(
+		__entry->cpu		= cpu;
+		__entry->nr_running	= cpu_rq(cpu)->nr_running;
+		__entry->cpu_util	= cpu_util(cpu);
+		__entry->cpu_util_cum	= cpu_util_cum(cpu, 0);
+		__entry->capacity_curr	= capacity_curr_of(cpu);
+		__entry->capacity	= capacity_of(cpu);
+		__entry->capacity_orig	= capacity_orig_of(cpu);
+		__entry->idle_exit_latency	= walt_get_idle_exit_latency(cpu_rq(cpu));
+		__entry->irqload		= sched_irqload(cpu);
+		__entry->online			= cpu_online(cpu);
+		__entry->inactive		= !cpu_active(cpu);
+		__entry->reserved		= is_reserved(cpu);
+		__entry->high_irq_load		= sched_cpu_high_irqload(cpu);
+		__entry->nr_rtg_high_prio_tasks	= walt_nr_rtg_high_prio(cpu);
+	),
+
+	TP_printk("cpu=%d nr_running=%d cpu_util=%ld cpu_util_cum=%ld capacity_curr=%u capacity=%u capacity_orig=%u idle_exit_latency=%u irqload=%llu online=%u, inactive=%u, reserved=%u, high_irq_load=%u nr_rtg_hp=%u",
+		__entry->cpu, __entry->nr_running, __entry->cpu_util,
+		__entry->cpu_util_cum, __entry->capacity_curr,
+		__entry->capacity, __entry->capacity_orig,
+		__entry->idle_exit_latency, __entry->irqload, __entry->online,
+		__entry->inactive, __entry->reserved, __entry->high_irq_load,
+		__entry->nr_rtg_high_prio_tasks)
+);
+
+TRACE_EVENT(sched_compute_energy,
+
+	TP_PROTO(struct task_struct *p, int eval_cpu,
+		unsigned long eval_energy,
+		unsigned long prev_energy,
+		unsigned long best_energy,
+		unsigned long best_energy_cpu),
+
+	TP_ARGS(p, eval_cpu, eval_energy, prev_energy, best_energy,
+		best_energy_cpu),
+
+	TP_STRUCT__entry(
+		__field(int,		pid)
+		__array(char,		comm, TASK_COMM_LEN)
+		__field(unsigned long,	util)
+		__field(int,		prev_cpu)
+		__field(unsigned long,	prev_energy)
+		__field(int,		eval_cpu)
+		__field(unsigned long,	eval_energy)
+		__field(int,		best_energy_cpu)
+		__field(unsigned long,	best_energy)
+	),
+
+	TP_fast_assign(
+		__entry->pid			= p->pid;
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->util			= task_util(p);
+		__entry->prev_cpu		= task_cpu(p);
+		__entry->prev_energy		= prev_energy;
+		__entry->eval_cpu		= eval_cpu;
+		__entry->eval_energy		= eval_energy;
+		__entry->best_energy_cpu	= best_energy_cpu;
+		__entry->best_energy		= best_energy;
+	),
+
+	TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d prev_energy=%lu eval_cpu=%d eval_energy=%lu best_energy_cpu=%d best_energy=%lu",
+		__entry->pid, __entry->comm, __entry->util, __entry->prev_cpu,
+		__entry->prev_energy, __entry->eval_cpu, __entry->eval_energy,
+		__entry->best_energy_cpu, __entry->best_energy)
+);
+
+TRACE_EVENT(sched_task_util,
+
+	TP_PROTO(struct task_struct *p, unsigned long candidates,
+		int best_energy_cpu, bool sync, int need_idle, int fastpath,
+		bool placement_boost, u64 start_t,
+		bool uclamp_boosted, bool is_rtg, bool rtg_skip_min,
+		int start_cpu),
+
+	TP_ARGS(p, candidates, best_energy_cpu, sync, need_idle, fastpath,
+		placement_boost, start_t, uclamp_boosted, is_rtg, rtg_skip_min,
+		start_cpu),
+
+	TP_STRUCT__entry(
+		__field(int,		pid)
+		__array(char,		comm, TASK_COMM_LEN)
+		__field(unsigned long,	util)
+		__field(unsigned long,	candidates)
+		__field(int,		prev_cpu)
+		__field(int,		best_energy_cpu)
+		__field(bool,		sync)
+		__field(int,		need_idle)
+		__field(int,		fastpath)
+		__field(int,		placement_boost)
+		__field(int,		rtg_cpu)
+		__field(u64,		latency)
+		__field(bool,		uclamp_boosted)
+		__field(bool,		is_rtg)
+		__field(bool,		rtg_skip_min)
+		__field(int,		start_cpu)
+		__field(u32,		unfilter)
+		__field(unsigned long,	cpus_allowed)
+		__field(int,		task_boost)
+		__field(bool,		low_latency)
+	),
+
+	TP_fast_assign(
+		__entry->pid			= p->pid;
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->util			= task_util(p);
+		__entry->prev_cpu		= task_cpu(p);
+		__entry->candidates		= candidates;
+		__entry->best_energy_cpu	= best_energy_cpu;
+		__entry->sync			= sync;
+		__entry->need_idle		= need_idle;
+		__entry->fastpath		= fastpath;
+		__entry->placement_boost	= placement_boost;
+		__entry->latency		= (sched_clock() - start_t);
+		__entry->uclamp_boosted		= uclamp_boosted;
+		__entry->is_rtg			= is_rtg;
+		__entry->rtg_skip_min		= rtg_skip_min;
+		__entry->start_cpu		= start_cpu;
+		__entry->unfilter		=
+			((struct walt_task_struct *) p->android_vendor_data1)->unfilter;
+		__entry->cpus_allowed		= cpumask_bits(&p->cpus_mask)[0];
+		__entry->task_boost		= per_task_boost(p);
+		__entry->low_latency		= walt_low_latency_task(p);
+	),
+
+	TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d candidates=%#lx best_energy_cpu=%d sync=%d need_idle=%d fastpath=%d placement_boost=%d latency=%llu stune_boosted=%d is_rtg=%d rtg_skip_min=%d start_cpu=%d unfilter=%u affinity=%lx task_boost=%d low_latency=%d",
+		__entry->pid, __entry->comm, __entry->util, __entry->prev_cpu,
+		__entry->candidates, __entry->best_energy_cpu, __entry->sync,
+		__entry->need_idle, __entry->fastpath, __entry->placement_boost,
+		__entry->latency, __entry->uclamp_boosted,
+		__entry->is_rtg, __entry->rtg_skip_min, __entry->start_cpu,
+		__entry->unfilter, __entry->cpus_allowed, __entry->task_boost,
+		__entry->low_latency)
+);
+
+/*
+ * Tracepoint for find_best_target
+ */
+TRACE_EVENT(sched_find_best_target,
+
+	TP_PROTO(struct task_struct *tsk,
+		 unsigned long min_util, int start_cpu,
+		 int best_idle, int most_spare_cap, int target,
+		 int order_index, int end_index,
+		 int skip, bool running),
+
+	TP_ARGS(tsk, min_util, start_cpu,
+		best_idle, most_spare_cap, target,
+		order_index, end_index, skip, running),
+
+	TP_STRUCT__entry(
+		__array(char,		comm, TASK_COMM_LEN)
+		__field(pid_t,		pid)
+		__field(unsigned long,	min_util)
+		__field(int,		start_cpu)
+		__field(int,		best_idle)
+		__field(int,		most_spare_cap)
+		__field(int,		target)
+		__field(int,		order_index)
+		__field(int,		end_index)
+		__field(int,		skip)
+		__field(bool,		running)
+		),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		= tsk->pid;
+		__entry->min_util	= min_util;
+		__entry->start_cpu	= start_cpu;
+		__entry->best_idle	= best_idle;
+		__entry->most_spare_cap	= most_spare_cap;
+		__entry->target		= target;
+		__entry->order_index	= order_index;
+		__entry->end_index	= end_index;
+		__entry->skip		= skip;
+		__entry->running	= running;
+		),
+
+	TP_printk("pid=%d comm=%s start_cpu=%d best_idle=%d most_spare_cap=%d target=%d order_index=%d end_index=%d skip=%d running=%d",
+		  __entry->pid, __entry->comm,
+		  __entry->start_cpu,
+		  __entry->best_idle,
+		  __entry->most_spare_cap,
+		  __entry->target,
+		  __entry->order_index,
+		  __entry->end_index,
+		  __entry->skip,
+		  __entry->running)
+);
+
+TRACE_EVENT(sched_enq_deq_task,
+
+	TP_PROTO(struct task_struct *p, bool enqueue,
+				unsigned int cpus_allowed),
+
+	TP_ARGS(p, enqueue, cpus_allowed),
+
+	TP_STRUCT__entry(
+		__array(char,		comm, TASK_COMM_LEN)
+		__field(pid_t,		pid)
+		__field(int,		prio)
+		__field(int,		cpu)
+		__field(bool,		enqueue)
+		__field(unsigned int,	nr_running)
+		__field(unsigned int,	rt_nr_running)
+		__field(unsigned int,	cpus_allowed)
+		__field(unsigned int,	demand)
+		__field(unsigned int,	pred_demand)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->cpu		= task_cpu(p);
+		__entry->enqueue	= enqueue;
+		__entry->nr_running	= task_rq(p)->nr_running;
+		__entry->rt_nr_running	= task_rq(p)->rt.rt_nr_running;
+		__entry->cpus_allowed	= cpus_allowed;
+		__entry->demand		= task_load(p);
+		__entry->pred_demand	= task_pl(p);
+	),
+
+	TP_printk("cpu=%d %s comm=%s pid=%d prio=%d nr_running=%u rt_nr_running=%u affine=%x demand=%u pred_demand=%u",
+			__entry->cpu,
+			__entry->enqueue ? "enqueue" : "dequeue",
+			__entry->comm, __entry->pid,
+			__entry->prio, __entry->nr_running,
+			__entry->rt_nr_running,
+			__entry->cpus_allowed, __entry->demand,
+			__entry->pred_demand)
+);
+#endif /* _TRACE_WALT_H */
+
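+/*
+ * TRACE_INCLUDE_PATH is resolved relative to include/trace/, so the
+ * ../../kernel/sched/walt path below lets <trace/define_trace.h> re-include
+ * this header from the module directory when trace.c defines
+ * CREATE_TRACE_POINTS.
+ */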
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../kernel/sched/walt
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>

+ 4136 - 0
kernel/sched/walt/walt.c

@@ -0,0 +1,4136 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/syscore_ops.h>
+#include <linux/cpufreq.h>
+#include <linux/list_sort.h>
+#include <linux/jiffies.h>
+#include <linux/sched/stat.h>
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include <linux/ktime.h>
+#include <linux/qcom-cpufreq-hw.h>
+#include <linux/cpumask.h>
+
+#include <trace/hooks/sched.h>
+#include <trace/hooks/cpufreq.h>
+
+#include "walt.h"
+#include "trace.h"
+
+const char *task_event_names[] = {
+	"PUT_PREV_TASK",
+	"PICK_NEXT_TASK",
+	"TASK_WAKE",
+	"TASK_MIGRATE",
+	"TASK_UPDATE",
+	"IRQ_UPDATE"
+};
+
+const char *migrate_type_names[] = {
+	"GROUP_TO_RQ",
+	"RQ_TO_GROUP",
+	"RQ_TO_RQ",
+	"GROUP_TO_GROUP"
+};
+
+#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
+#define SCHED_ACCOUNT_WAIT_TIME 1
+
+#define EARLY_DETECTION_DURATION 9500000
+#define MAX_NUM_CGROUP_COLOC_ID 20
+
+#define MAX_NR_CLUSTERS			3
+
+#define FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK	0
+#define FREQ_REPORT_CPU_LOAD			1
+#define FREQ_REPORT_TOP_TASK			2
+
+#define NEW_TASK_ACTIVE_TIME 100000000
+
+unsigned int sysctl_sched_user_hint;
+
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
+
+static bool use_cycle_counter;
+static DEFINE_MUTEX(cluster_lock);
+static u64 walt_load_reported_window;
+
+static struct irq_work walt_cpufreq_irq_work;
+struct irq_work walt_migration_irq_work;
+unsigned int walt_rotation_enabled;
+cpumask_t asym_cap_sibling_cpus = CPU_MASK_NONE;
+unsigned int sched_boost_type;
+enum sched_boost_policy boost_policy;
+
+unsigned int __read_mostly sched_ravg_window = 20000000;
+unsigned int min_max_possible_capacity = 1024;
+unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
+/* Initial task load. Newly created tasks are assigned this load. */
+unsigned int __read_mostly sched_init_task_load_windows;
+/*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is DEFAULT_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+unsigned int __read_mostly sched_load_granule;
+__read_mostly bool sched_predl = true;
+
+/*
+ * @boost: boost level; 0 clears the boost, valid values are below TASK_BOOST_END.
+ * @period: boost duration, in milliseconds.
+ */
+int set_task_boost(int boost, u64 period)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1;
+
+	if (boost < TASK_BOOST_NONE || boost >= TASK_BOOST_END)
+		return -EINVAL;
+	if (boost) {
+		wts->boost = boost;
+		wts->boost_period = (u64)period * 1000 * 1000;
+		wts->boost_expires = sched_clock() + wts->boost_period;
+	} else {
+		wts->boost = 0;
+		wts->boost_expires = 0;
+		wts->boost_period = 0;
+	}
+	return 0;
+}
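+
+/*
+ * A minimal usage sketch (boost value and duration are illustrative):
+ *
+ *	set_task_boost(1, 100);	// boost "current" for the next 100 ms
+ *	...
+ *	set_task_boost(0, 0);	// clear the boost, expiry and period
+ *
+ * Note that the boost always applies to the calling task and that the
+ * period argument is ignored when clearing.
+ */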
+
+u64 sched_ktime_clock(void)
+{
+	if (unlikely(sched_ktime_suspended))
+		return ktime_to_ns(ktime_last);
+	return ktime_get_ns();
+}
+
+static void sched_resume(void)
+{
+	sched_ktime_suspended = false;
+}
+
+static int sched_suspend(void)
+{
+	ktime_last = ktime_get();
+	sched_ktime_suspended = true;
+	return 0;
+}
+
+static struct syscore_ops sched_syscore_ops = {
+	.resume		= sched_resume,
+	.suspend	= sched_suspend
+};
+
+int sched_init_ops(void)
+{
+	register_syscore_ops(&sched_syscore_ops);
+	return 0;
+}
+
+void acquire_rq_locks_irqsave(const cpumask_t *cpus,
+				     unsigned long *flags)
+{
+	int cpu;
+	int level = 0;
+
+	local_irq_save(*flags);
+
+	for_each_cpu(cpu, cpus) {
+		if (level == 0)
+			raw_spin_lock(&cpu_rq(cpu)->lock);
+		else
+			raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
+		level++;
+	}
+}
+
+void release_rq_locks_irqrestore(const cpumask_t *cpus,
+					unsigned long *flags)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpus)
+		raw_spin_unlock(&cpu_rq(cpu)->lock);
+	local_irq_restore(*flags);
+}
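+
+/*
+ * The two helpers above are intended to be used as a pair around code that
+ * needs a consistent snapshot across several runqueues, e.g. (sketch):
+ *
+ *	unsigned long flags;
+ *
+ *	acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
+ *	// ... walk or update per-cpu WALT state atomically ...
+ *	release_rq_locks_irqrestore(cpu_possible_mask, &flags);
+ *
+ * The nesting level follows the cpumask iteration order, so both calls must
+ * be passed the same mask.
+ */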
+
+static unsigned int walt_cpu_high_irqload;
+
+__read_mostly unsigned int sched_ravg_hist_size = 5;
+
+static __read_mostly unsigned int sched_io_is_busy = 1;
+
+/* Window size (in ns) */
+__read_mostly unsigned int new_sched_ravg_window = DEFAULT_SCHED_RAVG_WINDOW;
+
+static DEFINE_SPINLOCK(sched_ravg_window_lock);
+u64 sched_ravg_window_change_time;
+
+/*
+ * An after-boot constant divisor used by cpu_util_freq_walt() to apply the
+ * load boost.
+ */
+static __read_mostly unsigned int walt_cpu_util_freq_divisor;
+
+unsigned int __read_mostly sched_init_task_load_windows_scaled;
+unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
+
+/* Size of bitmaps maintained to track top tasks */
+static const unsigned int top_tasks_bitmap_size =
+		BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
+
+/*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
+
+__read_mostly unsigned int walt_scale_demand_divisor;
+#define scale_demand(d) ((d)/walt_scale_demand_divisor)
+
+#define SCHED_PRINT(arg)	pr_emerg("%s=%llu", #arg, arg)
+#define STRG(arg)		#arg
+
+static inline void walt_task_dump(struct task_struct *p)
+{
+	char buff[WALT_NR_CPUS * 16];
+	int i, j = 0;
+	int buffsz = WALT_NR_CPUS * 16;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	SCHED_PRINT(p->pid);
+	SCHED_PRINT(wts->mark_start);
+	SCHED_PRINT(wts->demand);
+	SCHED_PRINT(wts->coloc_demand);
+	SCHED_PRINT(sched_ravg_window);
+	SCHED_PRINT(new_sched_ravg_window);
+
+	for (i = 0 ; i < nr_cpu_ids; i++)
+		j += scnprintf(buff + j, buffsz - j, "%u ",
+				wts->curr_window_cpu[i]);
+	printk_deferred("%s=%d (%s)\n", STRG(wts->curr_window),
+			wts->curr_window, buff);
+
+	for (i = 0, j = 0 ; i < nr_cpu_ids; i++)
+		j += scnprintf(buff + j, buffsz - j, "%u ",
+				wts->prev_window_cpu[i]);
+	printk_deferred("%s=%d (%s)\n", STRG(wts->prev_window),
+			wts->prev_window, buff);
+
+	SCHED_PRINT(wts->last_wake_ts);
+	SCHED_PRINT(wts->last_enqueued_ts);
+	SCHED_PRINT(wts->misfit);
+	SCHED_PRINT(wts->unfilter);
+}
+
+static inline void walt_rq_dump(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *tsk = cpu_curr(cpu);
+	int i;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	/*
+	 * Increment the task reference so that it can't be
+	 * freed on a remote CPU. Since we are going to
+	 * enter panic, there is no need to decrement the
+	 * task reference. Decrementing the task reference
+	 * can't be done in atomic context, especially with
+	 * rq locks held.
+	 */
+	get_task_struct(tsk);
+	pr_emerg("CPU:%d nr_running:%u current: %d (%s)\n",
+			cpu, rq->nr_running, tsk->pid, tsk->comm);
+
+	printk_deferred("==========================================");
+	SCHED_PRINT(wrq->window_start);
+	SCHED_PRINT(wrq->prev_window_size);
+	SCHED_PRINT(wrq->curr_runnable_sum);
+	SCHED_PRINT(wrq->prev_runnable_sum);
+	SCHED_PRINT(wrq->nt_curr_runnable_sum);
+	SCHED_PRINT(wrq->nt_prev_runnable_sum);
+	SCHED_PRINT(wrq->cum_window_demand_scaled);
+	SCHED_PRINT(wrq->task_exec_scale);
+	SCHED_PRINT(wrq->grp_time.curr_runnable_sum);
+	SCHED_PRINT(wrq->grp_time.prev_runnable_sum);
+	SCHED_PRINT(wrq->grp_time.nt_curr_runnable_sum);
+	SCHED_PRINT(wrq->grp_time.nt_prev_runnable_sum);
+	for (i = 0 ; i < NUM_TRACKED_WINDOWS; i++) {
+		printk_deferred("wrq->load_subs[%d].window_start=%llu)\n", i,
+				wrq->load_subs[i].window_start);
+		printk_deferred("wrq->load_subs[%d].subs=%llu)\n", i,
+				wrq->load_subs[i].subs);
+		printk_deferred("wrq->load_subs[%d].new_subs=%llu)\n", i,
+				wrq->load_subs[i].new_subs);
+	}
+	walt_task_dump(tsk);
+	SCHED_PRINT(sched_capacity_margin_up[cpu]);
+	SCHED_PRINT(sched_capacity_margin_down[cpu]);
+}
+
+static inline void walt_dump(void)
+{
+	int cpu;
+
+	pr_emerg("============ WALT RQ DUMP START ==============\n");
+	pr_emerg("Sched ktime_get: %llu\n", sched_ktime_clock());
+	pr_emerg("Time last window changed=%lu\n",
+			sched_ravg_window_change_time);
+	for_each_online_cpu(cpu)
+		walt_rq_dump(cpu);
+	SCHED_PRINT(max_possible_capacity);
+	SCHED_PRINT(min_max_possible_capacity);
+	pr_emerg("============ WALT RQ DUMP END ==============\n");
+}
+
+static int in_sched_bug;
+#define SCHED_BUG_ON(condition)				\
+({							\
+	if (unlikely(!!(condition)) && !in_sched_bug) {	\
+		in_sched_bug = 1;			\
+		walt_dump();				\
+		BUG_ON(condition);			\
+	}						\
+})
+
+static inline void
+fixup_cumulative_runnable_avg(struct walt_sched_stats *stats,
+			      s64 demand_scaled_delta,
+			      s64 pred_demand_scaled_delta)
+{
+	stats->cumulative_runnable_avg_scaled += demand_scaled_delta;
+	BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0);
+
+	stats->pred_demands_sum_scaled += pred_demand_scaled_delta;
+	BUG_ON((s64)stats->pred_demands_sum_scaled < 0);
+}
+
+static void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
+				   u16 updated_demand_scaled,
+				   u16 updated_pred_demand_scaled)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	s64 task_load_delta = (s64)updated_demand_scaled -
+			      wts->demand_scaled;
+	s64 pred_demand_delta = (s64)updated_pred_demand_scaled -
+				wts->pred_demand_scaled;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	fixup_cumulative_runnable_avg(&wrq->walt_stats, task_load_delta,
+				      pred_demand_delta);
+}
+
+/*
+ * Demand aggregation for frequency purposes:
+ *
+ * CPU demand of tasks from various related groups is aggregated per-cluster and
+ * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
+ * by just wrq->prev_runnable_sum.
+ *
+ * Some examples follow, which assume:
+ *	Cluster0 = CPU0-3, Cluster1 = CPU4-7
+ *	One related thread group A that has tasks A0, A1, A2
+ *
+ *	A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
+ *	tasks belonging to group A are accumulated when they run on cpu X.
+ *
+ *	CX->curr/prev_sum = counters in which cpu execution stats of all tasks
+ *	not belonging to group A are accumulated when they run on cpu X
+ *
+ * Let's say the stats for window M were as below:
+ *
+ *	C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
+ *		Task A0 ran 5ms on CPU0
+ *		Task B0 ran 1ms on CPU0
+ *
+ *	C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
+ *		Task A1 ran 4ms on CPU1
+ *		Task A2 ran 2ms on CPU1
+ *		Task B1 ran 5ms on CPU1
+ *
+ *	C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
+ *		CPU2 idle
+ *
+ *	C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
+ *		CPU3 idle
+ *
+ * In this case, CPU1 was the busiest going by just its prev_sum counter.
+ * Demand from all group A tasks is added to CPU1. IOW, at the end of window
+ * M, the cpu busy time reported to the governor will be:
+ *
+ *	C0 busy time = 1ms
+ *	C1 busy time = 5 + 5 + 6 = 16ms
+ *
+ */
+__read_mostly bool sched_freq_aggr_en;
+
+static u64
+update_window_start(struct rq *rq, u64 wallclock, int event)
+{
+	s64 delta;
+	int nr_windows;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 old_window_start = wrq->window_start;
+
+	delta = wallclock - wrq->window_start;
+	if (delta < 0) {
+		pr_emerg("WALT-BUG CPU%d; wallclock=%llu is lesser than window_start=%llu",
+			rq->cpu, wallclock, wrq->window_start);
+		SCHED_BUG_ON(1);
+	}
+	if (delta < sched_ravg_window)
+		return old_window_start;
+
+	nr_windows = div64_u64(delta, sched_ravg_window);
+	wrq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
+
+	wrq->cum_window_demand_scaled =
+			wrq->walt_stats.cumulative_runnable_avg_scaled;
+	wrq->prev_window_size = sched_ravg_window;
+
+	return old_window_start;
+}
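+
+/*
+ * Example of the rollover above with the default 20 ms window: if
+ * window_start = 100 ms and wallclock = 153 ms, then delta = 53 ms,
+ * nr_windows = 2 and window_start advances to 140 ms; the remaining 13 ms
+ * belong to the newly started window.
+ */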
+
+/*
+ * Assumes rq_lock is held and wallclock was recorded in the same critical
+ * section as this function's invocation.
+ */
+static inline u64 read_cycle_counter(int cpu, u64 wallclock)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	if (wrq->last_cc_update != wallclock) {
+		wrq->cycles = qcom_cpufreq_get_cpu_cycle_counter(cpu);
+		wrq->last_cc_update = wallclock;
+	}
+
+	return wrq->cycles;
+}
+
+static void update_task_cpu_cycles(struct task_struct *p, int cpu,
+				   u64 wallclock)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (use_cycle_counter)
+		wts->cpu_cycles = read_cycle_counter(cpu, wallclock);
+}
+
+static inline bool is_ed_enabled(void)
+{
+	return (walt_rotation_enabled || (sched_boost_policy() !=
+		SCHED_BOOST_NONE));
+}
+
+static inline bool is_ed_task(struct task_struct *p, u64 wallclock)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return (wallclock - wts->last_wake_ts >= EARLY_DETECTION_DURATION);
+}
+
+static bool is_ed_task_present(struct rq *rq, u64 wallclock)
+{
+	struct task_struct *p;
+	int loop_max = 10;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	wrq->ed_task = NULL;
+
+	if (!is_ed_enabled() || !rq->cfs.h_nr_running)
+		return false;
+
+	list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
+		if (!loop_max)
+			break;
+
+		if (is_ed_task(p, wallclock)) {
+			wrq->ed_task = p;
+			return true;
+		}
+
+		loop_max--;
+	}
+
+	return false;
+}
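+
+/*
+ * Example: with EARLY_DETECTION_DURATION = 9500000 ns, a cfs task woken at
+ * t = 0 that is still queued when this check runs at t = 10 ms is flagged
+ * as an early-detection task (10 ms >= 9.5 ms), provided rotation or a
+ * sched boost is active; only the first 10 queued cfs tasks are scanned.
+ */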
+
+static void walt_sched_account_irqstart(int cpu, struct task_struct *curr)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	if (!wrq->window_start)
+		return;
+
+	/* We're here without rq->lock held, IRQ disabled */
+	raw_spin_lock(&rq->lock);
+	update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
+	raw_spin_unlock(&rq->lock);
+}
+
+static void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+						u64 wallclock, u64 irqtime);
+static void walt_sched_account_irqend(int cpu, struct task_struct *curr, u64 delta)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	walt_update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), delta);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/*
+ * Return total number of tasks "eligible" to run on higher capacity cpus
+ */
+unsigned int walt_big_tasks(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return wrq->walt_stats.nr_big_tasks;
+}
+
+void clear_walt_request(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	clear_reserved(cpu);
+	if (wrq->push_task) {
+		struct task_struct *push_task = NULL;
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		if (wrq->push_task) {
+			clear_reserved(rq->push_cpu);
+			push_task = wrq->push_task;
+			wrq->push_task = NULL;
+		}
+		rq->active_balance = 0;
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+		if (push_task)
+			put_task_struct(push_task);
+	}
+}
+
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 top_task_load(struct rq *rq)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	int index = wrq->prev_top;
+	u8 prev = 1 - wrq->curr_table;
+
+	if (!index) {
+		int msb = NUM_LOAD_INDICES - 1;
+
+		if (!test_bit(msb, wrq->top_tasks_bitmap[prev]))
+			return 0;
+		else
+			return sched_load_granule;
+	} else if (index == NUM_LOAD_INDICES - 1) {
+		return sched_ravg_window;
+	} else {
+		return (index + 1) * sched_load_granule;
+	}
+}
+
+unsigned long sched_user_hint_reset_time;
+static bool is_cluster_hosting_top_app(struct walt_sched_cluster *cluster);
+
+static inline bool
+should_apply_suh_freq_boost(struct walt_sched_cluster *cluster)
+{
+	if (sched_freq_aggr_en || !sysctl_sched_user_hint ||
+				  !cluster->aggr_grp_load)
+		return false;
+
+	return is_cluster_hosting_top_app(cluster);
+}
+
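+/*
+ * Pick the load reported to the cpufreq governor: previous-window busy time
+ * (plus group or aggregated load), raised for a running ksoftirqd, combined
+ * with the top-task load per the reporting policy, and boosted for
+ * early-detection tasks or a user hint where applicable.
+ */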
+static inline u64 freq_policy_load(struct rq *rq)
+{
+	unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_sched_cluster *cluster = wrq->cluster;
+	u64 aggr_grp_load = cluster->aggr_grp_load;
+	u64 load, tt_load = 0;
+	struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu_of(rq));
+
+	if (wrq->ed_task != NULL) {
+		load = sched_ravg_window;
+		goto done;
+	}
+
+	if (sched_freq_aggr_en)
+		load = wrq->prev_runnable_sum + aggr_grp_load;
+	else
+		load = wrq->prev_runnable_sum +
+					wrq->grp_time.prev_runnable_sum;
+
+	if (cpu_ksoftirqd && cpu_ksoftirqd->state == TASK_RUNNING)
+		load = max_t(u64, load, task_load(cpu_ksoftirqd));
+
+	tt_load = top_task_load(rq);
+	switch (reporting_policy) {
+	case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
+		load = max_t(u64, load, tt_load);
+		break;
+	case FREQ_REPORT_TOP_TASK:
+		load = tt_load;
+		break;
+	case FREQ_REPORT_CPU_LOAD:
+		break;
+	default:
+		break;
+	}
+
+	if (should_apply_suh_freq_boost(cluster)) {
+		if (is_suh_max())
+			load = sched_ravg_window;
+		else
+			load = div64_u64(load * sysctl_sched_user_hint,
+					 (u64)100);
+	}
+
+done:
+	trace_sched_load_to_gov(rq, aggr_grp_load, tt_load, sched_freq_aggr_en,
+				load, reporting_policy, walt_rotation_enabled,
+				sysctl_sched_user_hint, wrq);
+	return load;
+}
+
+static bool rtgb_active;
+
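+/*
+ * Convert the CPU's frequency-policy load into a capacity-scaled utilization,
+ * applying any per-CPU load boost, and fill in @walt_load for the governor.
+ */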
+static inline unsigned long
+__cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load)
+{
+	u64 util, util_unboosted;
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long capacity = capacity_orig_of(cpu);
+	int boost;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	boost = sysctl_sched_load_boost[cpu];
+	util_unboosted = util = freq_policy_load(rq);
+	util = div64_u64(util * (100 + boost),
+			walt_cpu_util_freq_divisor);
+
+	if (walt_load) {
+		u64 nl = wrq->nt_prev_runnable_sum +
+				wrq->grp_time.nt_prev_runnable_sum;
+		u64 pl = wrq->walt_stats.pred_demands_sum_scaled;
+
+		/* do_pl_notif() needs unboosted signals */
+		wrq->old_busy_time = div64_u64(util_unboosted,
+						sched_ravg_window >>
+						SCHED_CAPACITY_SHIFT);
+		wrq->old_estimated_time = pl;
+
+		nl = div64_u64(nl * (100 + boost), walt_cpu_util_freq_divisor);
+
+		walt_load->nl = nl;
+		walt_load->pl = pl;
+		walt_load->ws = walt_load_reported_window;
+		walt_load->rtgb_active = rtgb_active;
+	}
+
+	return (util >= capacity) ? capacity : util;
+}
+
+#define ADJUSTED_ASYM_CAP_CPU_UTIL(orig, other, x)	\
+			(max(orig, mult_frac(other, x, 100)))
+
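+/*
+ * As __cpu_util_freq_walt(), but for asymmetric-capacity sibling CPUs the
+ * utilization is raised towards a percentage of the sibling's utilization so
+ * that their frequencies stay matched.
+ */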
+unsigned long
+cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load)
+{
+	struct walt_cpu_load wl_other = {0};
+	unsigned long util = 0, util_other = 0;
+	unsigned long capacity = capacity_orig_of(cpu);
+	int i, mpct = sysctl_sched_asym_cap_sibling_freq_match_pct;
+
+	if (!cpumask_test_cpu(cpu, &asym_cap_sibling_cpus))
+		return __cpu_util_freq_walt(cpu, walt_load);
+
+	for_each_cpu(i, &asym_cap_sibling_cpus) {
+		if (i == cpu)
+			util = __cpu_util_freq_walt(cpu, walt_load);
+		else
+			util_other = __cpu_util_freq_walt(i, &wl_other);
+	}
+
+	if (cpu == cpumask_last(&asym_cap_sibling_cpus))
+		mpct = 100;
+
+	util = ADJUSTED_ASYM_CAP_CPU_UTIL(util, util_other, mpct);
+
+	walt_load->nl = ADJUSTED_ASYM_CAP_CPU_UTIL(walt_load->nl, wl_other.nl,
+						   mpct);
+	walt_load->pl = ADJUSTED_ASYM_CAP_CPU_UTIL(walt_load->pl, wl_other.pl,
+						   mpct);
+
+	return (util >= capacity) ? capacity : util;
+}
+
+/*
+ * In this function we match the accumulated subtractions with the current
+ * and previous windows we are operating with. Ignore any entries where
+ * the window start in the load_subtraction struct does not match either
+ * the current or the previous window. This could happen whenever CPUs
+ * become idle or busy with interrupts disabled for an extended period.
+ */
+static inline void account_load_subtractions(struct rq *rq)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 ws = wrq->window_start;
+	u64 prev_ws = ws - wrq->prev_window_size;
+	struct load_subtractions *ls = wrq->load_subs;
+	int i;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		if (ls[i].window_start == ws) {
+			wrq->curr_runnable_sum -= ls[i].subs;
+			wrq->nt_curr_runnable_sum -= ls[i].new_subs;
+		} else if (ls[i].window_start == prev_ws) {
+			wrq->prev_runnable_sum -= ls[i].subs;
+			wrq->nt_prev_runnable_sum -= ls[i].new_subs;
+		}
+
+		ls[i].subs = 0;
+		ls[i].new_subs = 0;
+	}
+
+	SCHED_BUG_ON((s64)wrq->prev_runnable_sum < 0);
+	SCHED_BUG_ON((s64)wrq->curr_runnable_sum < 0);
+	SCHED_BUG_ON((s64)wrq->nt_prev_runnable_sum < 0);
+	SCHED_BUG_ON((s64)wrq->nt_curr_runnable_sum < 0);
+}
+
+static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	wrq->load_subs[index].window_start = ws;
+	wrq->load_subs[index].subs = 0;
+	wrq->load_subs[index].new_subs = 0;
+}
+
+static int get_top_index(unsigned long *bitmap, unsigned long old_top)
+{
+	int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
+
+	if (index == NUM_LOAD_INDICES)
+		return 0;
+
+	return NUM_LOAD_INDICES - 1 - index;
+}
+
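+/*
+ * Find the load_subs slot tracking window @ws, or recycle the slot with the
+ * oldest window start if none matches.
+ */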
+static int get_subtraction_index(struct rq *rq, u64 ws)
+{
+	int i;
+	u64 oldest = ULLONG_MAX;
+	int oldest_index = 0;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		u64 entry_ws = wrq->load_subs[i].window_start;
+
+		if (ws == entry_ws)
+			return i;
+
+		if (entry_ws < oldest) {
+			oldest = entry_ws;
+			oldest_index = i;
+		}
+	}
+
+	create_subtraction_entry(rq, ws, oldest_index);
+	return oldest_index;
+}
+
+static void update_rq_load_subtractions(int index, struct rq *rq,
+					u32 sub_load, bool new_task)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	wrq->load_subs[index].subs += sub_load;
+	if (new_task)
+		wrq->load_subs[index].new_subs += sub_load;
+}
+
+static inline struct walt_sched_cluster *cpu_cluster(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return wrq->cluster;
+}
+
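+/*
+ * Record the migrating task's per-CPU window contributions as pending
+ * subtractions on the other CPUs of the source cluster, to be applied the
+ * next time those runqueues are updated.
+ */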
+static void update_cluster_load_subtractions(struct task_struct *p,
+					int cpu, u64 ws, bool new_task)
+{
+	struct walt_sched_cluster *cluster = cpu_cluster(cpu);
+	struct cpumask cluster_cpus = cluster->cpus;
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+	u64 prev_ws = ws - wrq->prev_window_size;
+	int i;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	cpumask_clear_cpu(cpu, &cluster_cpus);
+	raw_spin_lock(&cluster->load_lock);
+
+	for_each_cpu(i, &cluster_cpus) {
+		struct rq *rq = cpu_rq(i);
+		int index;
+
+		if (wts->curr_window_cpu[i]) {
+			index = get_subtraction_index(rq, ws);
+			update_rq_load_subtractions(index, rq,
+				wts->curr_window_cpu[i], new_task);
+			wts->curr_window_cpu[i] = 0;
+		}
+
+		if (wts->prev_window_cpu[i]) {
+			index = get_subtraction_index(rq, prev_ws);
+			update_rq_load_subtractions(index, rq,
+				wts->prev_window_cpu[i], new_task);
+			wts->prev_window_cpu[i] = 0;
+		}
+	}
+
+	raw_spin_unlock(&cluster->load_lock);
+}
+
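+/*
+ * Move the task's current/previous window contributions from the source
+ * CPU's runnable sums to the destination CPU's when it migrates across
+ * frequency domains.
+ */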
+static inline void inter_cluster_migration_fixup
+	(struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
+{
+	struct rq *dest_rq = cpu_rq(new_cpu);
+	struct rq *src_rq = cpu_rq(task_cpu);
+	struct walt_rq *dest_wrq = (struct walt_rq *) dest_rq->android_vendor_data1;
+	struct walt_rq *src_wrq = (struct walt_rq *) src_rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (same_freq_domain(new_cpu, task_cpu))
+		return;
+
+	wts->curr_window_cpu[new_cpu] = wts->curr_window;
+	wts->prev_window_cpu[new_cpu] = wts->prev_window;
+
+	dest_wrq->curr_runnable_sum += wts->curr_window;
+	dest_wrq->prev_runnable_sum += wts->prev_window;
+
+	if (src_wrq->curr_runnable_sum < wts->curr_window_cpu[task_cpu]) {
+		printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_crs=%llu is lesser than task_contrib=%llu",
+				p->pid, src_rq->cpu, dest_rq->cpu,
+				src_wrq->curr_runnable_sum,
+				wts->curr_window_cpu[task_cpu]);
+		walt_task_dump(p);
+		SCHED_BUG_ON(1);
+	}
+	src_wrq->curr_runnable_sum -= wts->curr_window_cpu[task_cpu];
+
+	if (src_wrq->prev_runnable_sum < wts->prev_window_cpu[task_cpu]) {
+		printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_prs=%llu is lesser than task_contrib=%llu",
+				p->pid, src_rq->cpu, dest_rq->cpu,
+				src_wrq->prev_runnable_sum,
+				wts->prev_window_cpu[task_cpu]);
+		walt_task_dump(p);
+		SCHED_BUG_ON(1);
+	}
+	src_wrq->prev_runnable_sum -= wts->prev_window_cpu[task_cpu];
+
+	if (new_task) {
+		dest_wrq->nt_curr_runnable_sum += wts->curr_window;
+		dest_wrq->nt_prev_runnable_sum += wts->prev_window;
+
+		if (src_wrq->nt_curr_runnable_sum <
+				wts->curr_window_cpu[task_cpu]) {
+			printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_nt_crs=%llu is lesser than task_contrib=%llu",
+					p->pid, src_rq->cpu, dest_rq->cpu,
+					src_wrq->nt_curr_runnable_sum,
+					wts->curr_window_cpu[task_cpu]);
+			walt_task_dump(p);
+			SCHED_BUG_ON(1);
+		}
+		src_wrq->nt_curr_runnable_sum -=
+				wts->curr_window_cpu[task_cpu];
+
+		if (src_wrq->nt_prev_runnable_sum <
+				wts->prev_window_cpu[task_cpu]) {
+			printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_nt_prs=%llu is lesser than task_contrib=%llu",
+					p->pid, src_rq->cpu, dest_rq->cpu,
+					src_wrq->nt_prev_runnable_sum,
+					wts->prev_window_cpu[task_cpu]);
+			walt_task_dump(p);
+			SCHED_BUG_ON(1);
+		}
+		src_wrq->nt_prev_runnable_sum -=
+				wts->prev_window_cpu[task_cpu];
+	}
+
+	wts->curr_window_cpu[task_cpu] = 0;
+	wts->prev_window_cpu[task_cpu] = 0;
+
+	update_cluster_load_subtractions(p, task_cpu,
+			src_wrq->window_start, new_task);
+}
+
+static u32 load_to_index(u32 load)
+{
+	u32 index = load / sched_load_granule;
+
+	return min(index, (u32)(NUM_LOAD_INDICES - 1));
+}
+
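+/*
+ * Move the task's entries in the top-tasks tables from the source runqueue
+ * to the destination runqueue and update both CPUs' top indices.
+ */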
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+	int index;
+	int top_index;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	u32 curr_window = wts->curr_window;
+	u32 prev_window = wts->prev_window;
+	struct walt_rq *dst_wrq = (struct walt_rq *) dst_rq->android_vendor_data1;
+	struct walt_rq *src_wrq = (struct walt_rq *) src_rq->android_vendor_data1;
+	u8 src = src_wrq->curr_table;
+	u8 dst = dst_wrq->curr_table;
+	u8 *src_table;
+	u8 *dst_table;
+
+	if (curr_window) {
+		src_table = src_wrq->top_tasks[src];
+		dst_table = dst_wrq->top_tasks[dst];
+		index = load_to_index(curr_window);
+		src_table[index] -= 1;
+		dst_table[index] += 1;
+
+		if (!src_table[index])
+			__clear_bit(NUM_LOAD_INDICES - index - 1,
+				src_wrq->top_tasks_bitmap[src]);
+
+		if (dst_table[index] == 1)
+			__set_bit(NUM_LOAD_INDICES - index - 1,
+				dst_wrq->top_tasks_bitmap[dst]);
+
+		if (index > dst_wrq->curr_top)
+			dst_wrq->curr_top = index;
+
+		top_index = src_wrq->curr_top;
+		if (index == top_index && !src_table[index])
+			src_wrq->curr_top = get_top_index(
+				src_wrq->top_tasks_bitmap[src], top_index);
+	}
+
+	if (prev_window) {
+		src = 1 - src;
+		dst = 1 - dst;
+		src_table = src_wrq->top_tasks[src];
+		dst_table = dst_wrq->top_tasks[dst];
+		index = load_to_index(prev_window);
+		src_table[index] -= 1;
+		dst_table[index] += 1;
+
+		if (!src_table[index])
+			__clear_bit(NUM_LOAD_INDICES - index - 1,
+				src_wrq->top_tasks_bitmap[src]);
+
+		if (dst_table[index] == 1)
+			__set_bit(NUM_LOAD_INDICES - index - 1,
+				dst_wrq->top_tasks_bitmap[dst]);
+
+		if (index > dst_wrq->prev_top)
+			dst_wrq->prev_top = index;
+
+		top_index = src_wrq->prev_top;
+		if (index == top_index && !src_table[index])
+			src_wrq->prev_top = get_top_index(
+				src_wrq->top_tasks_bitmap[src], top_index);
+	}
+}
+
+static inline bool is_new_task(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->active_time < NEW_TASK_ACTIVE_TIME;
+}
+
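+/*
+ * Fix up WALT busy-time accounting when a task migrates: update both
+ * runqueues to the current wallclock, then move the task's window
+ * contributions (or its related-group contributions) to the new CPU.
+ */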
+static void fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+	struct rq *src_rq = task_rq(p);
+	struct rq *dest_rq = cpu_rq(new_cpu);
+	u64 wallclock;
+	u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+	bool new_task;
+	struct walt_related_thread_group *grp;
+	long pstate;
+	struct walt_rq *dest_wrq = (struct walt_rq *) dest_rq->android_vendor_data1;
+	struct walt_rq *src_wrq = (struct walt_rq *) src_rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (!p->on_rq && p->state != TASK_WAKING)
+		return;
+
+	pstate = p->state;
+
+	if (pstate == TASK_WAKING)
+		double_rq_lock(src_rq, dest_rq);
+
+	wallclock = sched_ktime_clock();
+
+	walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+			 TASK_UPDATE,
+			 wallclock, 0);
+	walt_update_task_ravg(dest_rq->curr, dest_rq,
+			 TASK_UPDATE, wallclock, 0);
+
+	walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE,
+			 wallclock, 0);
+
+	update_task_cpu_cycles(p, new_cpu, wallclock);
+
+	new_task = is_new_task(p);
+	/* Protected by rq_lock */
+	grp = wts->grp;
+
+	/*
+	 * For frequency aggregation, we continue to do migration fixups
+	 * even for intra-cluster migrations, because the aggregated
+	 * load has to be reported on a single CPU regardless.
+	 */
+	if (grp) {
+		struct group_cpu_time *cpu_time;
+
+		cpu_time = &src_wrq->grp_time;
+		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+		src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+		cpu_time = &dest_wrq->grp_time;
+		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+		if (wts->curr_window) {
+			*src_curr_runnable_sum -= wts->curr_window;
+			*dst_curr_runnable_sum += wts->curr_window;
+			if (new_task) {
+				*src_nt_curr_runnable_sum -= wts->curr_window;
+				*dst_nt_curr_runnable_sum += wts->curr_window;
+			}
+		}
+
+		if (wts->prev_window) {
+			*src_prev_runnable_sum -= wts->prev_window;
+			*dst_prev_runnable_sum += wts->prev_window;
+			if (new_task) {
+				*src_nt_prev_runnable_sum -= wts->prev_window;
+				*dst_nt_prev_runnable_sum += wts->prev_window;
+			}
+		}
+	} else {
+		inter_cluster_migration_fixup(p, new_cpu,
+						task_cpu(p), new_task);
+	}
+
+	migrate_top_tasks(p, src_rq, dest_rq);
+
+	if (!same_freq_domain(new_cpu, task_cpu(p))) {
+		src_wrq->notif_pending = true;
+		dest_wrq->notif_pending = true;
+		walt_irq_work_queue(&walt_migration_irq_work);
+	}
+
+	if (is_ed_enabled()) {
+		if (p == src_wrq->ed_task) {
+			src_wrq->ed_task = NULL;
+			dest_wrq->ed_task = p;
+		} else if (is_ed_task(p, wallclock)) {
+			dest_wrq->ed_task = p;
+		}
+	}
+
+	if (pstate == TASK_WAKING)
+		double_rq_unlock(src_rq, dest_rq);
+}
+
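+/*
+ * Lazily initialize a runqueue's window_start: the first CPU through here
+ * becomes the sync CPU; all others copy its window_start under both rq locks.
+ */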
+static void set_window_start(struct rq *rq)
+{
+	static int sync_cpu_available;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_rq *sync_wrq;
+	struct walt_task_struct *wts = (struct walt_task_struct *) rq->curr->android_vendor_data1;
+
+	if (likely(wrq->window_start))
+		return;
+
+	if (!sync_cpu_available) {
+		wrq->window_start = 1;
+		sync_cpu_available = 1;
+		atomic64_set(&walt_irq_work_lastq_ws, wrq->window_start);
+		walt_load_reported_window =
+					atomic64_read(&walt_irq_work_lastq_ws);
+
+	} else {
+		struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));
+
+		sync_wrq = (struct walt_rq *) sync_rq->android_vendor_data1;
+		raw_spin_unlock(&rq->lock);
+		double_rq_lock(rq, sync_rq);
+		wrq->window_start = sync_wrq->window_start;
+		wrq->curr_runnable_sum = wrq->prev_runnable_sum = 0;
+		wrq->nt_curr_runnable_sum = wrq->nt_prev_runnable_sum = 0;
+		raw_spin_unlock(&sync_rq->lock);
+	}
+
+	wts->mark_start = wrq->window_start;
+}
+
+#define INC_STEP 8
+#define DEC_STEP 2
+#define CONSISTENT_THRES 16
+#define INC_STEP_BIG 16
+/*
+ * bucket_increase - update the count of all buckets
+ *
+ * @buckets: array of buckets tracking busy time of a task
+ * @idx: the index of bucket to be incremented
+ *
+ * Each time a complete window finishes, the count of the bucket that the
+ * runtime falls into (@idx) is incremented. The counts of all other buckets
+ * are decayed. The rate of increase and decay can differ based on the
+ * current count in the bucket.
+ */
+static inline void bucket_increase(u8 *buckets, int idx)
+{
+	int i, step;
+
+	for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
+		if (idx != i) {
+			if (buckets[i] > DEC_STEP)
+				buckets[i] -= DEC_STEP;
+			else
+				buckets[i] = 0;
+		} else {
+			step = buckets[i] >= CONSISTENT_THRES ?
+						INC_STEP_BIG : INC_STEP;
+			if (buckets[i] > U8_MAX - step)
+				buckets[i] = U8_MAX;
+			else
+				buckets[i] += step;
+		}
+	}
+}
+
+static inline int busy_to_bucket(u32 normalized_rt)
+{
+	int bidx;
+
+	bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
+	bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
+
+	/*
+	 * Combine the lowest two buckets. The lowest frequency falls into
+	 * the 2nd bucket, so continuing to predict the lowest bucket is
+	 * not useful.
+	 */
+	if (!bidx)
+		bidx++;
+
+	return bidx;
+}
+
+/*
+ * get_pred_busy - calculate predicted demand for a task on runqueue
+ *
+ * @p: task whose prediction is being updated
+ * @start: starting bucket. The returned prediction should not be lower
+ *         than this bucket.
+ * @runtime: runtime of the task. The returned prediction should not be
+ *           lower than this runtime.
+ * Note: @start can be derived from @runtime. It's passed in only to
+ * avoid duplicated calculation in some cases.
+ *
+ * A new predicted busy time is returned for task @p based on the @runtime
+ * passed in. The function searches through buckets that represent busy
+ * time equal to or greater than @runtime and attempts to find the bucket
+ * to use for prediction. Once found, it searches through the historical
+ * busy time and returns the most recent sample that falls into the bucket.
+ * If no such busy time exists, it returns the midpoint of that bucket.
+ */
+static u32 get_pred_busy(struct task_struct *p,
+				int start, u32 runtime)
+{
+	int i;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	u8 *buckets = wts->busy_buckets;
+	u32 *hist = wts->sum_history;
+	u32 dmin, dmax;
+	u64 cur_freq_runtime = 0;
+	int first = NUM_BUSY_BUCKETS, final;
+	u32 ret = runtime;
+
+	/* skip prediction for new tasks due to lack of history */
+	if (unlikely(is_new_task(p)))
+		goto out;
+
+	/* find minimal bucket index to pick */
+	for (i = start; i < NUM_BUSY_BUCKETS; i++) {
+		if (buckets[i]) {
+			first = i;
+			break;
+		}
+	}
+	/* if no higher buckets are filled, predict runtime */
+	if (first >= NUM_BUSY_BUCKETS)
+		goto out;
+
+	/* compute the bucket for prediction */
+	final = first;
+
+	/* determine demand range for the predicted bucket */
+	if (final < 2) {
+		/* lowest two buckets are combined */
+		dmin = 0;
+		final = 1;
+	} else {
+		dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
+	}
+	dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
+
+	/*
+	 * search through runtime history and return first runtime that falls
+	 * into the range of predicted bucket.
+	 */
+	for (i = 0; i < sched_ravg_hist_size; i++) {
+		if (hist[i] >= dmin && hist[i] < dmax) {
+			ret = hist[i];
+			break;
+		}
+	}
+	/* no historical runtime within the bucket found, use the bucket midpoint */
+	if (ret < dmin)
+		ret = (dmin + dmax) / 2;
+	/*
+	 * When updating in the middle of a window, the runtime could be
+	 * higher than all recorded history. Always predict at least runtime.
+	 */
+	ret = max(runtime, ret);
+out:
+	trace_sched_update_pred_demand(p, runtime,
+		mult_frac((unsigned int)cur_freq_runtime, 100,
+			  sched_ravg_window), ret, wts);
+	return ret;
+}
+
+static inline u32 calc_pred_demand(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (wts->pred_demand >= wts->curr_window)
+		return wts->pred_demand;
+
+	return get_pred_busy(p, busy_to_bucket(wts->curr_window),
+			     wts->curr_window);
+}
+
+/*
+ * The predictive demand of a task is calculated at window rollover.
+ * If the task's current-window busy time exceeds the predicted
+ * demand, update it here to reflect the task's needs.
+ */
+static void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
+{
+	u32 new, old;
+	u16 new_scaled;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (!sched_predl)
+		return;
+
+	if (is_idle_task(p))
+		return;
+
+	if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
+			(!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
+			 (event != TASK_MIGRATE &&
+			 event != PICK_NEXT_TASK)))
+		return;
+
+	/*
+	 * TASK_UPDATE can be called on a sleeping task when it is moved
+	 * between related thread groups.
+	 */
+	if (event == TASK_UPDATE) {
+		if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
+			return;
+	}
+
+	new = calc_pred_demand(p);
+	old = wts->pred_demand;
+
+	if (old >= new)
+		return;
+
+	new_scaled = scale_demand(new);
+	if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+				!p->dl.dl_throttled))
+		fixup_walt_sched_stats_common(rq, p,
+				wts->demand_scaled,
+				new_scaled);
+
+	wts->pred_demand = new;
+	wts->pred_demand_scaled = new_scaled;
+}
+
+static void clear_top_tasks_bitmap(unsigned long *bitmap)
+{
+	memset(bitmap, 0, top_tasks_bitmap_size);
+	__set_bit(NUM_LOAD_INDICES, bitmap);
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+	memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
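+/*
+ * Move the task's entry in the current/previous top-tasks tables to reflect
+ * its updated window contribution, keeping the bitmaps and the cached
+ * curr_top/prev_top indices consistent.
+ */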
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+		u32 old_curr_window, int new_window, bool full_window)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	u8 curr = wrq->curr_table;
+	u8 prev = 1 - curr;
+	u8 *curr_table = wrq->top_tasks[curr];
+	u8 *prev_table = wrq->top_tasks[prev];
+	int old_index, new_index, update_index;
+	u32 curr_window = wts->curr_window;
+	u32 prev_window = wts->prev_window;
+	bool zero_index_update;
+
+	if (old_curr_window == curr_window && !new_window)
+		return;
+
+	old_index = load_to_index(old_curr_window);
+	new_index = load_to_index(curr_window);
+
+	if (!new_window) {
+		zero_index_update = !old_curr_window && curr_window;
+		if (old_index != new_index || zero_index_update) {
+			if (old_curr_window)
+				curr_table[old_index] -= 1;
+			if (curr_window)
+				curr_table[new_index] += 1;
+			if (new_index > wrq->curr_top)
+				wrq->curr_top = new_index;
+		}
+
+		if (!curr_table[old_index])
+			__clear_bit(NUM_LOAD_INDICES - old_index - 1,
+				wrq->top_tasks_bitmap[curr]);
+
+		if (curr_table[new_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - new_index - 1,
+				wrq->top_tasks_bitmap[curr]);
+
+		return;
+	}
+
+	/*
+	 * The window has rolled over for this task. By the time we get
+	 * here, the curr/prev swap has already occurred. So we need
+	 * to use prev_window for the new index.
+	 */
+	update_index = load_to_index(prev_window);
+
+	if (full_window) {
+		/*
+		 * Two cases here. Either 'p' ran for the entire window or
+		 * it didn't run at all. In either case there is no entry
+		 * in the prev table. If 'p' ran the entire window, we just
+		 * need to create a new entry in the prev table. In this case
+		 * update_index will correspond to sched_ravg_window,
+		 * so we can unconditionally update the top index.
+		 */
+		if (prev_window) {
+			prev_table[update_index] += 1;
+			wrq->prev_top = update_index;
+		}
+
+		if (prev_table[update_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - update_index - 1,
+				wrq->top_tasks_bitmap[prev]);
+	} else {
+		zero_index_update = !old_curr_window && prev_window;
+		if (old_index != update_index || zero_index_update) {
+			if (old_curr_window)
+				prev_table[old_index] -= 1;
+
+			prev_table[update_index] += 1;
+
+			if (update_index > wrq->prev_top)
+				wrq->prev_top = update_index;
+
+			if (!prev_table[old_index])
+				__clear_bit(NUM_LOAD_INDICES - old_index - 1,
+						wrq->top_tasks_bitmap[prev]);
+
+			if (prev_table[update_index] == 1)
+				__set_bit(NUM_LOAD_INDICES - update_index - 1,
+						wrq->top_tasks_bitmap[prev]);
+		}
+	}
+
+	if (curr_window) {
+		curr_table[new_index] += 1;
+
+		if (new_index > wrq->curr_top)
+			wrq->curr_top = new_index;
+
+		if (curr_table[new_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - new_index - 1,
+				wrq->top_tasks_bitmap[curr]);
+	}
+}
+
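+/*
+ * Swap the current and previous top-tasks tables at window rollover and
+ * clear the table that becomes current.
+ */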
+static void rollover_top_tasks(struct rq *rq, bool full_window)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u8 curr_table = wrq->curr_table;
+	u8 prev_table = 1 - curr_table;
+	int curr_top = wrq->curr_top;
+
+	clear_top_tasks_table(wrq->top_tasks[prev_table]);
+	clear_top_tasks_bitmap(wrq->top_tasks_bitmap[prev_table]);
+
+	if (full_window) {
+		curr_top = 0;
+		clear_top_tasks_table(wrq->top_tasks[curr_table]);
+		clear_top_tasks_bitmap(wrq->top_tasks_bitmap[curr_table]);
+	}
+
+	wrq->curr_table = prev_table;
+	wrq->prev_top = curr_top;
+	wrq->curr_top = 0;
+}
+
+static u32 empty_windows[WALT_NR_CPUS];
+
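+/*
+ * At window rollover, move the task's current-window contributions (total
+ * and per-CPU) into the previous window, or zero them if at least one full
+ * window elapsed with no activity.
+ */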
+static void rollover_task_window(struct task_struct *p, bool full_window)
+{
+	u32 *curr_cpu_windows = empty_windows;
+	u32 curr_window;
+	int i;
+	struct walt_rq *wrq = (struct walt_rq *) task_rq(p)->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	/* Rollover the sum */
+	curr_window = 0;
+
+	if (!full_window) {
+		curr_window = wts->curr_window;
+		curr_cpu_windows = wts->curr_window_cpu;
+	}
+
+	wts->prev_window = curr_window;
+	wts->curr_window = 0;
+
+	/* Roll over individual CPU contributions */
+	for (i = 0; i < nr_cpu_ids; i++) {
+		wts->prev_window_cpu[i] = curr_cpu_windows[i];
+		wts->curr_window_cpu[i] = 0;
+	}
+
+	if (is_new_task(p))
+		wts->active_time += wrq->prev_window_size;
+}
+
+static inline int cpu_is_waiting_on_io(struct rq *rq)
+{
+	if (!sched_io_is_busy)
+		return 0;
+
+	return atomic_read(&rq->nr_iowait);
+}
+
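+/*
+ * Decide whether the elapsed time should be charged to the CPU's busy
+ * counters for this @event, taking irqtime and iowait into account for the
+ * idle task.
+ */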
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+				     u64 irqtime, int event)
+{
+	if (is_idle_task(p)) {
+		/* TASK_WAKE and TASK_MIGRATE are not possible for the idle task! */
+		if (event == PICK_NEXT_TASK)
+			return 0;
+
+		/* PUT_PREV_TASK, TASK_UPDATE and IRQ_UPDATE are left */
+		return irqtime || cpu_is_waiting_on_io(rq);
+	}
+
+	if (event == TASK_WAKE)
+		return 0;
+
+	if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
+		return 1;
+
+	/*
+	 * TASK_UPDATE can be called on a sleeping task when it is moved
+	 * between related thread groups.
+	 */
+	if (event == TASK_UPDATE) {
+		if (rq->curr == p)
+			return 1;
+
+		return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
+	}
+
+	/* TASK_MIGRATE, PICK_NEXT_TASK left */
+	return SCHED_FREQ_ACCOUNT_WAIT_TIME;
+}
+
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
+
+static inline u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	return (delta * wrq->task_exec_scale) >> 10;
+}
+
+/*
+ * Convert busy time to a frequency equivalent.
+ * Assumes load is scaled to 1024.
+ */
+static inline unsigned int load_to_freq(struct rq *rq, unsigned int load)
+{
+	return mult_frac(cpu_max_possible_freq(cpu_of(rq)), load,
+		 (unsigned int)arch_scale_cpu_capacity(cpu_of(rq)));
+}
+
+static bool do_pl_notif(struct rq *rq)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 prev = wrq->old_busy_time;
+	u64 pl = wrq->walt_stats.pred_demands_sum_scaled;
+	int cpu = cpu_of(rq);
+
+	/* If already at max freq, bail out */
+	if (capacity_orig_of(cpu) == capacity_curr_of(cpu))
+		return false;
+
+	prev = max(prev, wrq->old_estimated_time);
+
+	/* 400 MHz filter. */
+	return (pl > prev) && (load_to_freq(rq, pl - prev) > 400000);
+}
+
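+/*
+ * At window rollover, make the current runnable sums (CPU and group) the
+ * previous ones, or zero them if a full window elapsed, and reset the
+ * current sums.
+ */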
+static void rollover_cpu_window(struct rq *rq, bool full_window)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 curr_sum = wrq->curr_runnable_sum;
+	u64 nt_curr_sum = wrq->nt_curr_runnable_sum;
+	u64 grp_curr_sum = wrq->grp_time.curr_runnable_sum;
+	u64 grp_nt_curr_sum = wrq->grp_time.nt_curr_runnable_sum;
+
+	if (unlikely(full_window)) {
+		curr_sum = 0;
+		nt_curr_sum = 0;
+		grp_curr_sum = 0;
+		grp_nt_curr_sum = 0;
+	}
+
+	wrq->prev_runnable_sum = curr_sum;
+	wrq->nt_prev_runnable_sum = nt_curr_sum;
+	wrq->grp_time.prev_runnable_sum = grp_curr_sum;
+	wrq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;
+
+	wrq->curr_runnable_sum = 0;
+	wrq->nt_curr_runnable_sum = 0;
+	wrq->grp_time.curr_runnable_sum = 0;
+	wrq->grp_time.nt_curr_runnable_sum = 0;
+}
+
+/*
+ * Account CPU activity in its busy time counters
+ * (wrq->curr/prev_runnable_sum).
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+				 int event, u64 wallclock, u64 irqtime)
+{
+	int new_window, full_window = 0;
+	int p_is_curr_task = (p == rq->curr);
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	u64 mark_start = wts->mark_start;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 window_start = wrq->window_start;
+	u32 window_size = wrq->prev_window_size;
+	u64 delta;
+	u64 *curr_runnable_sum = &wrq->curr_runnable_sum;
+	u64 *prev_runnable_sum = &wrq->prev_runnable_sum;
+	u64 *nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum;
+	u64 *nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum;
+	bool new_task;
+	struct walt_related_thread_group *grp;
+	int cpu = rq->cpu;
+	u32 old_curr_window = wts->curr_window;
+
+	new_window = mark_start < window_start;
+	if (new_window)
+		full_window = (window_start - mark_start) >= window_size;
+
+	/*
+	 * Handle per-task window rollover. We don't care about the
+	 * idle task.
+	 */
+	if (!is_idle_task(p)) {
+		if (new_window)
+			rollover_task_window(p, full_window);
+	}
+
+	new_task = is_new_task(p);
+
+	if (p_is_curr_task && new_window) {
+		rollover_cpu_window(rq, full_window);
+		rollover_top_tasks(rq, full_window);
+	}
+
+	if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+		goto done;
+
+	grp = wts->grp;
+	if (grp) {
+		struct group_cpu_time *cpu_time = &wrq->grp_time;
+
+		curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+		nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+	}
+
+	if (!new_window) {
+		/*
+		 * account_busy_for_cpu_time() = 1 so busy time needs
+		 * to be accounted to the current window. No rollover
+		 * since we didn't start a new window. An example of this is
+		 * when a task starts execution and then sleeps within the
+		 * same window.
+		 */
+
+		if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+			delta = wallclock - mark_start;
+		else
+			delta = irqtime;
+		delta = scale_exec_time(delta, rq);
+		*curr_runnable_sum += delta;
+		if (new_task)
+			*nt_curr_runnable_sum += delta;
+
+		if (!is_idle_task(p)) {
+			wts->curr_window += delta;
+			wts->curr_window_cpu[cpu] += delta;
+		}
+
+		goto done;
+	}
+
+	if (!p_is_curr_task) {
+		/*
+		 * account_busy_for_cpu_time() = 1 so busy time needs
+		 * to be accounted to the current window. A new window
+		 * has also started, but p is not the current task, so the
+		 * window is not rolled over - just split up and account
+		 * as necessary into curr and prev. The window is only
+		 * rolled over when a new window is processed for the current
+		 * task.
+		 *
+		 * Irqtime can't be accounted by a task that isn't the
+		 * currently running task.
+		 */
+
+		if (!full_window) {
+			/*
+			 * A full window hasn't elapsed, account partial
+			 * contribution to previous completed window.
+			 */
+			delta = scale_exec_time(window_start - mark_start, rq);
+			wts->prev_window += delta;
+			wts->prev_window_cpu[cpu] += delta;
+		} else {
+			/*
+			 * Since at least one full window has elapsed,
+			 * the contribution to the previous window is the
+			 * full window (window_size).
+			 */
+			delta = scale_exec_time(window_size, rq);
+			wts->prev_window = delta;
+			wts->prev_window_cpu[cpu] = delta;
+		}
+
+		*prev_runnable_sum += delta;
+		if (new_task)
+			*nt_prev_runnable_sum += delta;
+
+		/* Account piece of busy time in the current window. */
+		delta = scale_exec_time(wallclock - window_start, rq);
+		*curr_runnable_sum += delta;
+		if (new_task)
+			*nt_curr_runnable_sum += delta;
+
+		wts->curr_window = delta;
+		wts->curr_window_cpu[cpu] = delta;
+
+		goto done;
+	}
+
+	if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+		/*
+		 * account_busy_for_cpu_time() = 1 so busy time needs
+		 * to be accounted to the current window. A new window
+		 * has started and p is the current task so rollover is
+		 * needed. If any of these three above conditions are true
+		 * then this busy time can't be accounted as irqtime.
+		 *
+		 * Busy time for the idle task need not be accounted.
+		 *
+		 * An example of this would be a task that starts execution
+		 * and then sleeps once a new window has begun.
+		 */
+
+		if (!full_window) {
+			/*
+			 * A full window hasn't elapsed, account partial
+			 * contribution to previous completed window.
+			 */
+			delta = scale_exec_time(window_start - mark_start, rq);
+			if (!is_idle_task(p)) {
+				wts->prev_window += delta;
+				wts->prev_window_cpu[cpu] += delta;
+			}
+		} else {
+			/*
+			 * Since at least one full window has elapsed,
+			 * the contribution to the previous window is the
+			 * full window (window_size).
+			 */
+			delta = scale_exec_time(window_size, rq);
+			if (!is_idle_task(p)) {
+				wts->prev_window = delta;
+				wts->prev_window_cpu[cpu] = delta;
+			}
+		}
+
+		/*
+		 * Rollover is done here by overwriting the values in
+		 * prev_runnable_sum and curr_runnable_sum.
+		 */
+		*prev_runnable_sum += delta;
+		if (new_task)
+			*nt_prev_runnable_sum += delta;
+
+		/* Account piece of busy time in the current window. */
+		delta = scale_exec_time(wallclock - window_start, rq);
+		*curr_runnable_sum += delta;
+		if (new_task)
+			*nt_curr_runnable_sum += delta;
+
+		if (!is_idle_task(p)) {
+			wts->curr_window = delta;
+			wts->curr_window_cpu[cpu] = delta;
+		}
+
+		goto done;
+	}
+
+	if (irqtime) {
+		/*
+		 * account_busy_for_cpu_time() = 1 so busy time needs
+		 * to be accounted to the current window. A new window
+		 * has started and p is the current task so rollover is
+		 * needed. The current task must be the idle task because
+		 * irqtime is not accounted for any other task.
+		 *
+		 * Irqtime will be accounted each time we process IRQ activity
+		 * after a period of idleness, so we know the IRQ busy time
+		 * started at wallclock - irqtime.
+		 */
+
+		SCHED_BUG_ON(!is_idle_task(p));
+		mark_start = wallclock - irqtime;
+
+		/*
+		 * Roll window over. If IRQ busy time was just in the current
+		 * window then that is all that needs to be accounted.
+		 */
+		if (mark_start > window_start) {
+			*curr_runnable_sum = scale_exec_time(irqtime, rq);
+			return;
+		}
+
+		/*
+		 * The IRQ busy time spanned multiple windows. Process the
+		 * busy time preceding the current window start first.
+		 */
+		delta = window_start - mark_start;
+		if (delta > window_size)
+			delta = window_size;
+		delta = scale_exec_time(delta, rq);
+		*prev_runnable_sum += delta;
+
+		/* Process the remaining IRQ busy time in the current window. */
+		delta = wallclock - window_start;
+		wrq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+		return;
+	}
+
+done:
+	if (!is_idle_task(p))
+		update_top_tasks(p, rq, old_curr_window,
+					new_window, full_window);
+}
+
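+/*
+ * Predict the task's demand from @runtime and bump the bucket that this
+ * window's runtime falls into.
+ */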
+static inline u32 predict_and_update_buckets(
+			struct task_struct *p, u32 runtime) {
+	int bidx;
+	u32 pred_demand;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (!sched_predl)
+		return 0;
+
+	bidx = busy_to_bucket(runtime);
+	pred_demand = get_pred_busy(p, bidx, runtime);
+	bucket_increase(wts->busy_buckets, bidx);
+
+	return pred_demand;
+}
+
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
+{
+	/*
+	 * No need to bother updating task demand for the idle task.
+	 */
+	if (is_idle_task(p))
+		return 0;
+
+	/*
+	 * When a task is waking up it is completing a segment of non-busy
+	 * time. Likewise, if wait time is not treated as busy time, then
+	 * when a task begins to run or is migrated, it is not running and
+	 * is completing a segment of non-busy time.
+	 */
+	if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
+			 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+		return 0;
+
+	/*
+	 * The idle exit time is not accounted to the first task _picked_ to
+	 * run on the idle CPU.
+	 */
+	if (event == PICK_NEXT_TASK && rq->curr == rq->idle)
+		return 0;
+
+	/*
+	 * TASK_UPDATE can be called on a sleeping task when it is moved
+	 * between related thread groups.
+	 */
+	if (event == TASK_UPDATE) {
+		if (rq->curr == p)
+			return 1;
+
+		return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Called when a new window is starting for a task, to record CPU usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+			 u32 runtime, int samples, int event)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	u32 *hist = &wts->sum_history[0];
+	int ridx, widx;
+	u32 max = 0, avg, demand, pred_demand;
+	u64 sum = 0;
+	u16 demand_scaled, pred_demand_scaled;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	/* Ignore windows where task had no activity */
+	if (!runtime || is_idle_task(p) || !samples)
+		goto done;
+
+	/* Push new 'runtime' value onto stack */
+	widx = sched_ravg_hist_size - 1;
+	ridx = widx - samples;
+	for (; ridx >= 0; --widx, --ridx) {
+		hist[widx] = hist[ridx];
+		sum += hist[widx];
+		if (hist[widx] > max)
+			max = hist[widx];
+	}
+
+	for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
+		hist[widx] = runtime;
+		sum += hist[widx];
+		if (hist[widx] > max)
+			max = hist[widx];
+	}
+
+	wts->sum = 0;
+
+	if (sysctl_sched_window_stats_policy == WINDOW_STATS_RECENT) {
+		demand = runtime;
+	} else if (sysctl_sched_window_stats_policy == WINDOW_STATS_MAX) {
+		demand = max;
+	} else {
+		avg = div64_u64(sum, sched_ravg_hist_size);
+		if (sysctl_sched_window_stats_policy == WINDOW_STATS_AVG)
+			demand = avg;
+		else
+			demand = max(avg, runtime);
+	}
+	pred_demand = predict_and_update_buckets(p, runtime);
+	demand_scaled = scale_demand(demand);
+	pred_demand_scaled = scale_demand(pred_demand);
+
+	/*
+	 * A throttled deadline sched class task gets dequeued without
+	 * changing p->on_rq. Since the dequeue decrements WALT stats,
+	 * avoid decrementing them here again.
+	 *
+	 * When the window is rolled over, the cumulative window demand
+	 * is reset to the cumulative runnable average (contribution from
+	 * the tasks on the runqueue). If the current task has already been
+	 * dequeued, its demand is not included in the cumulative runnable
+	 * average. So add the task demand separately to the cumulative window
+	 * demand.
+	 */
+	if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+		if (task_on_rq_queued(p))
+			fixup_walt_sched_stats_common(rq, p,
+					demand_scaled, pred_demand_scaled);
+	}
+
+	wts->demand = demand;
+	wts->demand_scaled = demand_scaled;
+	wts->coloc_demand = div64_u64(sum, sched_ravg_hist_size);
+	wts->pred_demand = pred_demand;
+	wts->pred_demand_scaled = pred_demand_scaled;
+
+	if (demand_scaled > sysctl_sched_min_task_util_for_colocation)
+		wts->unfilter = sysctl_sched_task_unfilter_period;
+	else
+		if (wts->unfilter)
+			wts->unfilter = max_t(int, 0,
+				wts->unfilter - wrq->prev_window_size);
+
+done:
+	trace_sched_update_history(rq, p, runtime, samples, event, wrq, wts);
+}
+
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	delta = scale_exec_time(delta, rq);
+	wts->sum += delta;
+	if (unlikely(wts->sum > sched_ravg_window))
+		wts->sum = sched_ravg_window;
+
+	return delta;
+}
+
+/*
+ * Account the CPU demand of a task and/or update the task's CPU demand history
+ *
+ * ms = wts->mark_start;
+ * wc = wallclock
+ * ws = wrq->window_start
+ *
+ * Three possibilities:
+ *
+ *	a) Task event is contained within one window.
+ *		window_start < mark_start < wallclock
+ *
+ *		ws   ms  wc
+ *		|    |   |
+ *		V    V   V
+ *		|---------------|
+ *
+ *	In this case, wts->sum is updated *iff* event is appropriate
+ *	(ex: event == PUT_PREV_TASK)
+ *
+ *	b) Task event spans two windows.
+ *		mark_start < window_start < wallclock
+ *
+ *		ms   ws   wc
+ *		|    |    |
+ *		V    V    V
+ *		-----|-------------------
+ *
+ *	In this case, wts->sum is updated with (ws - ms) *iff* event
+ *	is appropriate, then a new window sample is recorded followed
+ *	by wts->sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ *	c) Task event spans more than two windows.
+ *
+ *		ms ws_tmp			   ws  wc
+ *		|  |				   |   |
+ *		V  V				   V   V
+ *		---|-------|-------|-------|-------|------
+ *		   |				   |
+ *		   |<------ nr_full_windows ------>|
+ *
+ *	In this case, wts->sum is updated with (ws_tmp - ms) first *iff*
+ *	event is appropriate, window sample of wts->sum is recorded,
+ *	'nr_full_window' samples of window_size is also recorded *iff*
+ *	event is appropriate and finally wts->sum is set to (wc - ws)
+ *	*iff* event is appropriate.
+ *
+ * IMPORTANT : Leave wts->mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
+			       int event, u64 wallclock)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	u64 mark_start = wts->mark_start;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 delta, window_start = wrq->window_start;
+	int new_window, nr_full_windows;
+	u32 window_size = sched_ravg_window;
+	u64 runtime;
+
+	new_window = mark_start < window_start;
+	if (!account_busy_for_task_demand(rq, p, event)) {
+		if (new_window)
+			/*
+			 * If this time isn't being accounted as busy
+			 * time, and a new window started, only the
+			 * previous window needs to be closed out with the
+			 * pre-existing demand. Multiple windows may have
+			 * elapsed, but since empty windows are dropped,
+			 * it is not necessary to account those.
+			 */
+			update_history(rq, p, wts->sum, 1, event);
+		return 0;
+	}
+
+	if (!new_window) {
+		/*
+		 * The simple case - busy time contained within the existing
+		 * window.
+		 */
+		return add_to_task_demand(rq, p, wallclock - mark_start);
+	}
+
+	/*
+	 * Busy time spans at least two windows. Temporarily rewind
+	 * window_start to first window boundary after mark_start.
+	 */
+	delta = window_start - mark_start;
+	nr_full_windows = div64_u64(delta, window_size);
+	window_start -= (u64)nr_full_windows * (u64)window_size;
+
+	/* Process (window_start - mark_start) first */
+	runtime = add_to_task_demand(rq, p, window_start - mark_start);
+
+	/* Push new sample(s) into task's demand history */
+	update_history(rq, p, wts->sum, 1, event);
+	if (nr_full_windows) {
+		u64 scaled_window = scale_exec_time(window_size, rq);
+
+		update_history(rq, p, scaled_window, nr_full_windows, event);
+		runtime += nr_full_windows * scaled_window;
+	}
+
+	/*
+	 * Roll window_start back to current to process any remainder
+	 * in current window.
+	 */
+	window_start += (u64)nr_full_windows * (u64)window_size;
+
+	/* Process (wallclock - window_start) next */
+	mark_start = window_start;
+	runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+	return runtime;
+}
+
+static inline unsigned int cpu_cur_freq(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return wrq->cluster->cur_freq;
+}
+
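+/*
+ * Derive the runqueue's task_exec_scale from either the cluster's current
+ * frequency or, when the cycle counter is in use, the cycles executed per
+ * unit time since the task's mark_start.
+ */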
+static void
+update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
+			  u64 wallclock, u64 irqtime)
+{
+	u64 cur_cycles;
+	u64 cycles_delta;
+	u64 time_delta;
+	int cpu = cpu_of(rq);
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (!use_cycle_counter) {
+		wrq->task_exec_scale = DIV64_U64_ROUNDUP(cpu_cur_freq(cpu) *
+				arch_scale_cpu_capacity(cpu),
+				wrq->cluster->max_possible_freq);
+		return;
+	}
+
+	cur_cycles = read_cycle_counter(cpu, wallclock);
+
+	/*
+	 * If the current task is the idle task and irqtime == 0, the CPU
+	 * was indeed idle and its cycle counter was probably not
+	 * increasing.  We still need an estimated CPU frequency
+	 * for IO wait time accounting.  Use the previously
+	 * calculated frequency in such a case.
+	 */
+	if (!is_idle_task(rq->curr) || irqtime) {
+		if (unlikely(cur_cycles < wts->cpu_cycles))
+			cycles_delta = cur_cycles + (U64_MAX -
+				wts->cpu_cycles);
+		else
+			cycles_delta = cur_cycles - wts->cpu_cycles;
+		cycles_delta = cycles_delta * NSEC_PER_MSEC;
+
+		if (event == IRQ_UPDATE && is_idle_task(p))
+			/*
+			 * The time between the idle task's mark_start and IRQ
+			 * handler entry is a CPU cycle counter stall period.
+			 * Upon IRQ handler entry, walt_sched_account_irqstart()
+			 * replenishes the idle task's CPU cycle counter, so
+			 * cycles_delta now represents cycles accrued during the
+			 * IRQ handler rather than the time between idle entry
+			 * and IRQ exit.  Thus use irqtime as the time delta.
+			 */
+			time_delta = irqtime;
+		else
+			time_delta = wallclock - wts->mark_start;
+		SCHED_BUG_ON((s64)time_delta < 0);
+
+		wrq->task_exec_scale = DIV64_U64_ROUNDUP(cycles_delta *
+				arch_scale_cpu_capacity(cpu),
+				time_delta *
+					wrq->cluster->max_possible_freq);
+
+		trace_sched_get_task_cpu_cycles(cpu, event,
+				cycles_delta, time_delta, p);
+	}
+
+	wts->cpu_cycles = cur_cycles;
+}
+
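+/*
+ * If this CPU rolled the window over, race (via cmpxchg) to be the one that
+ * queues the cpufreq irq work for the new window.
+ */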
+static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq)
+{
+	u64 result;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	if (old_window_start == wrq->window_start)
+		return;
+
+	result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start,
+				   wrq->window_start);
+	if (result == old_window_start)
+		walt_irq_work_queue(&walt_cpufreq_irq_work);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+static void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+						u64 wallclock, u64 irqtime)
+{
+	u64 old_window_start;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (!wrq->window_start || wts->mark_start == wallclock)
+		return;
+
+	lockdep_assert_held(&rq->lock);
+
+	old_window_start = update_window_start(rq, wallclock, event);
+
+	if (!wts->mark_start) {
+		update_task_cpu_cycles(p, cpu_of(rq), wallclock);
+		goto done;
+	}
+
+	update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
+	update_task_demand(p, rq, event, wallclock);
+	update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+	update_task_pred_demand(rq, p, event);
+
+	trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
+				&wrq->grp_time, wrq, wts);
+	trace_sched_update_task_ravg_mini(p, rq, event, wallclock, irqtime,
+				&wrq->grp_time, wrq, wts);
+
+done:
+	wts->mark_start = wallclock;
+
+	run_walt_irq_work(old_window_start, rq);
+}
+
+u32 sched_get_init_task_load(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->init_load_pct;
+}
+
+int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (init_load_pct < 0 || init_load_pct > 100)
+		return -EINVAL;
+
+	wts->init_load_pct = init_load_pct;
+
+	return 0;
+}
+
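+/*
+ * Initialize a new task's WALT state and seed its demand history with the
+ * configured initial task load (inherited from the parent's init_load_pct
+ * when set).
+ */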
+static void init_new_task_load(struct task_struct *p)
+{
+	int i;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	struct walt_task_struct *cur_wts =
+		(struct walt_task_struct *) current->android_vendor_data1;
+	u32 init_load_windows = sched_init_task_load_windows;
+	u32 init_load_windows_scaled = sched_init_task_load_windows_scaled;
+	u32 init_load_pct = cur_wts->init_load_pct;
+
+	wts->init_load_pct = 0;
+	rcu_assign_pointer(wts->grp, NULL);
+	INIT_LIST_HEAD(&wts->grp_list);
+
+	wts->mark_start = 0;
+	wts->sum = 0;
+	wts->curr_window = 0;
+	wts->prev_window = 0;
+	wts->active_time = 0;
+	for (i = 0; i < NUM_BUSY_BUCKETS; ++i)
+		wts->busy_buckets[i] = 0;
+
+	wts->cpu_cycles = 0;
+
+	memset(wts->curr_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS);
+	memset(wts->prev_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS);
+
+	if (init_load_pct) {
+		init_load_windows = div64_u64((u64)init_load_pct *
+			  (u64)sched_ravg_window, 100);
+		init_load_windows_scaled = scale_demand(init_load_windows);
+	}
+
+	wts->demand = init_load_windows;
+	wts->demand_scaled = init_load_windows_scaled;
+	wts->coloc_demand = init_load_windows;
+	wts->pred_demand = 0;
+	wts->pred_demand_scaled = 0;
+	for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+		wts->sum_history[i] = init_load_windows;
+	wts->misfit = false;
+	wts->rtg_high_prio = false;
+	wts->unfilter = sysctl_sched_task_unfilter_period;
+}
+
+static void init_existing_task_load(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	init_new_task_load(p);
+	cpumask_copy(&wts->cpus_requested, &p->cpus_mask);
+}
+
+static void walt_task_dead(struct task_struct *p)
+{
+	sched_set_group_id(p, 0);
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+	int i = 0;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	memset(wts->curr_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS);
+	memset(wts->prev_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS);
+
+	wts->mark_start = 0;
+	wts->sum = 0;
+	wts->demand = 0;
+	wts->coloc_demand = 0;
+	for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+		wts->sum_history[i] = 0;
+	wts->curr_window = 0;
+	wts->prev_window = 0;
+	wts->pred_demand = 0;
+	for (i = 0; i < NUM_BUSY_BUCKETS; ++i)
+		wts->busy_buckets[i] = 0;
+	wts->demand_scaled = 0;
+	wts->pred_demand_scaled = 0;
+	wts->active_time = 0;
+}
+
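+/*
+ * Stamp a newly running task's mark_start and wakeup timestamps with the
+ * current wallclock once the runqueue's window accounting is up.
+ */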
+static void mark_task_starting(struct task_struct *p)
+{
+	u64 wallclock;
+	struct rq *rq = task_rq(p);
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (!wrq->window_start) {
+		reset_task_stats(p);
+		return;
+	}
+
+	wallclock = sched_ktime_clock();
+	wts->mark_start = wts->last_wake_ts = wallclock;
+	wts->last_enqueued_ts = wallclock;
+	update_task_cpu_cycles(p, cpu_of(rq), wallclock);
+}
+
+/*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate = 20000000;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate = 19000000;
+
+void walt_update_group_thresholds(void)
+{
+	unsigned int min_scale = arch_scale_cpu_capacity(
+				cluster_first_cpu(sched_cluster[0]));
+	u64 min_ms = min_scale * (sched_ravg_window >> SCHED_CAPACITY_SHIFT);
+
+	sched_group_upmigrate = div64_ul(min_ms *
+				sysctl_sched_group_upmigrate_pct, 100);
+	sched_group_downmigrate = div64_ul(min_ms *
+				sysctl_sched_group_downmigrate_pct, 100);
+}
+
+struct walt_sched_cluster *sched_cluster[WALT_NR_CPUS];
+__read_mostly int num_sched_clusters;
+
+struct list_head cluster_head;
+
+static struct walt_sched_cluster init_cluster = {
+	.list			= LIST_HEAD_INIT(init_cluster.list),
+	.id			= 0,
+	.cur_freq		= 1,
+	.max_possible_freq	= 1,
+	.aggr_grp_load		= 0,
+};
+
+static void init_clusters(void)
+{
+	init_cluster.cpus = *cpu_possible_mask;
+	raw_spin_lock_init(&init_cluster.load_lock);
+	INIT_LIST_HEAD(&cluster_head);
+	list_add(&init_cluster.list, &cluster_head);
+}
+
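+/* Insert the cluster into the list in ascending order of CPU capacity. */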
+static void
+insert_cluster(struct walt_sched_cluster *cluster, struct list_head *head)
+{
+	struct walt_sched_cluster *tmp;
+	struct list_head *iter = head;
+
+	list_for_each_entry(tmp, head, list) {
+		if (arch_scale_cpu_capacity(cluster_first_cpu(cluster))
+			< arch_scale_cpu_capacity(cluster_first_cpu(tmp)))
+			break;
+		iter = &tmp->list;
+	}
+
+	list_add(&cluster->list, iter);
+}
+
+static struct walt_sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
+{
+	struct walt_sched_cluster *cluster = NULL;
+
+	cluster = kzalloc(sizeof(struct walt_sched_cluster), GFP_ATOMIC);
+	BUG_ON(!cluster);
+
+	INIT_LIST_HEAD(&cluster->list);
+	cluster->cur_freq		=	1;
+	cluster->max_possible_freq	=	1;
+
+	raw_spin_lock_init(&cluster->load_lock);
+	cluster->cpus = *cpus;
+
+	return cluster;
+}
+
+static void add_cluster(const struct cpumask *cpus, struct list_head *head)
+{
+	struct walt_sched_cluster *cluster = alloc_new_cluster(cpus);
+	int i;
+	struct walt_rq *wrq;
+
+	for_each_cpu(i, cpus) {
+		wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+		wrq->cluster = cluster;
+	}
+
+	insert_cluster(cluster, head);
+	num_sched_clusters++;
+}
+
+static void cleanup_clusters(struct list_head *head)
+{
+	struct walt_sched_cluster *cluster, *tmp;
+	int i;
+	struct walt_rq *wrq;
+
+	list_for_each_entry_safe(cluster, tmp, head, list) {
+		for_each_cpu(i, &cluster->cpus) {
+			wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+			wrq->cluster = &init_cluster;
+		}
+		list_del(&cluster->list);
+		num_sched_clusters--;
+		kfree(cluster);
+	}
+}
+
+static inline void assign_cluster_ids(struct list_head *head)
+{
+	struct walt_sched_cluster *cluster;
+	int pos = 0;
+
+	list_for_each_entry(cluster, head, list) {
+		cluster->id = pos;
+		sched_cluster[pos++] = cluster;
+	}
+
+	WARN_ON(pos > MAX_NR_CLUSTERS);
+}
+
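+/*
+ * Splice @src onto @dst, optionally waiting for an RCU grace period so that
+ * concurrent readers of the old list finish before it is reused.
+ */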
+static inline void
+move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
+{
+	struct list_head *first, *last;
+
+	first = src->next;
+	last = src->prev;
+
+	if (sync_rcu) {
+		INIT_LIST_HEAD_RCU(src);
+		synchronize_rcu();
+	}
+
+	first->prev = dst;
+	dst->prev = last;
+	last->next = dst;
+
+	/* Ensure list sanity before making the head visible to all CPUs. */
+	smp_mb();
+	dst->next = first;
+}
+
+static void update_all_clusters_stats(void)
+{
+	struct walt_sched_cluster *cluster;
+	u64 highest_mpc = 0, lowest_mpc = U64_MAX;
+
+	for_each_sched_cluster(cluster) {
+		u64 mpc = arch_scale_cpu_capacity(
+				cluster_first_cpu(cluster));
+
+		if (mpc > highest_mpc)
+			highest_mpc = mpc;
+
+		if (mpc < lowest_mpc)
+			lowest_mpc = mpc;
+	}
+
+	max_possible_capacity = highest_mpc;
+	min_max_possible_capacity = lowest_mpc;
+	walt_update_group_thresholds();
+}
+
+static bool walt_clusters_parsed;
+cpumask_t __read_mostly **cpu_array;
+
+static void init_cpu_array(void)
+{
+	int i;
+
+	cpu_array = kcalloc(num_sched_clusters, sizeof(cpumask_t *),
+			GFP_ATOMIC | __GFP_NOFAIL);
+	if (!cpu_array)
+		SCHED_BUG_ON(1);
+
+	for (i = 0; i < num_sched_clusters; i++) {
+		cpu_array[i] = kcalloc(num_sched_clusters, sizeof(cpumask_t),
+			GFP_ATOMIC | __GFP_NOFAIL);
+		if (!cpu_array[i])
+			SCHED_BUG_ON(1);
+	}
+}
+
+static void build_cpu_array(void)
+{
+	int i;
+
+	if (!cpu_array)
+		SCHED_BUG_ON(1);
+	/* Construct cpu_array row by row */
+	for (i = 0; i < num_sched_clusters; i++) {
+		int j, k = 1;
+
+		/* Fill the first column with this cluster's own CPUs */
+		cpumask_copy(&cpu_array[i][0], &sched_cluster[i]->cpus);
+		/*
+		 * k starts from column 1 because column 0 is already filled.
+		 * Fill the rest of the row with the clusters above i, in
+		 * ascending capacity order.
+		 */
+		for (j = i + 1; j < num_sched_clusters; j++) {
+			cpumask_copy(&cpu_array[i][k],
+					&sched_cluster[j]->cpus);
+			k++;
+		}
+
+		/*
+		 * k starts from where we left off above.
+		 * Fill clusters below i in descending order.
+		 */
+		for (j = i - 1; j >= 0; j--) {
+			cpumask_copy(&cpu_array[i][k],
+					&sched_cluster[j]->cpus);
+			k++;
+		}
+	}
+}
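+/*
+ * Illustration: with three clusters ordered by capacity (C0 < C1 < C2),
+ * build_cpu_array() produces:
+ *
+ *	row 0: { C0, C1, C2 }
+ *	row 1: { C1, C2, C0 }
+ *	row 2: { C2, C1, C0 }
+ *
+ * i.e. row i starts with cluster i's CPUs, followed by the larger clusters
+ * in ascending order and then the smaller clusters in descending order.
+ */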
+
+static void walt_get_possible_siblings(int cpuid, struct cpumask *cluster_cpus)
+{
+	int cpu;
+	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
+
+	if (cpuid_topo->package_id == -1)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cpu_topo = &cpu_topology[cpu];
+
+		if (cpuid_topo->package_id != cpu_topo->package_id)
+			continue;
+		cpumask_set_cpu(cpu, cluster_cpus);
+	}
+}
+
+static void walt_update_cluster_topology(void)
+{
+	struct cpumask cpus = *cpu_possible_mask;
+	struct cpumask cluster_cpus;
+	struct walt_sched_cluster *cluster;
+	struct list_head new_head;
+	int i;
+	struct walt_rq *wrq;
+
+	INIT_LIST_HEAD(&new_head);
+
+	for_each_cpu(i, &cpus) {
+		cpumask_clear(&cluster_cpus);
+		walt_get_possible_siblings(i, &cluster_cpus);
+		if (cpumask_empty(&cluster_cpus)) {
+			WARN(1, "WALT: Invalid cpu topology!!");
+			cleanup_clusters(&new_head);
+			return;
+		}
+		cpumask_andnot(&cpus, &cpus, &cluster_cpus);
+		add_cluster(&cluster_cpus, &new_head);
+	}
+
+	assign_cluster_ids(&new_head);
+
+	list_for_each_entry(cluster, &new_head, list) {
+		struct cpufreq_policy *policy;
+
+		policy = cpufreq_cpu_get_raw(cluster_first_cpu(cluster));
+		/*
+		 * walt_update_cluster_topology() must be called AFTER policies
+		 * for all cpus are initialized. If not, simply BUG().
+		 */
+		SCHED_BUG_ON(!policy);
+
+		if (policy) {
+			cluster->max_possible_freq = policy->cpuinfo.max_freq;
+			for_each_cpu(i, &cluster->cpus) {
+				wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+				cpumask_copy(&wrq->freq_domain_cpumask,
+					     policy->related_cpus);
+			}
+			cpuinfo_max_freq_cached = max(cpuinfo_max_freq_cached,
+						      policy->cpuinfo.max_freq);
+		}
+	}
+
+	/*
+	 * Ensure cluster ids are visible to all CPUs before making
+	 * cluster_head visible.
+	 */
+	move_list(&cluster_head, &new_head, false);
+	update_all_clusters_stats();
+	cluster = NULL;
+
+	for_each_sched_cluster(cluster) {
+		if (cpumask_weight(&cluster->cpus) == 1)
+			cpumask_or(&asym_cap_sibling_cpus,
+				   &asym_cap_sibling_cpus, &cluster->cpus);
+	}
+
+	if (cpumask_weight(&asym_cap_sibling_cpus) == 1)
+		cpumask_clear(&asym_cap_sibling_cpus);
+
+	init_cpu_array();
+	build_cpu_array();
+
+	walt_clusters_parsed = true;
+}
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+		unsigned long val, void *data)
+{
+	struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+	unsigned int cpu = freq->policy->cpu, new_freq = freq->new;
+	unsigned long flags;
+	struct walt_sched_cluster *cluster;
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+	struct cpumask policy_cpus = wrq->freq_domain_cpumask;
+	int i, j;
+
+	if (use_cycle_counter)
+		return NOTIFY_DONE;
+	wrq = (struct walt_rq *) cpu_rq(cpumask_first(&policy_cpus))->android_vendor_data1;
+	if (wrq->cluster == &init_cluster)
+		return NOTIFY_DONE;
+
+	if (val != CPUFREQ_POSTCHANGE)
+		return NOTIFY_DONE;
+
+	if (cpu_cur_freq(cpu) == new_freq)
+		return NOTIFY_OK;
+
+	for_each_cpu(i, &policy_cpus) {
+		wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+		cluster = wrq->cluster;
+
+		for_each_cpu(j, &cluster->cpus) {
+			struct rq *rq = cpu_rq(j);
+
+			raw_spin_lock_irqsave(&rq->lock, flags);
+			walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+					 sched_ktime_clock(), 0);
+			raw_spin_unlock_irqrestore(&rq->lock, flags);
+		}
+
+		cluster->cur_freq = new_freq;
+		cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block notifier_trans_block = {
+	.notifier_call = cpufreq_notifier_trans
+};
+
+static void walt_init_cycle_counter(void)
+{
+	if (qcom_cpufreq_get_cpu_cycle_counter(smp_processor_id()) != U64_MAX) {
+		use_cycle_counter = true;
+		return;
+	}
+
+	cpufreq_register_notifier(&notifier_trans_block,
+				  CPUFREQ_TRANSITION_NOTIFIER);
+}
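+/*
+ * Note: when the per-CPU cycle counter is unavailable (the read above returns
+ * U64_MAX), cluster->cur_freq is instead tracked via the cpufreq transition
+ * notifier registered here (cpufreq_notifier_trans()).
+ */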
+
+static void transfer_busy_time(struct rq *rq,
+				struct walt_related_thread_group *grp,
+					struct task_struct *p, int event);
+
+/*
+ * Enable colocation and frequency aggregation for all threads in a process.
+ * Children inherit the group id from the parent.
+ */
+
+struct walt_related_thread_group
+			*related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+
+static inline
+void update_best_cluster(struct walt_related_thread_group *grp,
+				   u64 demand, bool boost)
+{
+	if (boost) {
+		/*
+		 * Since we are in boost, we can keep the group on min; the
+		 * boost itself will ensure tasks get to the big CPUs.
+		 */
+		grp->skip_min = false;
+		return;
+	}
+
+	if (is_suh_max())
+		demand = sched_group_upmigrate;
+
+	if (!grp->skip_min) {
+		if (demand >= sched_group_upmigrate)
+			grp->skip_min = true;
+		return;
+	}
+	if (demand < sched_group_downmigrate) {
+		if (!sysctl_sched_coloc_downmigrate_ns) {
+			grp->skip_min = false;
+			return;
+		}
+		if (!grp->downmigrate_ts) {
+			grp->downmigrate_ts = grp->last_update;
+			return;
+		}
+		if (grp->last_update - grp->downmigrate_ts >
+				sysctl_sched_coloc_downmigrate_ns) {
+			grp->downmigrate_ts = 0;
+			grp->skip_min = false;
+		}
+	} else if (grp->downmigrate_ts)
+		grp->downmigrate_ts = 0;
+}
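+/*
+ * Illustration of the hysteresis above: skip_min flips to true once the
+ * group's combined demand reaches sched_group_upmigrate, and flips back to
+ * false only after demand has stayed below sched_group_downmigrate for
+ * sysctl_sched_coloc_downmigrate_ns (or immediately if that tunable is 0).
+ */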
+
+static void _set_preferred_cluster(struct walt_related_thread_group *grp)
+{
+	struct task_struct *p;
+	u64 combined_demand = 0;
+	bool group_boost = false;
+	u64 wallclock;
+	bool prev_skip_min = grp->skip_min;
+	struct walt_task_struct *wts;
+	struct list_head *task_list;
+
+	if (list_empty(&grp->tasks)) {
+		grp->skip_min = false;
+		goto out;
+	}
+
+	if (!hmp_capable()) {
+		grp->skip_min = false;
+		goto out;
+	}
+
+	wallclock = sched_ktime_clock();
+
+	/*
+	 * Wakeups of two or more related tasks could race with each other and
+	 * result in multiple calls to _set_preferred_cluster() being issued
+	 * at the same time. Avoid the overhead of re-evaluating the preferred
+	 * cluster in such cases.
+	 */
+	if (wallclock - grp->last_update < sched_ravg_window / 10)
+		return;
+
+	list_for_each(task_list, &grp->tasks) {
+		p = (struct task_struct *) task_list;
+		wts = (struct walt_task_struct *) p->android_vendor_data1;
+		if (task_boost_policy(p) == SCHED_BOOST_ON_BIG) {
+			group_boost = true;
+			break;
+		}
+
+		if (wts->mark_start < wallclock -
+		    (sched_ravg_window * sched_ravg_hist_size))
+			continue;
+
+		combined_demand += wts->coloc_demand;
+		if (!trace_sched_set_preferred_cluster_enabled()) {
+			if (combined_demand > sched_group_upmigrate)
+				break;
+		}
+	}
+
+	grp->last_update = wallclock;
+	update_best_cluster(grp, combined_demand, group_boost);
+	trace_sched_set_preferred_cluster(grp, combined_demand);
+
+out:
+	if (grp->id == DEFAULT_CGROUP_COLOC_ID
+			&& grp->skip_min != prev_skip_min) {
+		if (grp->skip_min)
+			grp->start_ts = sched_clock();
+		sched_update_hyst_times();
+	}
+}
+
+static void set_preferred_cluster(struct walt_related_thread_group *grp)
+{
+	raw_spin_lock(&grp->lock);
+	_set_preferred_cluster(grp);
+	raw_spin_unlock(&grp->lock);
+}
+
+static int update_preferred_cluster(struct walt_related_thread_group *grp,
+		struct task_struct *p, u32 old_load, bool from_tick)
+{
+	u32 new_load = task_load(p);
+
+	if (!grp)
+		return 0;
+
+	if (unlikely(from_tick && is_suh_max()))
+		return 1;
+
+	/*
+	 * Update if task's load has changed significantly or a complete window
+	 * has passed since we last updated preference
+	 */
+	if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+		sched_ktime_clock() - grp->last_update > sched_ravg_window)
+		return 1;
+
+	return 0;
+}
+
+#define ADD_TASK	0
+#define REM_TASK	1
+
+static inline struct walt_related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
+{
+	return related_thread_groups[group_id];
+}
+
+static int alloc_related_thread_groups(void)
+{
+	int i;
+	struct walt_related_thread_group *grp;
+
+	/* group_id 0 is invalid; it is the special id used to remove a task from its group. */
+	for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+		grp = kzalloc(sizeof(*grp), GFP_ATOMIC | GFP_NOWAIT);
+		BUG_ON(!grp);
+
+		grp->id = i;
+		INIT_LIST_HEAD(&grp->tasks);
+		INIT_LIST_HEAD(&grp->list);
+		raw_spin_lock_init(&grp->lock);
+
+		related_thread_groups[i] = grp;
+	}
+
+	return 0;
+}
+
+static void remove_task_from_group(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	struct walt_related_thread_group *grp = wts->grp;
+	struct rq *rq;
+	int empty_group = 1;
+	struct rq_flags rf;
+
+	raw_spin_lock(&grp->lock);
+
+	rq = __task_rq_lock(p, &rf);
+	transfer_busy_time(rq, wts->grp, p, REM_TASK);
+	list_del_init(&wts->grp_list);
+	rcu_assign_pointer(wts->grp, NULL);
+	__task_rq_unlock(rq, &rf);
+
+	if (!list_empty(&grp->tasks)) {
+		empty_group = 0;
+		_set_preferred_cluster(grp);
+	}
+
+	raw_spin_unlock(&grp->lock);
+
+	/* Reserved groups cannot be destroyed */
+	if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+		/*
+		 * We check whether grp->list is attached with list_empty(),
+		 * hence re-initialize the list after deletion.
+		 */
+		list_del_init(&grp->list);
+}
+
+static int
+add_task_to_group(struct task_struct *p, struct walt_related_thread_group *grp)
+{
+	struct rq *rq;
+	struct rq_flags rf;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	raw_spin_lock(&grp->lock);
+
+	/*
+	 * Change wts->grp under rq->lock. This prevents races with read-side
+	 * references of wts->grp in various hot paths.
+	 */
+	rq = __task_rq_lock(p, &rf);
+	transfer_busy_time(rq, grp, p, ADD_TASK);
+	list_add(&wts->grp_list, &grp->tasks);
+	rcu_assign_pointer(wts->grp, grp);
+	__task_rq_unlock(rq, &rf);
+
+	_set_preferred_cluster(grp);
+
+	raw_spin_unlock(&grp->lock);
+
+	return 0;
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static inline bool uclamp_task_colocated(struct task_struct *p)
+{
+	struct cgroup_subsys_state *css;
+	struct task_group *tg;
+	bool colocate;
+	struct walt_task_group *wtg;
+
+	rcu_read_lock();
+	css = task_css(p, cpu_cgrp_id);
+	if (!css) {
+		rcu_read_unlock();
+		return false;
+	}
+	tg = container_of(css, struct task_group, css);
+	wtg = (struct walt_task_group *) tg->android_vendor_data1;
+	colocate = wtg->colocate;
+	rcu_read_unlock();
+
+	return colocate;
+}
+#else
+static inline bool uclamp_task_colocated(struct task_struct *p)
+{
+	return false;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
+static void add_new_task_to_grp(struct task_struct *new)
+{
+	unsigned long flags;
+	struct walt_related_thread_group *grp;
+	struct walt_task_struct *wts = (struct walt_task_struct *) new->android_vendor_data1;
+
+	/*
+	 * If the task does not belong to the colocated cgroup, there is
+	 * nothing to do. We are checking this without the lock; even if
+	 * there is a race, the task will be added to the colocated group
+	 * via cgroup attach.
+	 */
+	if (!uclamp_task_colocated(new))
+		return;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	write_lock_irqsave(&related_thread_group_lock, flags);
+
+	/*
+	 * It's possible that someone has already added the new task to the
+	 * group, or that it has been taken out of the colocated cgroup.
+	 * Re-check these conditions under the lock.
+	 */
+	if (!uclamp_task_colocated(new) || wts->grp) {
+		write_unlock_irqrestore(&related_thread_group_lock, flags);
+		return;
+	}
+
+	raw_spin_lock(&grp->lock);
+
+	rcu_assign_pointer(wts->grp, grp);
+	list_add(&wts->grp_list, &grp->tasks);
+
+	raw_spin_unlock(&grp->lock);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+}
+
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+	int rc = 0;
+	unsigned long flags;
+	struct walt_related_thread_group *grp = NULL;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	write_lock(&related_thread_group_lock);
+
+	/* Switching from one group to another directly is not permitted */
+	if ((!wts->grp && !group_id) || (wts->grp && group_id))
+		goto done;
+
+	if (!group_id) {
+		remove_task_from_group(p);
+		goto done;
+	}
+
+	grp = lookup_related_thread_group(group_id);
+	if (list_empty(&grp->list))
+		list_add(&grp->list, &active_related_thread_groups);
+
+	rc = add_task_to_group(p, grp);
+done:
+	write_unlock(&related_thread_group_lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return rc;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+	/* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+	if (group_id == DEFAULT_CGROUP_COLOC_ID)
+		return -EINVAL;
+
+	return __sched_set_group_id(p, group_id);
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+	unsigned int group_id;
+	struct walt_related_thread_group *grp;
+
+	rcu_read_lock();
+	grp = task_related_thread_group(p);
+	group_id = grp ? grp->id : 0;
+	rcu_read_unlock();
+
+	return group_id;
+}
+
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as a
+ * runtime optimization.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int create_default_coloc_group(void)
+{
+	struct walt_related_thread_group *grp = NULL;
+	unsigned long flags;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	write_lock_irqsave(&related_thread_group_lock, flags);
+	list_add(&grp->list, &active_related_thread_groups);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+	return 0;
+}
+
+static int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+	unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+	return __sched_set_group_id(p, grp_id);
+}
+
+static void android_rvh_cpu_cgroup_attach(void *unused,
+						struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+	bool colocate;
+	struct task_group *tg;
+	struct walt_task_group *wtg;
+
+	cgroup_taskset_first(tset, &css);
+	if (!css)
+		return;
+
+	tg = container_of(css, struct task_group, css);
+	wtg = (struct walt_task_group *) tg->android_vendor_data1;
+	colocate = wtg->colocate;
+
+	cgroup_taskset_for_each(task, css, tset)
+		sync_cgroup_colocation(task, colocate);
+}
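+/*
+ * Note: on cgroup attach, every task in the taskset is moved into the
+ * reserved DEFAULT_CGROUP_COLOC_ID group when the destination cpu cgroup has
+ * colocate set, and out of it (group id 0) otherwise.
+ */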
+
+static bool is_cluster_hosting_top_app(struct walt_sched_cluster *cluster)
+{
+	struct walt_related_thread_group *grp;
+	bool grp_on_min;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+
+	if (!grp)
+		return false;
+
+	grp_on_min = !grp->skip_min &&
+			(sched_boost_policy() != SCHED_BOOST_ON_BIG);
+
+	return (is_min_capacity_cluster(cluster) == grp_on_min);
+}
+
+static void note_task_waking(struct task_struct *p, u64 wallclock)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	wts->last_wake_ts = wallclock;
+}
+
+/*
+ * Task's cpu usage is accounted in:
+ *	wrq->curr/prev_runnable_sum, when its ->grp is NULL
+ *	grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
+ *
+ * Transfer task's cpu usage between those counters when transitioning between
+ * groups
+ */
+static void transfer_busy_time(struct rq *rq,
+				struct walt_related_thread_group *grp,
+					struct task_struct *p, int event)
+{
+	u64 wallclock;
+	struct group_cpu_time *cpu_time;
+	u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
+	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
+	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
+	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
+	int migrate_type;
+	int cpu = cpu_of(rq);
+	bool new_task;
+	int i;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	wallclock = sched_ktime_clock();
+
+	walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+	walt_update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
+	new_task = is_new_task(p);
+
+	cpu_time = &wrq->grp_time;
+	if (event == ADD_TASK) {
+		migrate_type = RQ_TO_GROUP;
+
+		src_curr_runnable_sum = &wrq->curr_runnable_sum;
+		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		src_prev_runnable_sum = &wrq->prev_runnable_sum;
+		dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+
+		src_nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum;
+		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		src_nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum;
+		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+		if (*src_curr_runnable_sum < wts->curr_window_cpu[cpu]) {
+			printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_crs=%llu is lesser than task_contrib=%llu",
+					p->pid, cpu, event, *src_curr_runnable_sum,
+					wts->curr_window_cpu[cpu]);
+			walt_task_dump(p);
+			SCHED_BUG_ON(1);
+		}
+		*src_curr_runnable_sum -= wts->curr_window_cpu[cpu];
+
+		if (*src_prev_runnable_sum < wts->prev_window_cpu[cpu]) {
+			printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_prs=%llu is lesser than task_contrib=%llu",
+					p->pid, cpu, event, *src_prev_runnable_sum,
+					wts->prev_window_cpu[cpu]);
+			walt_task_dump(p);
+			SCHED_BUG_ON(1);
+		}
+		*src_prev_runnable_sum -= wts->prev_window_cpu[cpu];
+
+		if (new_task) {
+			if (*src_nt_curr_runnable_sum <
+					wts->curr_window_cpu[cpu]) {
+				printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_crs=%llu is lesser than task_contrib=%llu",
+						p->pid, cpu, event,
+						*src_nt_curr_runnable_sum,
+						wts->curr_window_cpu[cpu]);
+				walt_task_dump(p);
+				SCHED_BUG_ON(1);
+			}
+			*src_nt_curr_runnable_sum -=
+					wts->curr_window_cpu[cpu];
+
+			if (*src_nt_prev_runnable_sum <
+					wts->prev_window_cpu[cpu]) {
+				printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_prs=%llu is lesser than task_contrib=%llu",
+						p->pid, cpu, event,
+						*src_nt_prev_runnable_sum,
+						wts->prev_window_cpu[cpu]);
+				walt_task_dump(p);
+				SCHED_BUG_ON(1);
+			}
+			*src_nt_prev_runnable_sum -=
+					wts->prev_window_cpu[cpu];
+		}
+
+		update_cluster_load_subtractions(p, cpu,
+				wrq->window_start, new_task);
+
+	} else {
+		migrate_type = GROUP_TO_RQ;
+
+		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
+		dst_curr_runnable_sum = &wrq->curr_runnable_sum;
+		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
+		dst_prev_runnable_sum = &wrq->prev_runnable_sum;
+
+		src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
+		dst_nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum;
+		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+		dst_nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum;
+
+		if (*src_curr_runnable_sum < wts->curr_window) {
+			printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_crs=%llu is lesser than task_contrib=%llu",
+					p->pid, cpu, event, *src_curr_runnable_sum,
+					wts->curr_window);
+			walt_task_dump(p);
+			SCHED_BUG_ON(1);
+		}
+		*src_curr_runnable_sum -= wts->curr_window;
+
+		if (*src_prev_runnable_sum < wts->prev_window) {
+			printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_prs=%llu is lesser than task_contrib=%llu",
+					p->pid, cpu, event, *src_prev_runnable_sum,
+					wts->prev_window);
+			walt_task_dump(p);
+			SCHED_BUG_ON(1);
+		}
+		*src_prev_runnable_sum -= wts->prev_window;
+
+		if (new_task) {
+			if (*src_nt_curr_runnable_sum < wts->curr_window) {
+				printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_crs=%llu is lesser than task_contrib=%llu",
+						p->pid, cpu, event,
+						*src_nt_curr_runnable_sum,
+						wts->curr_window);
+				walt_task_dump(p);
+				SCHED_BUG_ON(1);
+			}
+			*src_nt_curr_runnable_sum -= wts->curr_window;
+
+			if (*src_nt_prev_runnable_sum < wts->prev_window) {
+				printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_prs=%llu is lesser than task_contrib=%llu",
+						p->pid, cpu, event,
+						*src_nt_prev_runnable_sum,
+						wts->prev_window);
+				walt_task_dump(p);
+				SCHED_BUG_ON(1);
+			}
+			*src_nt_prev_runnable_sum -= wts->prev_window;
+		}
+
+		/*
+		 * Need to reset curr/prev windows for all CPUs, not just the
+		 * ones in the same cluster. Since inter cluster migrations
+		 * ones in the same cluster. Since inter-cluster migrations
+		 * did not result in the appropriate bookkeeping, the values
+		 */
+		for_each_possible_cpu(i) {
+			wts->curr_window_cpu[i] = 0;
+			wts->prev_window_cpu[i] = 0;
+		}
+	}
+
+	*dst_curr_runnable_sum += wts->curr_window;
+	*dst_prev_runnable_sum += wts->prev_window;
+	if (new_task) {
+		*dst_nt_curr_runnable_sum += wts->curr_window;
+		*dst_nt_prev_runnable_sum += wts->prev_window;
+	}
+
+	/*
+	 * When a task enters or exits a group, its curr and prev windows are
+	 * moved to a single CPU. This behavior might be sub-optimal in the
+	 * exit case; however, it saves us the overhead of handling inter-
+	 * cluster migration fixups while the task is part of a related group.
+	 */
+	wts->curr_window_cpu[cpu] = wts->curr_window;
+	wts->prev_window_cpu[cpu] = wts->prev_window;
+
+	trace_sched_migration_update_sum(p, migrate_type, rq);
+}
+
+bool is_rtgb_active(void)
+{
+	struct walt_related_thread_group *grp;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	return grp && grp->skip_min;
+}
+
+u64 get_rtgb_active_time(void)
+{
+	struct walt_related_thread_group *grp;
+	u64 now = sched_clock();
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+
+	if (grp && grp->skip_min && grp->start_ts)
+		return now - grp->start_ts;
+
+	return 0;
+}
+
+static void walt_init_window_dep(void);
+static void walt_tunables_fixup(void)
+{
+	if (likely(num_sched_clusters > 0))
+		walt_update_group_thresholds();
+	walt_init_window_dep();
+}
+
+static void walt_update_irqload(struct rq *rq)
+{
+	u64 irq_delta = 0;
+	unsigned int nr_windows = 0;
+	u64 cur_irq_time;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	u64 last_irq_window = READ_ONCE(wrq->last_irq_window);
+
+	if (wrq->window_start > last_irq_window)
+		nr_windows = div64_u64(wrq->window_start - last_irq_window,
+				       sched_ravg_window);
+
+	/* Decay the CPU's irqload by 3/4, or reset it if it is stale (>= 10 windows old). */
+	if (nr_windows < 10)
+		wrq->avg_irqload = mult_frac(wrq->avg_irqload, 3, 4);
+	else
+		wrq->avg_irqload = 0;
+
+	cur_irq_time = irq_time_read(cpu_of(rq));
+	if (cur_irq_time > wrq->prev_irq_time)
+		irq_delta = cur_irq_time - wrq->prev_irq_time;
+
+	wrq->avg_irqload += irq_delta;
+	wrq->prev_irq_time = cur_irq_time;
+
+	if (nr_windows < SCHED_HIGH_IRQ_TIMEOUT)
+		wrq->high_irqload = (wrq->avg_irqload >=
+					walt_cpu_high_irqload);
+	else
+		wrq->high_irqload = false;
+}
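+/*
+ * Note: a CPU is marked high_irqload when its decayed avg_irqload is at
+ * least walt_cpu_high_irqload (95% of the window, see walt_init_window_dep())
+ * and IRQ activity was seen within the last SCHED_HIGH_IRQ_TIMEOUT windows.
+ */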
+
+/*
+ * Runs in hard-irq context. This should ideally run just after the latest
+ * window roll-over.
+ */
+static void walt_irq_work(struct irq_work *irq_work)
+{
+	struct walt_sched_cluster *cluster;
+	struct rq *rq;
+	int cpu;
+	u64 wc;
+	bool is_migration = false, is_asym_migration = false;
+	u64 total_grp_load = 0, min_cluster_grp_load = 0;
+	int level = 0;
+	unsigned long flags;
+	struct walt_rq *wrq;
+
+	/* Am I the window rollover work or the migration work? */
+	if (irq_work == &walt_migration_irq_work)
+		is_migration = true;
+
+	for_each_cpu(cpu, cpu_possible_mask) {
+		if (level == 0)
+			raw_spin_lock(&cpu_rq(cpu)->lock);
+		else
+			raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
+		level++;
+	}
+
+	wc = sched_ktime_clock();
+	walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws);
+	for_each_sched_cluster(cluster) {
+		u64 aggr_grp_load = 0;
+
+		raw_spin_lock(&cluster->load_lock);
+
+		for_each_cpu(cpu, &cluster->cpus) {
+			rq = cpu_rq(cpu);
+			wrq = (struct walt_rq *) rq->android_vendor_data1;
+			if (rq->curr) {
+				walt_update_task_ravg(rq->curr, rq,
+						TASK_UPDATE, wc, 0);
+				account_load_subtractions(rq);
+				aggr_grp_load +=
+					wrq->grp_time.prev_runnable_sum;
+			}
+			if (is_migration && wrq->notif_pending &&
+			    cpumask_test_cpu(cpu, &asym_cap_sibling_cpus)) {
+				is_asym_migration = true;
+				wrq->notif_pending = false;
+			}
+		}
+
+		cluster->aggr_grp_load = aggr_grp_load;
+		total_grp_load += aggr_grp_load;
+
+		if (is_min_capacity_cluster(cluster))
+			min_cluster_grp_load = aggr_grp_load;
+		raw_spin_unlock(&cluster->load_lock);
+	}
+
+	if (total_grp_load) {
+		if (cpumask_weight(&asym_cap_sibling_cpus)) {
+			u64 big_grp_load =
+					  total_grp_load - min_cluster_grp_load;
+
+			for_each_cpu(cpu, &asym_cap_sibling_cpus)
+				cpu_cluster(cpu)->aggr_grp_load = big_grp_load;
+		}
+		rtgb_active = is_rtgb_active();
+	} else {
+		rtgb_active = false;
+	}
+
+	if (!is_migration && sysctl_sched_user_hint && time_after(jiffies,
+						sched_user_hint_reset_time))
+		sysctl_sched_user_hint = 0;
+
+	for_each_sched_cluster(cluster) {
+		cpumask_t cluster_online_cpus;
+		unsigned int num_cpus, i = 1;
+
+		cpumask_and(&cluster_online_cpus, &cluster->cpus,
+						cpu_online_mask);
+		num_cpus = cpumask_weight(&cluster_online_cpus);
+		for_each_cpu(cpu, &cluster_online_cpus) {
+			int wflag = 0;
+
+			/*
+			 * FIXME:
+			 *
+			 * For now supporting both schedutil and waltgov.
+			 * This is not by design but for convenience.
+			 */
+			rq = cpu_rq(cpu);
+			wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+			if (is_migration) {
+				if (wrq->notif_pending) {
+					wrq->notif_pending = false;
+
+					wflag |= WALT_CPUFREQ_IC_MIGRATION;
+				}
+			} else {
+				wflag |= WALT_CPUFREQ_ROLLOVER;
+			}
+
+			if (is_asym_migration && cpumask_test_cpu(cpu,
+							&asym_cap_sibling_cpus)) {
+				wflag |= WALT_CPUFREQ_IC_MIGRATION;
+			}
+
+			if (i == num_cpus)
+				waltgov_run_callback(cpu_rq(cpu), wflag);
+			else
+				waltgov_run_callback(cpu_rq(cpu), wflag |
+							WALT_CPUFREQ_CONTINUE);
+			i++;
+
+			if (!is_migration)
+				walt_update_irqload(rq);
+		}
+	}
+
+	/*
+	 * If a window change request is pending, this is a good place to
+	 * change sched_ravg_window since all rq locks are held.
+	 *
+	 * If the current window roll over is delayed such that the
+	 * mark_start (current wallclock with which roll over is done)
+	 * of the current task went past the window start with the
+	 * updated new window size, delay the update to the next
+	 * window roll over. Otherwise the CPU counters (prs and crs) are
+	 * not rolled over properly as mark_start > window_start.
+	 */
+	if (!is_migration) {
+		spin_lock_irqsave(&sched_ravg_window_lock, flags);
+		wrq = (struct walt_rq *) this_rq()->android_vendor_data1;
+		if ((sched_ravg_window != new_sched_ravg_window) &&
+		    (wc < wrq->window_start + new_sched_ravg_window)) {
+			sched_ravg_window_change_time = sched_ktime_clock();
+			trace_sched_ravg_window_change(sched_ravg_window,
+					new_sched_ravg_window,
+					sched_ravg_window_change_time);
+			sched_ravg_window = new_sched_ravg_window;
+			walt_tunables_fixup();
+		}
+		spin_unlock_irqrestore(&sched_ravg_window_lock, flags);
+	}
+
+	for_each_cpu(cpu, cpu_possible_mask)
+		raw_spin_unlock(&cpu_rq(cpu)->lock);
+
+	if (!is_migration) {
+		wrq = (struct walt_rq *) this_rq()->android_vendor_data1;
+		core_ctl_check(wrq->window_start);
+	}
+}
+
+void walt_rotation_checkpoint(int nr_big)
+{
+	if (!hmp_capable())
+		return;
+
+	if (!sysctl_sched_walt_rotate_big_tasks || sched_boost() != NO_BOOST) {
+		walt_rotation_enabled = 0;
+		return;
+	}
+
+	walt_rotation_enabled = nr_big >= num_possible_cpus();
+}
+
+void walt_fill_ta_data(struct core_ctl_notif_data *data)
+{
+	struct walt_related_thread_group *grp;
+	unsigned long flags;
+	u64 total_demand = 0, wallclock;
+	struct task_struct *p;
+	int min_cap_cpu, scale = 1024;
+	struct walt_sched_cluster *cluster;
+	int i = 0;
+	struct walt_task_struct *wts;
+	struct list_head *task_list;
+
+	grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+
+	raw_spin_lock_irqsave(&grp->lock, flags);
+	if (list_empty(&grp->tasks)) {
+		raw_spin_unlock_irqrestore(&grp->lock, flags);
+		goto fill_util;
+	}
+
+	wallclock = sched_ktime_clock();
+
+	list_for_each(task_list, &grp->tasks) {
+		p = (struct task_struct *) task_list;
+		wts = (struct walt_task_struct *) p->android_vendor_data1;
+		if (wts->mark_start < wallclock -
+		    (sched_ravg_window * sched_ravg_hist_size))
+			continue;
+
+		total_demand += wts->coloc_demand;
+	}
+
+	raw_spin_unlock_irqrestore(&grp->lock, flags);
+
+	/*
+	 * Scale the total demand to the lowest capacity CPU and
+	 * convert into percentage.
+	 *
+	 * P = total_demand/sched_ravg_window * 1024/scale * 100
+	 */
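+	/*
+	 * Worked example (hypothetical numbers): total_demand = 8ms,
+	 * sched_ravg_window = 20ms, scale = 512 ->
+	 * P = (8ms * 1024 * 100) / (20ms * 512) = 80, i.e. 80%.
+	 */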
+
+	min_cap_cpu = cpumask_first(&cpu_array[0][0]);
+	if (min_cap_cpu != -1)
+		scale = arch_scale_cpu_capacity(min_cap_cpu);
+
+	data->coloc_load_pct = div64_u64(total_demand * 1024 * 100,
+			       (u64)sched_ravg_window * scale);
+
+fill_util:
+	for_each_sched_cluster(cluster) {
+		int fcpu = cluster_first_cpu(cluster);
+
+		if (i == MAX_CLUSTERS)
+			break;
+
+		scale = arch_scale_cpu_capacity(fcpu);
+		data->ta_util_pct[i] = div64_u64(cluster->aggr_grp_load * 1024 *
+				       100, (u64)sched_ravg_window * scale);
+
+		scale = arch_scale_freq_capacity(fcpu);
+		data->cur_cap_pct[i] = (scale * 100)/1024;
+		i++;
+	}
+}
+
+static void walt_init_window_dep(void)
+{
+	walt_cpu_util_freq_divisor =
+	    (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100;
+	walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT;
+
+	sched_init_task_load_windows =
+		div64_u64((u64)sysctl_sched_init_task_load_pct *
+			  (u64)sched_ravg_window, 100);
+	sched_init_task_load_windows_scaled =
+		scale_demand(sched_init_task_load_windows);
+
+	walt_cpu_high_irqload = div64_u64((u64)sched_ravg_window * 95, (u64) 100);
+}
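+/*
+ * Illustration (hypothetical 20ms window): walt_scale_demand_divisor =
+ * 20000000 >> 10 = 19531, walt_cpu_util_freq_divisor = 19531 * 100, and
+ * walt_cpu_high_irqload = 19ms (95% of the window).
+ */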
+
+static void walt_init_once(void)
+{
+	init_irq_work(&walt_migration_irq_work, walt_irq_work);
+	init_irq_work(&walt_cpufreq_irq_work, walt_irq_work);
+	walt_init_window_dep();
+}
+
+static void walt_sched_init_rq(struct rq *rq)
+{
+	int j;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	if (cpu_of(rq) == 0)
+		walt_init_once();
+
+	cpumask_set_cpu(cpu_of(rq), &wrq->freq_domain_cpumask);
+
+	wrq->walt_stats.cumulative_runnable_avg_scaled = 0;
+	wrq->prev_window_size = sched_ravg_window;
+	wrq->window_start = 0;
+	wrq->walt_stats.nr_big_tasks = 0;
+	wrq->walt_flags = 0;
+	wrq->avg_irqload = 0;
+	wrq->prev_irq_time = 0;
+	wrq->last_irq_window = 0;
+	wrq->high_irqload = false;
+	wrq->task_exec_scale = 1024;
+	wrq->push_task = NULL;
+
+	/*
+	 * All CPUs are part of the same cluster by default. This avoids the
+	 * need to check for wrq->cluster being non-NULL in hot paths
+	 * like select_best_cpu().
+	 */
+	wrq->cluster = &init_cluster;
+	wrq->curr_runnable_sum = wrq->prev_runnable_sum = 0;
+	wrq->nt_curr_runnable_sum = wrq->nt_prev_runnable_sum = 0;
+	memset(&wrq->grp_time, 0, sizeof(struct group_cpu_time));
+	wrq->old_busy_time = 0;
+	wrq->old_estimated_time = 0;
+	wrq->walt_stats.pred_demands_sum_scaled = 0;
+	wrq->walt_stats.nr_rtg_high_prio_tasks = 0;
+	wrq->ed_task = NULL;
+	wrq->curr_table = 0;
+	wrq->prev_top = 0;
+	wrq->curr_top = 0;
+	wrq->last_cc_update = 0;
+	wrq->cycles = 0;
+	for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
+		memset(&wrq->load_subs[j], 0,
+				sizeof(struct load_subtractions));
+		wrq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
+				sizeof(u8), GFP_ATOMIC | GFP_NOWAIT);
+		/* No other choice */
+		BUG_ON(!wrq->top_tasks[j]);
+		clear_top_tasks_bitmap(wrq->top_tasks_bitmap[j]);
+	}
+	wrq->cum_window_demand_scaled = 0;
+	wrq->notif_pending = false;
+}
+
+void sched_window_nr_ticks_change(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&sched_ravg_window_lock, flags);
+	new_sched_ravg_window = mult_frac(sysctl_sched_ravg_window_nr_ticks,
+						NSEC_PER_SEC, HZ);
+	spin_unlock_irqrestore(&sched_ravg_window_lock, flags);
+}
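+/*
+ * Example: with HZ=250 and sysctl_sched_ravg_window_nr_ticks = 5, the new
+ * window size becomes 5 * NSEC_PER_SEC / 250 = 20ms; the switch itself is
+ * applied later in walt_irq_work(), where all rq locks are held.
+ */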
+
+static void
+walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	fixup_cumulative_runnable_avg(&wrq->walt_stats, wts->demand_scaled,
+					wts->pred_demand_scaled);
+}
+
+static void
+walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	fixup_cumulative_runnable_avg(&wrq->walt_stats,
+				      -(s64)wts->demand_scaled,
+				      -(s64)wts->pred_demand_scaled);
+}
+
+static void inc_rq_walt_stats(struct rq *rq, struct task_struct *p)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (wts->misfit)
+		wrq->walt_stats.nr_big_tasks++;
+
+	wts->rtg_high_prio = task_rtg_high_prio(p);
+	if (wts->rtg_high_prio)
+		wrq->walt_stats.nr_rtg_high_prio_tasks++;
+}
+
+static void dec_rq_walt_stats(struct rq *rq, struct task_struct *p)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (wts->misfit)
+		wrq->walt_stats.nr_big_tasks--;
+
+	if (wts->rtg_high_prio)
+		wrq->walt_stats.nr_rtg_high_prio_tasks--;
+
+	BUG_ON(wrq->walt_stats.nr_big_tasks < 0);
+}
+
+static void android_rvh_wake_up_new_task(void *unused, struct task_struct *new)
+{
+	add_new_task_to_grp(new);
+}
+
+/*
+ * The intention of this hook is to update cpu_capacity_orig as well as
+ * (*capacity); otherwise we would end up with capacity_of() > capacity_orig_of().
+ */
+static void android_rvh_update_cpu_capacity(void *unused, int cpu, unsigned long *capacity)
+{
+	unsigned long max_capacity = arch_scale_cpu_capacity(cpu);
+	unsigned long thermal_pressure = arch_scale_thermal_pressure(cpu);
+	unsigned long thermal_cap;
+
+	/*
+	 * thermal_pressure = max_capacity - curr_cap_as_per_thermal.
+	 * so,
+	 * curr_cap_as_per_thermal = max_capacity - thermal_pressure.
+	 */
+
+	thermal_cap = max_capacity - thermal_pressure;
+
+	/*
+	 * TODO:
+	 * Thermal is taken care now. but what about limits via
+	 * cpufreq max. we don't have arch_scale_max_freq_capacity()
+	 * in 5.10 now.
+	 *
+	 * Two options:
+	 * #1 either port that max_frq_cap patch to AOSP
+	 * #2 register for cpufreq policy updates..
+	 */
+	cpu_rq(cpu)->cpu_capacity_orig = min(cpu_rq(cpu)->cpu_capacity_orig,
+					     thermal_cap);
+	*capacity = cpu_rq(cpu)->cpu_capacity_orig;
+}
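+/*
+ * Example: with max_capacity = 1024 and thermal_pressure = 256, thermal_cap
+ * is 768 and cpu_capacity_orig (and hence *capacity) is clamped to at
+ * most 768.
+ */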
+
+static void android_rvh_sched_cpu_starting(void *unused, int cpu)
+{
+	unsigned long flags;
+	struct rq *rq = cpu_rq(cpu);
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	set_window_start(rq);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	clear_walt_request(cpu);
+}
+
+static void android_rvh_sched_cpu_dying(void *unused, int cpu)
+{
+	clear_walt_request(cpu);
+}
+
+static void android_rvh_set_task_cpu(void *unused, struct task_struct *p, unsigned int new_cpu)
+{
+	if (new_cpu < 0)
+		return;
+	fixup_busy_time(p, (int) new_cpu);
+}
+
+static void android_rvh_sched_fork(void *unused, struct task_struct *p)
+{
+	init_new_task_load(p);
+}
+
+static void android_rvh_new_task_stats(void *unused, struct task_struct *p)
+{
+	mark_task_starting(p);
+}
+
+static void android_rvh_account_irq(void *unused, struct task_struct *curr, int cpu, s64 delta)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	if (!!(curr->flags & PF_IDLE)) {
+		if (hardirq_count() || in_serving_softirq())
+			walt_sched_account_irqend(cpu, curr, delta);
+		else
+			walt_sched_account_irqstart(cpu, curr);
+	}
+	wrq->last_irq_window = wrq->window_start;
+}
+
+static void android_rvh_flush_task(void *unused, struct task_struct *p)
+{
+	walt_task_dead(p);
+}
+
+static void android_rvh_enqueue_task(void *unused, struct rq *rq, struct task_struct *p, int flags)
+{
+	u64 wallclock = sched_ktime_clock();
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	wts->last_enqueued_ts = wallclock;
+	sched_update_nr_prod(rq->cpu, true);
+
+	if (fair_policy(p->policy)) {
+		wts->misfit = !task_fits_max(p, rq->cpu);
+		inc_rq_walt_stats(rq, p);
+	}
+
+	walt_inc_cumulative_runnable_avg(rq, p);
+	trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_mask)[0]);
+}
+
+static void android_rvh_dequeue_task(void *unused, struct rq *rq, struct task_struct *p, int flags)
+{
+	/*
+	 * TODO: remove later.
+	 * We don't have to check whether p is the ed task and clear it; the
+	 * code below calls is_ed_task_present(), which clears the rq's ed_task
+	 * unconditionally.
+	 */
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	if (p == wrq->ed_task)
+		is_ed_task_present(rq, sched_ktime_clock());
+
+	sched_update_nr_prod(rq->cpu, false);
+
+	if (fair_policy(p->policy))
+		dec_rq_walt_stats(rq, p);
+
+	walt_dec_cumulative_runnable_avg(rq, p);
+	trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_mask)[0]);
+}
+
+static void android_rvh_update_misfit_status(void *unused, struct task_struct *p,
+		struct rq *rq, bool *need_update)
+{
+	struct walt_task_struct *wts;
+	struct walt_rq *wrq;
+	bool old_misfit, misfit;
+	int change;
+
+	*need_update = false;
+
+	if (!p) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	wrq = (struct walt_rq *) rq->android_vendor_data1;
+	wts = (struct walt_task_struct *) p->android_vendor_data1;
+	old_misfit = wts->misfit;
+
+	if (task_fits_capacity(p, capacity_orig_of(cpu_of(rq)), rq->cpu))
+		rq->misfit_task_load = 0;
+	else
+		rq->misfit_task_load = task_load(p);
+
+	misfit = rq->misfit_task_load;
+
+	change = misfit - old_misfit;
+	if (change) {
+		sched_update_nr_prod(rq->cpu, true);
+		wts->misfit = misfit;
+		wrq->walt_stats.nr_big_tasks += change;
+		BUG_ON(wrq->walt_stats.nr_big_tasks < 0);
+	}
+}
+
+/* utility function to update walt signals at wakeup */
+static void android_rvh_try_to_wake_up(void *unused, struct task_struct *p)
+{
+	struct rq *rq = cpu_rq(task_cpu(p));
+	struct rq_flags rf;
+	u64 wallclock;
+	unsigned int old_load;
+	struct walt_related_thread_group *grp = NULL;
+
+	rq_lock_irqsave(rq, &rf);
+	old_load = task_load(p);
+	wallclock = sched_ktime_clock();
+	walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+	walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+	note_task_waking(p, wallclock);
+	rq_unlock_irqrestore(rq, &rf);
+
+	rcu_read_lock();
+	grp = task_related_thread_group(p);
+	if (update_preferred_cluster(grp, p, old_load, false))
+		set_preferred_cluster(grp);
+	rcu_read_unlock();
+}
+
+static void android_rvh_try_to_wake_up_success(void *unused, struct task_struct *p)
+{
+	unsigned long flags;
+	int cpu = p->cpu;
+
+	if (!sched_predl)
+		return;
+
+	raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+	if (do_pl_notif(cpu_rq(cpu)))
+		waltgov_run_callback(cpu_rq(cpu), WALT_CPUFREQ_PL);
+	raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+}
+
+static void android_rvh_tick_entry(void *unused, struct rq *rq)
+{
+	u64 wallclock;
+	u32 old_load;
+	struct walt_related_thread_group *grp;
+
+	set_window_start(rq);
+	wallclock = sched_ktime_clock();
+
+	old_load = task_load(rq->curr);
+	walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+
+	rcu_read_lock();
+	grp = task_related_thread_group(rq->curr);
+	if (update_preferred_cluster(grp, rq->curr, old_load, true))
+		set_preferred_cluster(grp);
+	rcu_read_unlock();
+
+	if (is_ed_task_present(rq, wallclock))
+		waltgov_run_callback(rq, WALT_CPUFREQ_EARLY_DET);
+
+	/* TODO
+	 * Currently the load balancer is registered for a post-hook which
+	 * takes care of rotation and migration of misfit tasks.
+	 *
+	 * See if that can also be done here.
+	 */
+}
+
+static void android_rvh_schedule(void *unused, struct task_struct *prev,
+		struct task_struct *next, struct rq *rq)
+{
+	u64 wallclock = sched_ktime_clock();
+	struct walt_task_struct *wts = (struct walt_task_struct *) prev->android_vendor_data1;
+
+	if (likely(prev != next)) {
+		if (!prev->on_rq)
+			wts->last_sleep_ts = wallclock;
+		walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+		walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
+	} else {
+		walt_update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
+	}
+}
+
+static void android_rvh_resume_cpus(void *unused, struct cpumask *resuming_cpus, int *err)
+{
+	int i;
+	struct rq *rq;
+	unsigned long flags;
+
+	/*
+	 * Send a reschedule event to all resumed CPUs, which triggers a
+	 * newly-idle load balance on them.
+	 */
+	for_each_cpu(i, resuming_cpus) {
+		rq = cpu_rq(i);
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		resched_curr(rq);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	*err = 0;
+}
+
+static void android_rvh_update_cpus_allowed(void *unused, struct task_struct *p,
+						cpumask_var_t cpus_requested,
+						const struct cpumask *new_mask, int *ret)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (cpumask_subset(&wts->cpus_requested, cpus_requested))
+		*ret = set_cpus_allowed_ptr(p, &wts->cpus_requested);
+}
+
+static void android_rvh_sched_fork_init(void *unused, struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	wts->last_sleep_ts		= 0;
+	wts->wake_up_idle		= false;
+	wts->boost			= 0;
+	wts->boost_expires		= 0;
+	wts->boost_period		= false;
+	wts->low_latency		= false;
+}
+
+static void android_rvh_ttwu_cond(void *unused, bool *cond)
+{
+	*cond = sysctl_sched_many_wakeup_threshold < WALT_MANY_WAKEUP_DEFAULT;
+}
+
+static void android_rvh_sched_exec(void *unused, bool *cond)
+{
+	*cond = true;
+}
+
+static void android_rvh_build_perf_domains(void *unused, bool *eas_check)
+{
+	*eas_check = true;
+}
+
+static void register_walt_hooks(void)
+{
+	register_trace_android_rvh_wake_up_new_task(android_rvh_wake_up_new_task, NULL);
+	register_trace_android_rvh_update_cpu_capacity(android_rvh_update_cpu_capacity, NULL);
+	register_trace_android_rvh_sched_cpu_starting(android_rvh_sched_cpu_starting, NULL);
+	register_trace_android_rvh_sched_cpu_dying(android_rvh_sched_cpu_dying, NULL);
+	register_trace_android_rvh_set_task_cpu(android_rvh_set_task_cpu, NULL);
+	register_trace_android_rvh_new_task_stats(android_rvh_new_task_stats, NULL);
+	register_trace_android_rvh_sched_fork(android_rvh_sched_fork, NULL);
+	register_trace_android_rvh_account_irq(android_rvh_account_irq, NULL);
+	register_trace_android_rvh_flush_task(android_rvh_flush_task, NULL);
+	register_trace_android_rvh_update_misfit_status(android_rvh_update_misfit_status, NULL);
+	register_trace_android_rvh_enqueue_task(android_rvh_enqueue_task, NULL);
+	register_trace_android_rvh_dequeue_task(android_rvh_dequeue_task, NULL);
+	register_trace_android_rvh_try_to_wake_up(android_rvh_try_to_wake_up, NULL);
+	register_trace_android_rvh_try_to_wake_up_success(android_rvh_try_to_wake_up_success, NULL);
+	register_trace_android_rvh_tick_entry(android_rvh_tick_entry, NULL);
+	register_trace_android_rvh_schedule(android_rvh_schedule, NULL);
+	register_trace_android_rvh_resume_cpus(android_rvh_resume_cpus, NULL);
+	register_trace_android_vh_show_max_freq(android_vh_show_max_freq, NULL);
+	register_trace_android_rvh_cpu_cgroup_attach(android_rvh_cpu_cgroup_attach, NULL);
+	register_trace_android_rvh_update_cpus_allowed(android_rvh_update_cpus_allowed, NULL);
+	register_trace_android_rvh_sched_fork_init(android_rvh_sched_fork_init, NULL);
+	register_trace_android_rvh_ttwu_cond(android_rvh_ttwu_cond, NULL);
+	register_trace_android_rvh_sched_exec(android_rvh_sched_exec, NULL);
+	register_trace_android_rvh_build_perf_domains(android_rvh_build_perf_domains, NULL);
+}
+
+atomic64_t walt_irq_work_lastq_ws;
+
+static int walt_init_stop_handler(void *data)
+{
+	int cpu;
+	struct task_struct *g, *p;
+	u64 window_start_ns, nr_windows;
+	struct walt_rq *wrq;
+
+	read_lock(&tasklist_lock);
+	for_each_possible_cpu(cpu) {
+		raw_spin_lock(&cpu_rq(cpu)->lock);
+	}
+
+	do_each_thread(g, p) {
+		init_existing_task_load(p);
+	} while_each_thread(g, p);
+
+	window_start_ns = ktime_get_ns();
+	nr_windows = div64_u64(window_start_ns, sched_ravg_window);
+	window_start_ns = (u64)nr_windows * (u64)sched_ravg_window;
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		/* Create task members for idle thread */
+		init_new_task_load(rq->idle);
+
+		walt_sched_init_rq(rq);
+
+		wrq = (struct walt_rq *) rq->android_vendor_data1;
+		wrq->window_start = window_start_ns;
+	}
+
+	atomic64_set(&walt_irq_work_lastq_ws, window_start_ns);
+
+	register_walt_hooks();
+	walt_lb_init();
+	walt_rt_init();
+	walt_cfs_init();
+	create_default_coloc_group();
+
+	walt_update_cluster_topology();
+
+	for_each_possible_cpu(cpu) {
+		raw_spin_unlock(&cpu_rq(cpu)->lock);
+	}
+	read_unlock(&tasklist_lock);
+
+	return 0;
+}
+
+static int walt_module_init(void)
+{
+	struct ctl_table_header *hdr;
+	int i;
+
+	walt_tunables();
+
+	sched_init_ops();
+	BUG_ON(alloc_related_thread_groups());
+	walt_init_cycle_counter();
+	init_clusters();
+	stop_machine(walt_init_stop_handler, NULL, NULL);
+
+	hdr = register_sysctl_table(walt_base_table);
+	kmemleak_not_leak(hdr);
+
+	input_boost_init();
+	core_ctl_init();
+	waltgov_register();
+
+	i = match_string(sched_feat_names, __SCHED_FEAT_NR, "TTWU_QUEUE");
+	static_key_disable_cpuslocked(&sched_feat_keys[i]);
+	sysctl_sched_features &= ~(1UL << i);
+
+	return 0;
+}
+
+module_init(walt_module_init);
+MODULE_LICENSE("GPL v2");

+ 1006 - 0
kernel/sched/walt/walt.h

@@ -0,0 +1,1006 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _WALT_H
+#define _WALT_H
+
+#include "../../../kernel/sched/sched.h"
+#include "../../../fs/proc/internal.h"
+#include <linux/sched/core_ctl.h>
+#include <linux/cgroup.h>
+
+#ifdef CONFIG_HZ_300
+/*
+ * The tick interval becomes 3333333 ns due to rounding when HZ=300,
+ * so the default window is five ticks (~16.67 ms).
+ */
+#define DEFAULT_SCHED_RAVG_WINDOW (3333333 * 5)
+#else
+/* Default window size (in ns) = 16ms */
+#define DEFAULT_SCHED_RAVG_WINDOW 16000000
+#endif
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+#define NR_WINDOWS_PER_SEC (NSEC_PER_SEC / DEFAULT_SCHED_RAVG_WINDOW)
+
+#define SCHED_CPUFREQ_MIGRATION	(1U << 1)
+#define SCHED_CPUFREQ_INTERCLUSTER_MIG	(1U << 3)
+#define SCHED_CPUFREQ_WALT	(1U << 4)
+#define SCHED_CPUFREQ_PL	(1U << 5)
+#define SCHED_CPUFREQ_EARLY_DET	(1U << 6)
+#define SCHED_CPUFREQ_CONTINUE	(1U << 8)
+
+#define MAX_CLUSTERS 3
+/* MAX_MARGIN_LEVELS should be one less than MAX_CLUSTERS */
+#define MAX_MARGIN_LEVELS (MAX_CLUSTERS - 1)
+
+enum task_event {
+	PUT_PREV_TASK	= 0,
+	PICK_NEXT_TASK	= 1,
+	TASK_WAKE	= 2,
+	TASK_MIGRATE	= 3,
+	TASK_UPDATE	= 4,
+	IRQ_UPDATE	= 5,
+};
+
+/* Note: this needs to be in sync with the migrate_type_names array */
+enum migrate_types {
+	GROUP_TO_RQ,
+	RQ_TO_GROUP,
+};
+
+enum task_boost_type {
+	TASK_BOOST_NONE = 0,
+	TASK_BOOST_ON_MID,
+	TASK_BOOST_ON_MAX,
+	TASK_BOOST_STRICT_MAX,
+	TASK_BOOST_END,
+};
+
+#define WALT_NR_CPUS 8
+#define RAVG_HIST_SIZE_MAX 5
+#define NUM_BUSY_BUCKETS 10
+
+struct walt_task_struct {
+	/*
+	 * 'mark_start' marks the beginning of an event (task waking up, task
+	 * starting to execute, task being preempted) within a window
+	 *
+	 * 'sum' represents how runnable a task has been within current
+	 * window. It incorporates both running time and wait time and is
+	 * frequency scaled.
+	 *
+	 * 'sum_history' keeps track of history of 'sum' seen over previous
+	 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+	 * ignored.
+	 *
+	 * 'demand' represents maximum sum seen over previous
+	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+	 * demand for tasks.
+	 *
+	 * 'curr_window_cpu' represents task's contribution to cpu busy time on
+	 * various CPUs in the current window
+	 *
+	 * 'prev_window_cpu' represents task's contribution to cpu busy time on
+	 * various CPUs in the previous window
+	 *
+	 * 'curr_window' represents the sum of all entries in curr_window_cpu
+	 *
+	 * 'prev_window' represents the sum of all entries in prev_window_cpu
+	 *
+	 * 'pred_demand' represents task's current predicted cpu busy time
+	 *
+	 * 'busy_buckets' groups historical busy time into different buckets
+	 * used for prediction
+	 *
+	 * 'demand_scaled' represents task's demand scaled to 1024
+	 */
+	u64				mark_start;
+	u32				sum, demand;
+	u32				coloc_demand;
+	u32				sum_history[RAVG_HIST_SIZE_MAX];
+	u32				curr_window_cpu[WALT_NR_CPUS];
+	u32				prev_window_cpu[WALT_NR_CPUS];
+	u32				curr_window, prev_window;
+	u32				pred_demand;
+	u8				busy_buckets[NUM_BUSY_BUCKETS];
+	u16				demand_scaled;
+	u16				pred_demand_scaled;
+	u64				active_time;
+	u64				last_win_size;
+	int				boost;
+	bool				wake_up_idle;
+	bool				misfit;
+	bool				rtg_high_prio;
+	bool				low_latency;
+	u64				boost_period;
+	u64				boost_expires;
+	u64				last_sleep_ts;
+	u32				init_load_pct;
+	u32				unfilter;
+	u64				last_wake_ts;
+	u64				last_enqueued_ts;
+	struct walt_related_thread_group __rcu	*grp;
+	struct list_head		grp_list;
+	u64				cpu_cycles;
+	cpumask_t			cpus_requested;
+};
+
+/* End linux/sched.h port */
+/* SCHED.H PORT */
+extern __read_mostly bool sched_predl;
+
+struct walt_cpu_load {
+	unsigned long	nl;
+	unsigned long	pl;
+	bool		rtgb_active;
+	u64		ws;
+};
+
+#define DECLARE_BITMAP_ARRAY(name, nr, bits) \
+	unsigned long name[nr][BITS_TO_LONGS(bits)]
+
+struct walt_sched_stats {
+	int		nr_big_tasks;
+	u64		cumulative_runnable_avg_scaled;
+	u64		pred_demands_sum_scaled;
+	unsigned int	nr_rtg_high_prio_tasks;
+};
+
+#define NUM_TRACKED_WINDOWS 2
+#define NUM_LOAD_INDICES 1000
+
+struct group_cpu_time {
+	u64			curr_runnable_sum;
+	u64			prev_runnable_sum;
+	u64			nt_curr_runnable_sum;
+	u64			nt_prev_runnable_sum;
+};
+
+struct load_subtractions {
+	u64			window_start;
+	u64			subs;
+	u64			new_subs;
+};
+
+struct walt_rq {
+	struct task_struct	*push_task;
+	struct walt_sched_cluster *cluster;
+	struct cpumask		freq_domain_cpumask;
+	struct walt_sched_stats walt_stats;
+
+	u64			window_start;
+	u32			prev_window_size;
+	unsigned long		walt_flags;
+
+	u64			avg_irqload;
+	u64			last_irq_window;
+	u64			prev_irq_time;
+	struct task_struct	*ed_task;
+	u64			task_exec_scale;
+	u64			old_busy_time;
+	u64			old_estimated_time;
+	u64			curr_runnable_sum;
+	u64			prev_runnable_sum;
+	u64			nt_curr_runnable_sum;
+	u64			nt_prev_runnable_sum;
+	u64			cum_window_demand_scaled;
+	struct group_cpu_time	grp_time;
+	struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];
+	DECLARE_BITMAP_ARRAY(top_tasks_bitmap,
+			NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES);
+	u8			*top_tasks[NUM_TRACKED_WINDOWS];
+	u8			curr_table;
+	int			prev_top;
+	int			curr_top;
+	bool			notif_pending;
+	bool			high_irqload;
+	u64			last_cc_update;
+	u64			cycles;
+};
+
+struct walt_sched_cluster {
+	raw_spinlock_t		load_lock;
+	struct list_head	list;
+	struct cpumask		cpus;
+	int			id;
+	/*
+	 * max_possible_freq = maximum supported by hardware
+	 */
+	unsigned int		cur_freq;
+	unsigned int		max_possible_freq;
+	u64			aggr_grp_load;
+};
+
+struct walt_related_thread_group {
+	int			id;
+	raw_spinlock_t		lock;
+	struct list_head	tasks;
+	struct list_head	list;
+	bool			skip_min;
+	struct rcu_head		rcu;
+	u64			last_update;
+	u64			downmigrate_ts;
+	u64			start_ts;
+};
+
+extern struct walt_sched_cluster *sched_cluster[WALT_NR_CPUS];
+
+extern struct walt_sched_cluster *rq_cluster(struct rq *rq);
+
+/* END SCHED.H PORT */
+
+extern int num_sched_clusters;
+extern unsigned int sched_capacity_margin_up[WALT_NR_CPUS];
+extern unsigned int sched_capacity_margin_down[WALT_NR_CPUS];
+extern cpumask_t asym_cap_sibling_cpus;
+extern cpumask_t __read_mostly **cpu_array;
+
+extern void sched_update_nr_prod(int cpu, bool enq);
+extern unsigned int walt_big_tasks(int cpu);
+extern void walt_rotate_work_init(void);
+extern void walt_rotation_checkpoint(int nr_big);
+extern void walt_fill_ta_data(struct core_ctl_notif_data *data);
+extern int sched_set_group_id(struct task_struct *p, unsigned int group_id);
+extern unsigned int sched_get_group_id(struct task_struct *p);
+extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct);
+extern u32 sched_get_init_task_load(struct task_struct *p);
+extern void core_ctl_check(u64 wallclock);
+extern int sched_set_boost(int enable);
+extern int sched_pause_count(const cpumask_t *mask, bool include_offline);
+extern void sched_pause_pending(int cpu);
+extern void sched_unpause_pending(int cpu);
+extern int sched_wake_up_idle_show(struct seq_file *m, void *v);
+extern ssize_t sched_wake_up_idle_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *offset);
+extern int sched_wake_up_idle_open(struct inode *inode,	struct file *filp);
+extern int sched_init_task_load_show(struct seq_file *m, void *v);
+extern ssize_t sched_init_task_load_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *offset);
+extern int sched_init_task_load_open(struct inode *inode, struct file *filp);
+extern int sched_group_id_show(struct seq_file *m, void *v);
+extern ssize_t sched_group_id_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *offset);
+extern int sched_group_id_open(struct inode *inode, struct file *filp);
+extern int sched_pause_cpus(struct cpumask *pause_cpus);
+extern int sched_unpause_cpus(struct cpumask *unpause_cpus);
+
+extern int core_ctl_set_boost(bool boost);
+extern void core_ctl_notifier_register(struct notifier_block *n);
+extern void core_ctl_notifier_unregister(struct notifier_block *n);
+extern unsigned int sched_get_cpu_util(int cpu);
+extern void sched_update_hyst_times(void);
+extern u64 sched_lpm_disallowed_time(int cpu);
+extern int
+sched_updown_migrate_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int sched_boost_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int sched_busy_hyst_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos);
+extern u64 sched_ktime_clock(void);
+extern void clear_walt_request(int cpu);
+extern void walt_init_tg(struct task_group *tg);
+extern void walt_init_topapp_tg(struct task_group *tg);
+extern void walt_init_foreground_tg(struct task_group *tg);
+extern int register_walt_callback(void);
+extern void set_cpu_array(void);
+extern int sched_init_ops(void);
+extern int core_ctl_init(void);
+extern void acquire_rq_locks_irqsave(const cpumask_t *cpus,
+		unsigned long *flags);
+extern void release_rq_locks_irqrestore(const cpumask_t *cpus,
+		unsigned long *flags);
+extern struct list_head cluster_head;
+extern int set_sched_ravg_window(char *str);
+extern int set_sched_predl(char *str);
+extern int input_boost_init(void);
+extern int core_ctl_init(void);
+
+extern atomic64_t walt_irq_work_lastq_ws;
+extern unsigned int __read_mostly sched_ravg_window;
+extern unsigned int min_max_possible_capacity;
+extern unsigned int max_possible_capacity;
+extern unsigned int __read_mostly sched_init_task_load_windows;
+extern unsigned int __read_mostly sched_load_granule;
+
+/* 1ms default for 20ms window size scaled to 1024 */
+extern unsigned int sysctl_sched_min_task_util_for_boost;
+/* 0.68ms default for 20ms window size scaled to 1024 */
+extern unsigned int sysctl_sched_min_task_util_for_colocation;
+extern unsigned int sysctl_sched_busy_hyst_enable_cpus;
+extern unsigned int sysctl_sched_busy_hyst;
+extern unsigned int sysctl_sched_coloc_busy_hyst_enable_cpus;
+extern unsigned int sysctl_sched_coloc_busy_hyst_cpu[WALT_NR_CPUS];
+extern unsigned int sysctl_sched_coloc_busy_hyst_max_ms;
+extern unsigned int sysctl_sched_coloc_busy_hyst_cpu_busy_pct[WALT_NR_CPUS];
+extern unsigned int sysctl_sched_boost; /* To/from userspace */
+extern unsigned int sysctl_sched_capacity_margin_up[MAX_MARGIN_LEVELS];
+extern unsigned int sysctl_sched_capacity_margin_down[MAX_MARGIN_LEVELS];
+extern unsigned int sched_boost_type; /* currently activated sched boost */
+extern enum sched_boost_policy boost_policy;
+extern unsigned int sysctl_input_boost_ms;
+extern unsigned int sysctl_input_boost_freq[8];
+extern unsigned int sysctl_sched_boost_on_input;
+extern unsigned int sysctl_sched_load_boost[WALT_NR_CPUS];
+extern unsigned int sysctl_sched_user_hint;
+extern unsigned int sysctl_sched_conservative_pl;
+#define WALT_MANY_WAKEUP_DEFAULT 1000
+extern unsigned int sysctl_sched_many_wakeup_threshold;
+extern unsigned int sysctl_walt_rtg_cfs_boost_prio;
+extern __read_mostly unsigned int sysctl_sched_force_lb_enable;
+extern const int sched_user_hint_max;
+extern unsigned int sysctl_sched_prefer_spread;
+
+#define for_each_sched_cluster(cluster) \
+	list_for_each_entry_rcu(cluster, &cluster_head, list)
+
+static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period)
+{
+	return div64_u64(cycles, period);
+}
+
+static inline unsigned int sched_cpu_legacy_freq(int cpu)
+{
+	unsigned long curr_cap = arch_scale_freq_capacity(cpu);
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return (curr_cap * (u64) wrq->cluster->max_possible_freq) >>
+		SCHED_CAPACITY_SHIFT;
+}
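+
+/*
+ * Worked example (illustrative numbers only): with arch_scale_freq_capacity()
+ * reporting 512 (the CPU running at half speed on the 1024 scale) and a
+ * cluster max_possible_freq of 2803200 kHz, the legacy frequency evaluates to
+ * (512 * 2803200) >> 10 = 1401600 kHz.
+ */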
+
+extern __read_mostly bool sched_freq_aggr_en;
+static inline void walt_enable_frequency_aggregation(bool enable)
+{
+	sched_freq_aggr_en = enable;
+}
+
+#ifndef CONFIG_IRQ_TIME_ACCOUNTING
+static inline u64 irq_time_read(int cpu) { return 0; }
+#endif
+
+/* Sysctl related interface */
+#define WINDOW_STATS_RECENT		0
+#define WINDOW_STATS_MAX		1
+#define WINDOW_STATS_MAX_RECENT_AVG	2
+#define WINDOW_STATS_AVG		3
+#define WINDOW_STATS_INVALID_POLICY	4
+
+extern unsigned int __read_mostly sysctl_sched_coloc_downmigrate_ns;
+extern unsigned int __read_mostly sysctl_sched_group_downmigrate_pct;
+extern unsigned int __read_mostly sysctl_sched_group_upmigrate_pct;
+extern unsigned int __read_mostly sysctl_sched_window_stats_policy;
+extern unsigned int sysctl_sched_ravg_window_nr_ticks;
+extern unsigned int sysctl_sched_dynamic_ravg_window_enable;
+extern unsigned int sysctl_sched_walt_rotate_big_tasks;
+extern unsigned int sysctl_sched_task_unfilter_period;
+extern unsigned int __read_mostly sysctl_sched_asym_cap_sibling_freq_match_pct;
+extern unsigned int sysctl_walt_low_latency_task_threshold; /* disabled by default */
+extern unsigned int sysctl_task_read_pid;
+extern struct ctl_table walt_table[];
+extern struct ctl_table walt_base_table[];
+extern void walt_tunables(void);
+extern void walt_update_group_thresholds(void);
+extern void sched_window_nr_ticks_change(void);
+extern unsigned long sched_user_hint_reset_time;
+extern struct irq_work walt_migration_irq_work;
+extern __read_mostly unsigned int new_sched_ravg_window;
+extern struct task_group *task_group_topapp;
+extern struct task_group *task_group_foreground;
+
+#define LIB_PATH_LENGTH 512
+extern unsigned int cpuinfo_max_freq_cached;
+extern char sched_lib_name[LIB_PATH_LENGTH];
+extern unsigned int sched_lib_mask_force;
+extern bool is_sched_lib_based_app(pid_t pid);
+void android_vh_show_max_freq(void *unused, struct cpufreq_policy *policy,
+				unsigned int *max_freq);
+
+/* WALT cpufreq interface */
+#define WALT_CPUFREQ_ROLLOVER		(1U << 0)
+#define WALT_CPUFREQ_CONTINUE		(1U << 1)
+#define WALT_CPUFREQ_IC_MIGRATION	(1U << 2)
+#define WALT_CPUFREQ_PL			(1U << 3)
+#define WALT_CPUFREQ_EARLY_DET		(1U << 4)
+
+#define NO_BOOST 0
+#define FULL_THROTTLE_BOOST 1
+#define CONSERVATIVE_BOOST 2
+#define RESTRAINED_BOOST 3
+#define FULL_THROTTLE_BOOST_DISABLE -1
+#define CONSERVATIVE_BOOST_DISABLE -2
+#define RESTRAINED_BOOST_DISABLE -3
+#define MAX_NUM_BOOST_TYPE (RESTRAINED_BOOST+1)
+
+enum sched_boost_policy {
+	SCHED_BOOST_NONE,
+	SCHED_BOOST_ON_BIG,
+	SCHED_BOOST_ON_ALL,
+};
+
+struct walt_task_group {
+	/*
+	 * Controls whether tasks of this cgroup should be colocated with each
+	 * other and tasks of other cgroups that have the same flag turned on.
+	 */
+	bool colocate;
+	/*
+	 * array indicating whether this task group participates in the
+	 * particular boost type
+	 */
+	bool sched_boost_enable[MAX_NUM_BOOST_TYPE];
+};
+
+struct sched_avg_stats {
+	int nr;
+	int nr_misfit;
+	int nr_max;
+	int nr_scaled;
+};
+
+struct waltgov_callback {
+	void (*func)(struct waltgov_callback *cb, u64 time, unsigned int flags);
+};
+
+DECLARE_PER_CPU(struct waltgov_callback *, waltgov_cb_data);
+
+static inline void waltgov_add_callback(int cpu, struct waltgov_callback *cb,
+			void (*func)(struct waltgov_callback *cb, u64 time,
+			unsigned int flags))
+{
+	if (WARN_ON(!cb || !func))
+		return;
+
+	if (WARN_ON(per_cpu(waltgov_cb_data, cpu)))
+		return;
+
+	cb->func = func;
+	rcu_assign_pointer(per_cpu(waltgov_cb_data, cpu), cb);
+}
+
+static inline void waltgov_remove_callback(int cpu)
+{
+	rcu_assign_pointer(per_cpu(waltgov_cb_data, cpu), NULL);
+}
+
+static inline void waltgov_run_callback(struct rq *rq, unsigned int flags)
+{
+	struct waltgov_callback *cb;
+
+	cb = rcu_dereference_sched(*per_cpu_ptr(&waltgov_cb_data, cpu_of(rq)));
+	if (cb)
+		cb->func(cb, sched_ktime_clock(), flags);
+}
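+
+/*
+ * Illustrative usage sketch (my_gov_update and my_cb are hypothetical names,
+ * not provided by this header): a WALT-aware cpufreq governor would install
+ * one callback per CPU at policy init and tear it down at exit:
+ *
+ *	static void my_gov_update(struct waltgov_callback *cb, u64 time,
+ *				  unsigned int flags)
+ *	{
+ *		... re-evaluate the frequency of cb's CPU based on flags ...
+ *	}
+ *
+ *	waltgov_add_callback(cpu, &my_cb, my_gov_update);
+ *	...
+ *	waltgov_remove_callback(cpu);
+ *	synchronize_rcu();	(before freeing my_cb)
+ *
+ * waltgov_run_callback() then invokes the hook with the WALT_CPUFREQ_* flags
+ * defined above, e.g. on window rollover or inter-cluster migration.
+ */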
+
+extern unsigned long cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load);
+int waltgov_register(void);
+
+extern void walt_lb_init(void);
+extern unsigned int walt_rotation_enabled;
+
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+static inline unsigned long capacity_curr_of(int cpu)
+{
+	unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
+	unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+
+	return cap_scale(max_cap, scale_freq);
+}
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->demand_scaled;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+	u64 walt_cpu_util = wrq->walt_stats.cumulative_runnable_avg_scaled;
+
+	return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu));
+}
+
+static inline unsigned long cpu_util_cum(int cpu, int delta)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+	u64 util = wrq->cum_window_demand_scaled;
+	unsigned long capacity = capacity_orig_of(cpu);
+
+	delta += util;
+	if (delta < 0)
+		return 0;
+
+	return (delta >= capacity) ? capacity : delta;
+}
+
+extern unsigned int capacity_margin_freq;
+
+static inline unsigned long
+add_capacity_margin(unsigned long cpu_capacity, int cpu)
+{
+	cpu_capacity = cpu_capacity * capacity_margin_freq *
+			(100 + sysctl_sched_load_boost[cpu]);
+	cpu_capacity /= 100;
+	cpu_capacity /= SCHED_CAPACITY_SCALE;
+	return cpu_capacity;
+}
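+
+/*
+ * Worked example (illustrative numbers only): with capacity_margin_freq at
+ * its default of 1280 and sysctl_sched_load_boost[cpu] == 0, a capacity of
+ * 800 becomes 800 * 1280 * 100 / 100 / 1024 = 1000, i.e. roughly 25% is
+ * added on top of the raw capacity (the inverse view of the ~20% margin
+ * noted where capacity_margin_freq is defined).
+ */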
+
+static inline enum sched_boost_policy sched_boost_policy(void)
+{
+	return boost_policy;
+}
+
+static inline int sched_boost(void)
+{
+	return sched_boost_type;
+}
+
+static inline bool rt_boost_on_big(void)
+{
+	return sched_boost() == FULL_THROTTLE_BOOST ?
+			(sched_boost_policy() == SCHED_BOOST_ON_BIG) : false;
+}
+
+static inline bool is_full_throttle_boost(void)
+{
+	return sched_boost() == FULL_THROTTLE_BOOST;
+}
+
+static inline bool task_sched_boost(struct task_struct *p)
+{
+	struct cgroup_subsys_state *css;
+	struct task_group *tg;
+	bool sched_boost_enabled;
+	struct walt_task_group *wtg;
+
+	/* optimization for FT boost, skip looking at tg */
+	if (sched_boost() == FULL_THROTTLE_BOOST)
+		return true;
+
+	rcu_read_lock();
+	css = task_css(p, cpu_cgrp_id);
+	if (!css) {
+		rcu_read_unlock();
+		return false;
+	}
+	tg = container_of(css, struct task_group, css);
+	wtg = (struct walt_task_group *) tg->android_vendor_data1;
+	sched_boost_enabled = wtg->sched_boost_enable[sched_boost()];
+	rcu_read_unlock();
+
+	return sched_boost_enabled;
+}
+
+static inline bool task_placement_boost_enabled(struct task_struct *p)
+{
+	if (likely(sched_boost_policy() == SCHED_BOOST_NONE))
+		return false;
+
+	return task_sched_boost(p);
+}
+
+static inline enum sched_boost_policy task_boost_policy(struct task_struct *p)
+{
+	enum sched_boost_policy policy;
+
+	if (likely(sched_boost_policy() == SCHED_BOOST_NONE))
+		return SCHED_BOOST_NONE;
+
+	policy = task_sched_boost(p) ? sched_boost_policy() : SCHED_BOOST_NONE;
+	if (policy == SCHED_BOOST_ON_BIG) {
+		/*
+		 * Filter out tasks less than min task util threshold
+		 * under conservative boost.
+		 */
+		if (sched_boost() == CONSERVATIVE_BOOST &&
+			task_util(p) <= sysctl_sched_min_task_util_for_boost)
+			policy = SCHED_BOOST_NONE;
+	}
+
+	return policy;
+}
+
+static inline unsigned long capacity_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline bool __cpu_overutilized(int cpu, int delta)
+{
+	return (capacity_orig_of(cpu) * 1024) <
+		((cpu_util(cpu) + delta) * sched_capacity_margin_up[cpu]);
+}
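+
+/*
+ * Worked example (illustrative numbers only): with capacity_orig_of(cpu) of
+ * 1024 and sched_capacity_margin_up[cpu] at its default of 1078, the CPU is
+ * treated as overutilized once cpu_util(cpu) + delta exceeds
+ * 1024 * 1024 / 1078 ~= 972, i.e. roughly 95% of its capacity.
+ */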
+
+static inline bool cpu_overutilized(int cpu)
+{
+	return __cpu_overutilized(cpu, 0);
+}
+
+static inline int asym_cap_siblings(int cpu1, int cpu2)
+{
+	return (cpumask_test_cpu(cpu1, &asym_cap_sibling_cpus) &&
+		cpumask_test_cpu(cpu2, &asym_cap_sibling_cpus));
+}
+
+static inline bool asym_cap_sibling_group_has_capacity(int dst_cpu, int margin)
+{
+	int sib1, sib2;
+	int nr_running;
+	unsigned long total_util, total_capacity;
+
+	if (cpumask_empty(&asym_cap_sibling_cpus) ||
+			cpumask_test_cpu(dst_cpu, &asym_cap_sibling_cpus))
+		return false;
+
+	sib1 = cpumask_first(&asym_cap_sibling_cpus);
+	sib2 = cpumask_last(&asym_cap_sibling_cpus);
+
+	if (!cpu_active(sib1) || !cpu_active(sib2))
+		return false;
+
+	nr_running = cpu_rq(sib1)->cfs.h_nr_running +
+			cpu_rq(sib2)->cfs.h_nr_running;
+
+	if (nr_running <= 2)
+		return true;
+
+	total_capacity = capacity_of(sib1) + capacity_of(sib2);
+	total_util = cpu_util(sib1) + cpu_util(sib2);
+
+	return ((total_capacity * 100) > (total_util * margin));
+}
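+
+/*
+ * Worked example (illustrative numbers only): for siblings with capacity_of()
+ * 500 each and cpu_util() of 300 and 350, a margin of 125 (25% headroom)
+ * reports capacity since 1000 * 100 = 100000 exceeds 650 * 125 = 81250.
+ * The load check only decides once the pair has more than two runnable CFS
+ * tasks; below that the nr_running fast path returns true.
+ */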
+
+/* Is frequency of two cpus synchronized with each other? */
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(src_cpu)->android_vendor_data1;
+
+	if (src_cpu == dst_cpu)
+		return 1;
+
+	if (asym_cap_siblings(src_cpu, dst_cpu))
+		return 1;
+
+	return cpumask_test_cpu(dst_cpu, &wrq->freq_domain_cpumask);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->demand_scaled;
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+	return clamp(task_util_est(p),
+		     uclamp_eff_value(p, UCLAMP_MIN),
+		     uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+	return task_util_est(p);
+}
+#endif
+
+static inline int per_task_boost(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (wts->boost_period) {
+		if (sched_clock() > wts->boost_expires) {
+			wts->boost_period = 0;
+			wts->boost_expires = 0;
+			wts->boost = 0;
+		}
+	}
+	return wts->boost;
+}
+
+static inline int cluster_first_cpu(struct walt_sched_cluster *cluster)
+{
+	return cpumask_first(&cluster->cpus);
+}
+
+static inline bool hmp_capable(void)
+{
+	return max_possible_capacity != min_max_possible_capacity;
+}
+
+static inline bool is_max_capacity_cpu(int cpu)
+{
+	return arch_scale_cpu_capacity(cpu) == max_possible_capacity;
+}
+
+static inline bool is_min_capacity_cpu(int cpu)
+{
+	return arch_scale_cpu_capacity(cpu) == min_max_possible_capacity;
+}
+
+static inline bool is_min_capacity_cluster(struct walt_sched_cluster *cluster)
+{
+	return is_min_capacity_cpu(cluster_first_cpu(cluster));
+}
+
+/*
+ * This is only for tracepoints to print the avg irq load. For
+ * task placement considerations, use sched_cpu_high_irqload().
+ */
+#define SCHED_HIGH_IRQ_TIMEOUT 3
+static inline u64 sched_irqload(int cpu)
+{
+	s64 delta;
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	delta = wrq->window_start - wrq->last_irq_window;
+	if (delta < SCHED_HIGH_IRQ_TIMEOUT)
+		return wrq->avg_irqload;
+	else
+		return 0;
+}
+
+static inline int sched_cpu_high_irqload(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return wrq->high_irqload;
+}
+
+static inline u64
+scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
+{
+	return div64_u64(load * (u64)src_freq, (u64)dst_freq);
+}
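+
+/*
+ * Worked example (illustrative numbers only): 10ms of load observed while the
+ * CPU ran at 2000000 kHz scales to 10 * 2000000 / 1000000 = 20ms when
+ * expressed at a destination frequency of 1000000 kHz.
+ */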
+
+static inline unsigned int max_task_load(void)
+{
+	return sched_ravg_window;
+}
+
+static inline int same_cluster(int src_cpu, int dst_cpu)
+{
+	struct walt_rq *src_wrq = (struct walt_rq *) cpu_rq(src_cpu)->android_vendor_data1;
+	struct walt_rq *dest_wrq = (struct walt_rq *) cpu_rq(dst_cpu)->android_vendor_data1;
+
+	return src_wrq->cluster == dest_wrq->cluster;
+}
+
+static inline bool is_suh_max(void)
+{
+	return sysctl_sched_user_hint == sched_user_hint_max;
+}
+
+#define DEFAULT_CGROUP_COLOC_ID 1
+static inline bool walt_should_kick_upmigrate(struct task_struct *p, int cpu)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	struct walt_related_thread_group *rtg = wts->grp;
+
+	if (is_suh_max() && rtg && rtg->id == DEFAULT_CGROUP_COLOC_ID &&
+			    rtg->skip_min && wts->unfilter)
+		return is_min_capacity_cpu(cpu);
+
+	return false;
+}
+
+extern bool is_rtgb_active(void);
+extern u64 get_rtgb_active_time(void);
+
+static inline unsigned int walt_nr_rtg_high_prio(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return wrq->walt_stats.nr_rtg_high_prio_tasks;
+}
+
+static inline bool task_fits_capacity(struct task_struct *p,
+					long capacity,
+					int cpu)
+{
+	unsigned int margin;
+
+	/*
+	 * Derive upmigration/downmigrate margin wrt the src/dest CPU.
+	 */
+	if (capacity_orig_of(task_cpu(p)) > capacity_orig_of(cpu))
+		margin = sched_capacity_margin_down[cpu];
+	else
+		margin = sched_capacity_margin_up[task_cpu(p)];
+
+	return capacity * 1024 > uclamp_task_util(p) * margin;
+}
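+
+/*
+ * Worked example (illustrative numbers only): a task with uclamp_task_util()
+ * of 300 sitting on a smaller CPU is evaluated against a CPU of capacity 500.
+ * The upmigrate margin of the source CPU applies (default 1078), so the task
+ * fits because 500 * 1024 = 512000 exceeds 300 * 1078 = 323400.
+ */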
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+	unsigned long capacity = capacity_orig_of(cpu);
+	unsigned long max_capacity = max_possible_capacity;
+	unsigned long task_boost = per_task_boost(p);
+
+	if (capacity == max_capacity)
+		return true;
+
+	if (is_min_capacity_cpu(cpu)) {
+		if (task_boost_policy(p) == SCHED_BOOST_ON_BIG ||
+				task_boost > 0 ||
+				uclamp_boosted(p) ||
+				walt_should_kick_upmigrate(p, cpu))
+			return false;
+	} else { /* mid cap cpu */
+		if (task_boost > TASK_BOOST_ON_MID)
+			return false;
+	}
+
+	return task_fits_capacity(p, capacity, cpu);
+}
+
+/* Apply the utilization threshold to all types of low-latency tasks. */
+static inline bool walt_low_latency_task(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->low_latency &&
+		(task_util(p) < sysctl_walt_low_latency_task_threshold);
+}
+
+static inline unsigned int walt_get_idle_exit_latency(struct rq *rq)
+{
+	struct cpuidle_state *idle = idle_get_state(rq);
+
+	if (idle)
+		return idle->exit_latency;
+
+	return UINT_MAX;
+}
+
+extern void sched_get_nr_running_avg(struct sched_avg_stats *stats);
+extern void sched_update_hyst_times(void);
+
+extern enum sched_boost_policy sched_boost_policy(void);
+extern void walt_rt_init(void);
+extern void walt_cfs_init(void);
+extern int walt_find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
+					int sync, int sibling_count_hint);
+
+static inline unsigned int cpu_max_possible_freq(int cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+
+	return wrq->cluster->max_possible_freq;
+}
+
+static inline unsigned int cpu_max_freq(int cpu)
+{
+	return mult_frac(cpu_max_possible_freq(cpu), capacity_orig_of(cpu),
+			 arch_scale_cpu_capacity(cpu));
+}
+
+static inline unsigned int task_load(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->demand;
+}
+
+static inline unsigned int task_pl(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->pred_demand;
+}
+
+static inline bool task_in_related_thread_group(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return (rcu_access_pointer(wts->grp) != NULL);
+}
+
+static inline bool task_rtg_high_prio(struct task_struct *p)
+{
+	return task_in_related_thread_group(p) &&
+		(p->prio <= sysctl_walt_rtg_cfs_boost_prio);
+}
+
+static inline struct walt_related_thread_group
+*task_related_thread_group(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return rcu_dereference(wts->grp);
+}
+
+#define CPU_RESERVED 1
+static inline int is_reserved(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	return test_bit(CPU_RESERVED, &wrq->walt_flags);
+}
+
+static inline int mark_reserved(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	return test_and_set_bit(CPU_RESERVED, &wrq->walt_flags);
+}
+
+static inline void clear_reserved(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	clear_bit(CPU_RESERVED, &wrq->walt_flags);
+}
+
+static inline bool
+task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return cpu_of(rq) == task_cpu(p) && (p->on_rq ||
+		wts->last_sleep_ts >= wrq->window_start);
+}
+
+static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta)
+{
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+
+	wrq->cum_window_demand_scaled += scaled_delta;
+	if (unlikely((s64)wrq->cum_window_demand_scaled < 0))
+		wrq->cum_window_demand_scaled = 0;
+}
+
+static inline void walt_irq_work_queue(struct irq_work *work)
+{
+	if (likely(cpu_online(raw_smp_processor_id())))
+		irq_work_queue(work);
+	else
+		irq_work_queue_on(work, cpumask_any(cpu_online_mask));
+}
+
+#define PF_WAKE_UP_IDLE	1
+static inline u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return wts->wake_up_idle;
+}
+
+static inline int sched_set_wake_up_idle(struct task_struct *p,
+						int wake_up_idle)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	wts->wake_up_idle = !!wake_up_idle;
+	return 0;
+}
+
+static inline void set_wake_up_idle(bool enabled)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1;
+
+	wts->wake_up_idle = enabled;
+}
+
+extern int set_task_boost(int boost, u64 period);
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+#endif /* _WALT_H */

+ 785 - 0
kernel/sched/walt/walt_cfs.c

@@ -0,0 +1,785 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <trace/hooks/sched.h>
+#include <trace/hooks/binder.h>
+
+#include "walt.h"
+#include "trace.h"
+#include "../../../drivers/android/binder_trace.h"
+
+/* Migration margins */
+unsigned int sched_capacity_margin_up[WALT_NR_CPUS] = {
+			[0 ... WALT_NR_CPUS-1] = 1078 /* ~5% margin */
+};
+unsigned int sched_capacity_margin_down[WALT_NR_CPUS] = {
+			[0 ... WALT_NR_CPUS-1] = 1205 /* ~15% margin */
+};
+
+__read_mostly unsigned int sysctl_sched_prefer_spread;
+unsigned int sysctl_walt_rtg_cfs_boost_prio = 99; /* disabled by default */
+unsigned int sched_small_task_threshold = 102;
+__read_mostly unsigned int sysctl_sched_force_lb_enable = 1;
+unsigned int capacity_margin_freq = 1280; /* ~20% margin */
+
+static inline bool prefer_spread_on_idle(int cpu, bool new_ilb)
+{
+	switch (sysctl_sched_prefer_spread) {
+	case 1:
+		return is_min_capacity_cpu(cpu);
+	case 2:
+		return true;
+	case 3:
+		return (new_ilb && is_min_capacity_cpu(cpu));
+	case 4:
+		return new_ilb;
+	default:
+		return false;
+	}
+}
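+
+/*
+ * sysctl_sched_prefer_spread values, as encoded above: 0 disables the spread
+ * preference, 1 prefers spreading on min-capacity CPUs only, 2 on all CPUs,
+ * and 3/4 mirror 1/2 but only for newly-idle balance (new_ilb).
+ */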
+
+static inline bool
+bias_to_this_cpu(struct task_struct *p, int cpu, int start_cpu)
+{
+	bool base_test = cpumask_test_cpu(cpu, &p->cpus_mask) &&
+						cpu_active(cpu);
+	bool start_cap_test = (capacity_orig_of(cpu) >=
+					capacity_orig_of(start_cpu));
+
+	return base_test && start_cap_test;
+}
+
+static inline bool task_demand_fits(struct task_struct *p, int cpu)
+{
+	unsigned long capacity = capacity_orig_of(cpu);
+	unsigned long max_capacity = max_possible_capacity;
+
+	if (capacity == max_capacity)
+		return true;
+
+	return task_fits_capacity(p, capacity, cpu);
+}
+
+struct find_best_target_env {
+	bool	is_rtg;
+	int	need_idle;
+	bool	boosted;
+	int	fastpath;
+	int	start_cpu;
+	int	order_index;
+	int	end_index;
+	bool	strict_max;
+	int	skip_cpu;
+};
+
+/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU whose utilization is requested
+ * @p: the task whose utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+	unsigned int util;
+
+	/*
+	 * WALT does not decay idle tasks in the same manner
+	 * as PELT, so it makes little sense to subtract task
+	 * utilization from cpu utilization. Instead just use
+	 * cpu_util for this case.
+	 */
+	if (likely(p->state == TASK_WAKING))
+		return cpu_util(cpu);
+
+	/* Task has no contribution or is new */
+	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+		return cpu_util(cpu);
+
+	util = max_t(long, cpu_util(cpu) - task_util(p), 0);
+
+	/*
+	 * Utilization (estimated) can exceed the CPU capacity, thus let's
+	 * clamp to the maximum CPU capacity to ensure consistency with
+	 * the cpu_util call.
+	 */
+	return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
+
+static inline bool walt_get_rtg_status(struct task_struct *p)
+{
+	struct walt_related_thread_group *grp;
+	bool ret = false;
+
+	rcu_read_lock();
+
+	grp = task_related_thread_group(p);
+	if (grp)
+		ret = grp->skip_min;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline bool walt_task_skip_min_cpu(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	return sched_boost() != CONSERVATIVE_BOOST &&
+		walt_get_rtg_status(p) && wts->unfilter;
+}
+
+static inline bool walt_is_many_wakeup(int sibling_count_hint)
+{
+	return sibling_count_hint >= sysctl_sched_many_wakeup_threshold;
+}
+
+static inline bool walt_target_ok(int target_cpu, int order_index)
+{
+	return !((order_index != num_sched_clusters - 1) &&
+		 (cpumask_weight(&cpu_array[order_index][0]) == 1) &&
+		 (target_cpu == cpumask_first(&cpu_array[order_index][0])));
+}
+
+static void walt_get_indicies(struct task_struct *p, int *order_index,
+		int *end_index, int task_boost, bool boosted)
+{
+	int i = 0;
+
+	*order_index = 0;
+	*end_index = 0;
+
+	if (num_sched_clusters <= 1)
+		return;
+
+	if (task_boost > TASK_BOOST_ON_MID) {
+		*order_index = num_sched_clusters - 1;
+		return;
+	}
+
+	if (is_full_throttle_boost()) {
+		*order_index = num_sched_clusters - 1;
+		if ((*order_index > 1) && task_demand_fits(p,
+			cpumask_first(&cpu_array[*order_index][1])))
+			*end_index = 1;
+		return;
+	}
+
+	if (boosted || task_boost_policy(p) == SCHED_BOOST_ON_BIG ||
+		walt_task_skip_min_cpu(p))
+		*order_index = 1;
+
+	for (i = *order_index ; i < num_sched_clusters - 1; i++) {
+		if (task_demand_fits(p, cpumask_first(&cpu_array[i][0])))
+			break;
+	}
+
+	*order_index = i;
+}
+
+enum fastpaths {
+	NONE = 0,
+	SYNC_WAKEUP,
+	PREV_CPU_FASTPATH,
+};
+
+static void walt_find_best_target(struct sched_domain *sd,
+					cpumask_t *candidates,
+					struct task_struct *p,
+					struct find_best_target_env *fbt_env)
+{
+	unsigned long min_util = uclamp_task_util(p);
+	long target_max_spare_cap = 0;
+	unsigned long best_idle_cuml_util = ULONG_MAX;
+	unsigned int min_exit_latency = UINT_MAX;
+	int best_idle_cpu = -1;
+	int target_cpu = -1;
+	int i, start_cpu;
+	long spare_wake_cap, most_spare_wake_cap = 0;
+	int most_spare_cap_cpu = -1;
+	int prev_cpu = task_cpu(p);
+	int active_candidate = -1;
+	int order_index = fbt_env->order_index, end_index = fbt_env->end_index;
+	int cluster;
+	unsigned int target_nr_rtg_high_prio = UINT_MAX;
+	bool rtg_high_prio_task = task_rtg_high_prio(p);
+	cpumask_t visit_cpus;
+	bool io_task_pack = (order_index > 0 && p->in_iowait);
+	struct cfs_rq *cfs_rq;
+
+	/* Find start CPU based on boost value */
+	start_cpu = fbt_env->start_cpu;
+
+	if (fbt_env->strict_max || io_task_pack)
+		target_max_spare_cap = LONG_MIN;
+
+	if (p->state == TASK_RUNNING)
+		most_spare_wake_cap = ULONG_MAX;
+
+	/* fast path for prev_cpu */
+	if (((capacity_orig_of(prev_cpu) == capacity_orig_of(start_cpu)) ||
+				asym_cap_siblings(prev_cpu, start_cpu)) &&
+				cpu_active(prev_cpu) && cpu_online(prev_cpu) &&
+				available_idle_cpu(prev_cpu)) {
+		target_cpu = prev_cpu;
+		fbt_env->fastpath = PREV_CPU_FASTPATH;
+		cpumask_set_cpu(target_cpu, candidates);
+		goto out;
+	}
+
+	for (cluster = 0; cluster < num_sched_clusters; cluster++) {
+		cpumask_and(&visit_cpus, &p->cpus_mask,
+				&cpu_array[order_index][cluster]);
+		for_each_cpu(i, &visit_cpus) {
+			unsigned long capacity_orig = capacity_orig_of(i);
+			unsigned long wake_util, new_util, new_util_cuml;
+			long spare_cap;
+			unsigned int idle_exit_latency = UINT_MAX;
+
+			trace_sched_cpu_util(i);
+
+			if (!cpu_active(i))
+				continue;
+
+			if (active_candidate == -1)
+				active_candidate = i;
+
+			/*
+			 * This CPU is the target of an active migration that's
+			 * yet to complete. Avoid placing another task on it.
+			 */
+			if (is_reserved(i))
+				continue;
+
+			if (sched_cpu_high_irqload(i))
+				continue;
+
+			if (fbt_env->skip_cpu == i)
+				continue;
+
+			/*
+			 * p's blocked utilization is still accounted for on prev_cpu
+			 * so prev_cpu will receive a negative bias due to the double
+			 * accounting. However, the blocked utilization may be zero.
+			 */
+			wake_util = cpu_util_without(i, p);
+			new_util = wake_util + uclamp_task_util(p);
+			spare_wake_cap = capacity_orig - wake_util;
+
+			if (spare_wake_cap > most_spare_wake_cap) {
+				most_spare_wake_cap = spare_wake_cap;
+				most_spare_cap_cpu = i;
+			}
+
+			if ((per_task_boost(cpu_rq(i)->curr) ==
+					TASK_BOOST_STRICT_MAX) &&
+					!fbt_env->strict_max)
+				continue;
+
+			/* get rq's utilization with this task included */
+			cfs_rq = &cpu_rq(i)->cfs;
+			new_util_cuml = READ_ONCE(cfs_rq->avg.util_avg) + min_util;
+
+			/*
+			 * Ensure minimum capacity to grant the required boost.
+			 * The target CPU can be already at a capacity level higher
+			 * than the one required to boost the task.
+			 */
+			new_util = max(min_util, new_util);
+			if (!(fbt_env->strict_max || io_task_pack) &&
+					new_util > capacity_orig)
+				continue;
+
+			/*
+			 * Pre-compute the maximum possible capacity we expect
+			 * to have available on this CPU once the task is
+			 * enqueued here.
+			 */
+			spare_cap = capacity_orig - new_util;
+
+			/*
+			 * Find an optimal backup IDLE CPU for tasks that are
+			 * not latency sensitive.
+			 *
+			 * Looking for:
+			 * - favoring the shallowest idle states,
+			 *   i.e. avoid waking up deep-idle CPUs
+			 *
+			 * This code path is used by non-latency-sensitive
+			 * tasks when IDLE CPUs are available. If at least one
+			 * such CPU is available, it sets best_idle_cpu to the
+			 * most suitable idle CPU to be selected.
+			 *
+			 * If idle CPUs are available, favour these CPUs to
+			 * improve performance by spreading tasks. The energy
+			 * comparison done by the caller then takes care of
+			 * minimizing energy consumption without affecting
+			 * performance.
+			 */
+			if (available_idle_cpu(i)) {
+				idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(i));
+
+				/*
+				 * Prefer shallowest over deeper idle state cpu,
+				 * of same capacity cpus.
+				 */
+				if (idle_exit_latency > min_exit_latency)
+					continue;
+				if (min_exit_latency == idle_exit_latency &&
+					(best_idle_cpu == prev_cpu ||
+					(i != prev_cpu &&
+					new_util_cuml > best_idle_cuml_util)))
+					continue;
+
+				min_exit_latency = idle_exit_latency;
+				best_idle_cuml_util = new_util_cuml;
+				best_idle_cpu = i;
+				continue;
+			}
+
+			/*
+			 * Consider only idle CPUs for active migration.
+			 */
+			if (p->state == TASK_RUNNING)
+				continue;
+
+			/*
+			 * Try to spread the rtg high prio tasks so that they
+			 * don't preempt each other. This is an optimistic
+			 * check assuming rtg high prio can actually preempt
+			 * the current running task with the given vruntime
+			 * boost.
+			 */
+			if (rtg_high_prio_task) {
+				if (walt_nr_rtg_high_prio(i) > target_nr_rtg_high_prio)
+					continue;
+
+				/* Favor CPUs with maximum spare capacity */
+				if (walt_nr_rtg_high_prio(i) == target_nr_rtg_high_prio &&
+						spare_cap < target_max_spare_cap)
+					continue;
+			} else {
+				/* Favor CPUs with maximum spare capacity */
+				if (spare_cap < target_max_spare_cap)
+					continue;
+			}
+
+			target_max_spare_cap = spare_cap;
+			target_nr_rtg_high_prio = walt_nr_rtg_high_prio(i);
+			target_cpu = i;
+		}
+
+		if (best_idle_cpu != -1)
+			break;
+
+		if ((cluster >= end_index) && (target_cpu != -1) &&
+			walt_target_ok(target_cpu, order_index))
+			break;
+	}
+
+	if (best_idle_cpu != -1)
+		target_cpu = -1;
+	/*
+	 * We set both idle and target as long as they are valid CPUs.
+	 * If we don't find either, then we fall back to most_spare_cap.
+	 * If we don't find most spare cap, we fall back to prev_cpu,
+	 * provided that prev_cpu is active.
+	 * If prev_cpu is not active, we fall back to active_candidate.
+	 */
+
+	if (unlikely(target_cpu == -1)) {
+		if (best_idle_cpu != -1)
+			target_cpu = best_idle_cpu;
+		else if (most_spare_cap_cpu != -1)
+			target_cpu = most_spare_cap_cpu;
+		else if (!cpu_active(prev_cpu))
+			target_cpu = active_candidate;
+	}
+
+	if (target_cpu != -1)
+		cpumask_set_cpu(target_cpu, candidates);
+	if (best_idle_cpu != -1 && target_cpu != best_idle_cpu)
+		cpumask_set_cpu(best_idle_cpu, candidates);
+out:
+	trace_sched_find_best_target(p, min_util, start_cpu,
+			     best_idle_cpu, most_spare_cap_cpu,
+			     target_cpu, order_index, end_index,
+			     fbt_env->skip_cpu, p->state == TASK_RUNNING);
+}
+
+static inline unsigned long
+cpu_util_next_walt(int cpu, struct task_struct *p, int dst_cpu)
+{
+	struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
+	unsigned long util = wrq->walt_stats.cumulative_runnable_avg_scaled;
+	bool queued = task_on_rq_queued(p);
+
+	/*
+	 * When task is queued,
+	 * (a) The evaluating CPU (cpu) is task's current CPU. If the
+	 * task is migrating, discount the task contribution from the
+	 * evaluation cpu.
+	 * (b) The evaluating CPU (cpu) is task's current CPU. If the
+	 * task is NOT migrating, nothing to do. The contribution is
+	 * already present on the evaluation CPU.
+	 * (c) The evaluating CPU (cpu) is not task's current CPU. But
+	 * the task is migrating to the evaluating CPU. So add the
+	 * task contribution to it.
+	 * (d) The evaluating CPU (cpu) is neither the current CPU nor
+	 * the destination CPU. don't care.
+	 *
+	 * When the task is NOT queued, i.e. waking, its contribution is not
+	 * present on any CPU.
+	 *
+	 * (a) If the evaluating CPU is the destination CPU, add the task
+	 * contribution.
+	 * (b) The evaluation CPU is not the destination CPU, don't care.
+	 */
+	if (unlikely(queued)) {
+		if (task_cpu(p) == cpu) {
+			if (dst_cpu != cpu)
+				util = max_t(long, util - task_util(p), 0);
+		} else if (dst_cpu == cpu) {
+			util += task_util(p);
+		}
+	} else if (dst_cpu == cpu) {
+		util += task_util(p);
+	}
+
+	return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
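+
+/*
+ * Worked example (illustrative numbers only): task_util(p) = 100 with p
+ * queued on CPU2 whose cumulative runnable average is 400. Evaluating CPU2
+ * for dst_cpu = 3 yields 300 (case (a): discount the departing task), while
+ * evaluating CPU3 for dst_cpu = 3 yields its own utilization plus 100
+ * (case (c): add the incoming task), both clamped to capacity_orig_of().
+ */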
+
+/*
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+	struct cpumask *pd_mask = perf_domain_span(pd);
+	unsigned long max_util = 0, sum_util = 0;
+	int cpu;
+	unsigned long cpu_util;
+
+	/*
+	 * The capacity state of CPUs of the current rd can be driven by CPUs
+	 * of another rd if they belong to the same pd. So, account for the
+	 * utilization of these CPUs too by masking pd with cpu_online_mask
+	 * instead of the rd span.
+	 *
+	 * If an entire pd is outside of the current rd, it will not appear in
+	 * its pd list and will not be accounted by compute_energy().
+	 */
+	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+		cpu_util = cpu_util_next_walt(cpu, p, dst_cpu);
+		sum_util += cpu_util;
+		max_util = max(max_util, cpu_util);
+	}
+
+	return em_cpu_energy(pd->em_pd, max_util, sum_util);
+}
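+
+/*
+ * Illustrative example: for a perf domain spanning two CPUs whose
+ * cpu_util_next_walt() values come out as 200 and 300, sum_util = 500 and
+ * max_util = 300; em_cpu_energy() then (roughly) picks the performance state
+ * able to serve max_util and charges sum_util at that state's cost.
+ */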
+
+static inline long
+walt_compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+	long energy = 0;
+
+	for (; pd; pd = pd->next)
+		energy += compute_energy(p, dst_cpu, pd);
+
+	return energy;
+}
+
+static inline int wake_to_idle(struct task_struct *p)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+	struct walt_task_struct *cur_wts =
+		(struct walt_task_struct *) current->android_vendor_data1;
+
+	return (cur_wts->wake_up_idle || wts->wake_up_idle);
+}
+
+/* return true if cpu should be chosen over best_energy_cpu */
+static inline bool select_cpu_same_energy(int cpu, int best_cpu, int prev_cpu)
+{
+	if (capacity_orig_of(cpu) < capacity_orig_of(best_cpu))
+		return true;
+
+	if (best_cpu == prev_cpu)
+		return false;
+
+	if (available_idle_cpu(best_cpu) && walt_get_idle_exit_latency(cpu_rq(best_cpu)) <= 1)
+		return false; /* best_cpu is idle wfi or shallower */
+
+	if (available_idle_cpu(cpu) && walt_get_idle_exit_latency(cpu_rq(cpu)) <= 1)
+		return true; /* new cpu is idle wfi or shallower */
+
+	/*
+	 * If we are this far this must be a tie between a busy and deep idle,
+	 * pick the busy.
+	 */
+	return available_idle_cpu(best_cpu);
+}
+
+static DEFINE_PER_CPU(cpumask_t, energy_cpus);
+int walt_find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
+				     int sync, int sibling_count_hint)
+{
+	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	int weight, cpu = smp_processor_id(), best_energy_cpu = prev_cpu;
+	struct perf_domain *pd;
+	unsigned long cur_energy;
+	cpumask_t *candidates;
+	bool is_rtg, curr_is_rtg;
+	struct find_best_target_env fbt_env;
+	bool need_idle = wake_to_idle(p) || uclamp_latency_sensitive(p);
+	u64 start_t = 0;
+	int delta = 0;
+	int task_boost = per_task_boost(p);
+	bool is_uclamp_boosted = uclamp_boosted(p);
+	bool boosted = is_uclamp_boosted || (task_boost > 0);
+	int start_cpu, order_index, end_index;
+
+	if (walt_is_many_wakeup(sibling_count_hint) && prev_cpu != cpu &&
+			cpumask_test_cpu(prev_cpu, &p->cpus_mask))
+		return prev_cpu;
+
+	if (unlikely(!cpu_array))
+		return -EPERM;
+
+	walt_get_indicies(p, &order_index, &end_index, task_boost, boosted);
+	start_cpu = cpumask_first(&cpu_array[order_index][0]);
+
+	is_rtg = task_in_related_thread_group(p);
+	curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr);
+
+	fbt_env.fastpath = 0;
+	fbt_env.need_idle = need_idle;
+
+	if (trace_sched_task_util_enabled())
+		start_t = sched_clock();
+
+	/* Pre-select a set of candidate CPUs. */
+	candidates = this_cpu_ptr(&energy_cpus);
+	cpumask_clear(candidates);
+
+	if (sync && (need_idle || (is_rtg && curr_is_rtg)))
+		sync = 0;
+
+	if (sync && bias_to_this_cpu(p, cpu, start_cpu)) {
+		best_energy_cpu = cpu;
+		fbt_env.fastpath = SYNC_WAKEUP;
+		goto done;
+	}
+
+	rcu_read_lock();
+	pd = rcu_dereference(rd->pd);
+	if (!pd)
+		goto fail;
+
+	fbt_env.is_rtg = is_rtg;
+	fbt_env.start_cpu = start_cpu;
+	fbt_env.order_index = order_index;
+	fbt_env.end_index = end_index;
+	fbt_env.boosted = boosted;
+	fbt_env.strict_max = is_rtg &&
+		(task_boost == TASK_BOOST_STRICT_MAX);
+	fbt_env.skip_cpu = walt_is_many_wakeup(sibling_count_hint) ?
+			   cpu : -1;
+
+	walt_find_best_target(NULL, candidates, p, &fbt_env);
+
+	/* Bail out if no candidate was found. */
+	weight = cpumask_weight(candidates);
+	if (!weight)
+		goto unlock;
+
+	/* If there is only one sensible candidate, select it now. */
+	cpu = cpumask_first(candidates);
+	if (weight == 1 && (available_idle_cpu(cpu) || cpu == prev_cpu)) {
+		best_energy_cpu = cpu;
+		goto unlock;
+	}
+
+	if (p->state == TASK_WAKING)
+		delta = task_util(p);
+
+	if (task_placement_boost_enabled(p) || fbt_env.need_idle ||
+	    boosted || is_rtg || __cpu_overutilized(prev_cpu, delta) ||
+	    !task_fits_max(p, prev_cpu) || !cpu_active(prev_cpu)) {
+		best_energy_cpu = cpu;
+		goto unlock;
+	}
+
+	if (cpumask_test_cpu(prev_cpu, &p->cpus_mask))
+		prev_delta = best_delta =
+			walt_compute_energy(p, prev_cpu, pd);
+	else
+		prev_delta = best_delta = ULONG_MAX;
+
+	/* Select the best candidate energy-wise. */
+	for_each_cpu(cpu, candidates) {
+		if (cpu == prev_cpu)
+			continue;
+
+		cur_energy = walt_compute_energy(p, cpu, pd);
+		trace_sched_compute_energy(p, cpu, cur_energy,
+			prev_delta, best_delta, best_energy_cpu);
+
+		if (cur_energy < best_delta) {
+			best_delta = cur_energy;
+			best_energy_cpu = cpu;
+		} else if (cur_energy == best_delta) {
+			if (select_cpu_same_energy(cpu, best_energy_cpu,
+							prev_cpu)) {
+				best_delta = cur_energy;
+				best_energy_cpu = cpu;
+			}
+		}
+	}
+
+unlock:
+	rcu_read_unlock();
+
+	/*
+	 * Pick the prev CPU, if the best energy CPU can't save at least 6% of
+	 * the energy used by prev_cpu.
+	 */
+	if (!(available_idle_cpu(best_energy_cpu) &&
+	    walt_get_idle_exit_latency(cpu_rq(best_energy_cpu)) <= 1) &&
+	    (prev_delta != ULONG_MAX) && (best_energy_cpu != prev_cpu) &&
+	    ((prev_delta - best_delta) <= prev_delta >> 4) &&
+	    (capacity_orig_of(prev_cpu) <= capacity_orig_of(start_cpu)))
+		best_energy_cpu = prev_cpu;
+
+done:
+	trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu,
+			sync, fbt_env.need_idle, fbt_env.fastpath,
+			task_boost_policy(p), start_t, boosted, is_rtg,
+			walt_get_rtg_status(p), start_cpu);
+
+	return best_energy_cpu;
+
+fail:
+	rcu_read_unlock();
+	return -EPERM;
+}
+
+static void
+walt_select_task_rq_fair(void *unused, struct task_struct *p, int prev_cpu,
+				int sd_flag, int wake_flags, int *target_cpu)
+{
+	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+	int sibling_count_hint = p->wake_q_head ? p->wake_q_head->count : 1;
+
+	*target_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, sync, sibling_count_hint);
+	if (unlikely(*target_cpu < 0))
+		*target_cpu = prev_cpu;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static unsigned long task_h_load(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	update_cfs_rq_h_load(cfs_rq);
+	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+			cfs_rq_load_avg(cfs_rq) + 1);
+}
+#else
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.avg.load_avg;
+}
+#endif
+
+static void walt_update_misfit_status(void *unused, struct task_struct *p,
+					struct rq *rq, bool *need_update)
+{
+	*need_update = false;
+
+	if (!p) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	if (task_fits_max(p, cpu_of(rq))) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	/*
+	 * Make sure that misfit_task_load will not be null even if
+	 * task_h_load() returns 0.
+	 */
+	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+}
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
+
+static void walt_place_entity(void *unused, struct sched_entity *se, u64 *vruntime)
+{
+	if (entity_is_task(se)) {
+		unsigned long thresh = sysctl_sched_latency;
+
+		/*
+		 * Halve their sleep time's effect, to allow
+		 * for a gentler effect of sleepers:
+		 */
+		if (sched_feat(GENTLE_FAIR_SLEEPERS))
+			thresh >>= 1;
+
+		if ((per_task_boost(task_of(se)) == TASK_BOOST_STRICT_MAX) ||
+				walt_low_latency_task(task_of(se)) ||
+				task_rtg_high_prio(task_of(se))) {
+			*vruntime -= sysctl_sched_latency;
+			*vruntime -= thresh;
+			se->vruntime = *vruntime;
+		}
+	}
+}
+
+static void walt_binder_low_latency_set(void *unused, struct task_struct *task)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) task->android_vendor_data1;
+
+	if (task && current->signal &&
+			(current->signal->oom_score_adj == 0) &&
+			(current->prio < DEFAULT_PRIO))
+		wts->low_latency = true;
+}
+
+static void walt_binder_low_latency_clear(void *unused, struct binder_transaction *t)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1;
+
+	if (wts->low_latency)
+		wts->low_latency = false;
+}
+
+void walt_cfs_init(void)
+{
+	register_trace_android_rvh_select_task_rq_fair(walt_select_task_rq_fair, NULL);
+	register_trace_android_rvh_update_misfit_status(walt_update_misfit_status, NULL);
+	register_trace_android_rvh_place_entity(walt_place_entity, NULL);
+
+	register_trace_android_vh_binder_wakeup_ilocked(walt_binder_low_latency_set, NULL);
+	register_trace_binder_transaction_received(walt_binder_low_latency_clear, NULL);
+}

+ 50 - 0
kernel/sched/walt/walt_debug.c

@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/module.h>
+
+#include <trace/hooks/sched.h>
+
+#include "walt.h"
+#include "walt_debug.h"
+
+static void dump_throttled_rt_tasks(void *unused, int cpu, u64 clock,
+		ktime_t rt_period, u64 rt_runtime, s64 rt_period_timer_expires)
+{
+	printk_deferred("sched: RT throttling activated for cpu %d\n", cpu);
+	printk_deferred("rt_period_timer: expires=%lld now=%llu runtime=%llu period=%llu\n",
+			rt_period_timer_expires, ktime_get_ns(), rt_runtime, rt_period);
+	printk_deferred("potential CPU hogs:\n");
+#ifdef CONFIG_SCHED_INFO
+	if (sched_info_on())
+		printk_deferred("current %s (%d) is running for %llu nsec\n",
+				current->comm, current->pid,
+				clock - current->sched_info.last_arrival);
+#endif
+	BUG();
+}
+
+static void android_rvh_schedule_bug(void *unused, void *unused2)
+{
+	BUG();
+}
+
+static int __init walt_debug_init(void)
+{
+	int ret;
+
+	ret = preemptirq_long_init();
+	if (ret)
+		return ret;
+
+	register_trace_android_vh_dump_throttled_rt_tasks(dump_throttled_rt_tasks, NULL);
+	register_trace_android_rvh_schedule_bug(android_rvh_schedule_bug, NULL);
+
+	return 0;
+}
+module_init(walt_debug_init);
+
+MODULE_DESCRIPTION("QTI WALT Debug Module");
+MODULE_LICENSE("GPL v2");

+ 5 - 0
kernel/sched/walt/walt_debug.h

@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+int preemptirq_long_init(void);

+ 742 - 0
kernel/sched/walt/walt_lb.c

@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <trace/hooks/sched.h>
+
+#include "walt.h"
+#include "trace.h"
+
+extern u64 sched_ktime_clock(void); // TODO
+static void walt_detach_task(struct task_struct *p, struct rq *src_rq,
+			     struct rq *dst_rq)
+{
+	deactivate_task(src_rq, p, 0);
+	double_lock_balance(src_rq, dst_rq);
+	if (!(src_rq->clock_update_flags & RQCF_UPDATED))
+		update_rq_clock(src_rq);
+	set_task_cpu(p, dst_rq->cpu);
+	double_unlock_balance(src_rq, dst_rq);
+}
+
+static void walt_attach_task(struct task_struct *p, struct rq *rq)
+{
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+static int walt_lb_active_migration(void *data)
+{
+	struct rq *busiest_rq = data;
+	int busiest_cpu = cpu_of(busiest_rq);
+	int target_cpu = busiest_rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
+	struct walt_rq *wrq = (struct walt_rq *) busiest_rq->android_vendor_data1;
+	struct task_struct *push_task = wrq->push_task;
+	int push_task_detached = 0;
+
+	raw_spin_lock_irq(&busiest_rq->lock);
+
+	/* sanity checks before initiating the pull */
+	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+		goto out_unlock;
+
+	if (unlikely(busiest_cpu != raw_smp_processor_id() ||
+		     !busiest_rq->active_balance))
+		goto out_unlock;
+
+	if (busiest_rq->nr_running <= 1)
+		goto out_unlock;
+
+	BUG_ON(busiest_rq == target_rq);
+
+	if (task_on_rq_queued(push_task) &&
+			push_task->state == TASK_RUNNING &&
+			task_cpu(push_task) == busiest_cpu &&
+			cpu_active(target_cpu)) {
+		walt_detach_task(push_task, busiest_rq, target_rq);
+		push_task_detached = 1;
+	}
+
+out_unlock: /* called with busiest_rq lock */
+	busiest_rq->active_balance = 0;
+	target_cpu = busiest_rq->push_cpu;
+	clear_reserved(target_cpu);
+	wrq->push_task = NULL;
+	raw_spin_unlock(&busiest_rq->lock);
+
+	if (push_task_detached) {
+		raw_spin_lock(&target_rq->lock);
+		walt_attach_task(push_task, target_rq);
+		raw_spin_unlock(&target_rq->lock);
+	}
+	put_task_struct(push_task);
+
+	local_irq_enable();
+	return 0;
+}
+
+struct walt_lb_rotate_work {
+	struct work_struct	w;
+	struct task_struct	*src_task;
+	struct task_struct	*dst_task;
+	int			src_cpu;
+	int			dst_cpu;
+};
+
+DEFINE_PER_CPU(struct walt_lb_rotate_work, walt_lb_rotate_works);
+
+static void walt_lb_rotate_work_func(struct work_struct *work)
+{
+	struct walt_lb_rotate_work *wr = container_of(work,
+					struct walt_lb_rotate_work, w);
+
+	migrate_swap(wr->src_task, wr->dst_task, wr->dst_cpu, wr->src_cpu);
+
+	put_task_struct(wr->src_task);
+	put_task_struct(wr->dst_task);
+
+	clear_reserved(wr->src_cpu);
+	clear_reserved(wr->dst_cpu);
+}
+
+static void walt_lb_rotate_work_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct walt_lb_rotate_work *wr = &per_cpu(walt_lb_rotate_works, i);
+
+		INIT_WORK(&wr->w, walt_lb_rotate_work_func);
+	}
+}
+
+#define WALT_ROTATION_THRESHOLD_NS	16000000
+static void walt_lb_check_for_rotation(struct rq *src_rq)
+{
+	u64 wc, wait, max_wait = 0, run, max_run = 0;
+	int deserved_cpu = nr_cpu_ids, dst_cpu = nr_cpu_ids;
+	int i, src_cpu = cpu_of(src_rq);
+	struct rq *dst_rq;
+	struct walt_lb_rotate_work *wr = NULL;
+	struct walt_task_struct *wts;
+
+	if (!is_min_capacity_cpu(src_cpu))
+		return;
+
+	wc = sched_ktime_clock();
+
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+
+		if (!is_min_capacity_cpu(i))
+			break;
+
+		if (is_reserved(i))
+			continue;
+
+		if (!rq->misfit_task_load)
+			continue;
+
+		wts = (struct walt_task_struct *) rq->curr->android_vendor_data1;
+		wait = wc - wts->last_enqueued_ts;
+		if (wait > max_wait) {
+			max_wait = wait;
+			deserved_cpu = i;
+		}
+	}
+
+	if (deserved_cpu != src_cpu)
+		return;
+
+	for_each_possible_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+
+		if (is_min_capacity_cpu(i))
+			continue;
+
+		if (is_reserved(i))
+			continue;
+
+		if (rq->curr->prio < MAX_RT_PRIO)
+			continue;
+
+		if (rq->nr_running > 1)
+			continue;
+
+		wts = (struct walt_task_struct *) rq->curr->android_vendor_data1;
+		run = wc - wts->last_enqueued_ts;
+
+		if (run < WALT_ROTATION_THRESHOLD_NS)
+			continue;
+
+		if (run > max_run) {
+			max_run = run;
+			dst_cpu = i;
+		}
+	}
+
+	if (dst_cpu == nr_cpu_ids)
+		return;
+
+	dst_rq = cpu_rq(dst_cpu);
+
+	double_rq_lock(src_rq, dst_rq);
+	if (dst_rq->curr->prio >= MAX_RT_PRIO && dst_rq->curr != dst_rq->idle &&
+		src_rq->curr->prio >= MAX_RT_PRIO && src_rq->curr != src_rq->idle) {
+		get_task_struct(src_rq->curr);
+		get_task_struct(dst_rq->curr);
+
+		mark_reserved(src_cpu);
+		mark_reserved(dst_cpu);
+		wr = &per_cpu(walt_lb_rotate_works, src_cpu);
+
+		wr->src_task = src_rq->curr;
+		wr->dst_task = dst_rq->curr;
+
+		wr->src_cpu = src_cpu;
+		wr->dst_cpu = dst_cpu;
+	}
+	double_rq_unlock(src_rq, dst_rq);
+
+	if (wr)
+		queue_work_on(src_cpu, system_highpri_wq, &wr->w);
+}
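+
+/*
+ * In short: rotation picks the min-capacity CPU whose running misfit task has
+ * gone the longest since being enqueued; if that is the local CPU, it swaps
+ * that task with a CFS task that has occupied a higher-capacity CPU for at
+ * least WALT_ROTATION_THRESHOLD_NS (16ms), via migrate_swap() in the work
+ * queued above.
+ */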
+
+static inline bool _walt_can_migrate_task(struct task_struct *p, int dst_cpu,
+					  bool to_lower)
+{
+	struct walt_rq *wrq = (struct walt_rq *) task_rq(p)->android_vendor_data1;
+
+	if (to_lower) {
+		if (p->in_iowait)
+			return false;
+		if (per_task_boost(p) == TASK_BOOST_STRICT_MAX &&
+				task_in_related_thread_group(p))
+			return false;
+	}
+
+	/* Don't detach task if it is under active migration */
+	if (wrq->push_task == p)
+		return false;
+
+	return true;
+}
+
+static inline bool need_active_lb(struct task_struct *p, int dst_cpu,
+				  int src_cpu)
+{
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (cpu_rq(src_cpu)->active_balance)
+		return false;
+
+	if (capacity_orig_of(dst_cpu) <= capacity_orig_of(src_cpu))
+		return false;
+
+	if (!wts->misfit)
+		return false;
+
+	return true;
+}
+
+static int walt_lb_pull_tasks(int dst_cpu, int src_cpu)
+{
+	struct rq *dst_rq = cpu_rq(dst_cpu);
+	struct rq *src_rq = cpu_rq(src_cpu);
+	unsigned long flags;
+	struct task_struct *pulled_task = NULL, *p;
+	bool active_balance = false, to_lower;
+	struct walt_rq *wrq = (struct walt_rq *) src_rq->android_vendor_data1;
+	struct walt_task_struct *wts;
+
+	BUG_ON(src_cpu == dst_cpu);
+
+	to_lower = capacity_orig_of(dst_cpu) < capacity_orig_of(src_cpu);
+
+	raw_spin_lock_irqsave(&src_rq->lock, flags);
+	list_for_each_entry_reverse(p, &src_rq->cfs_tasks, se.group_node) {
+
+		if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr))
+			continue;
+
+		if (!_walt_can_migrate_task(p, dst_cpu, to_lower))
+			continue;
+
+		if (task_running(src_rq, p)) {
+
+			if (need_active_lb(p, dst_cpu, src_cpu)) {
+				active_balance = true;
+				break;
+			}
+			continue;
+		}
+
+		walt_detach_task(p, src_rq, dst_rq);
+		pulled_task = p;
+		break;
+	}
+
+	if (active_balance) {
+		src_rq->active_balance = 1;
+		src_rq->push_cpu = dst_cpu;
+		get_task_struct(p);
+		wrq->push_task = p;
+		mark_reserved(dst_cpu);
+	}
+	/* lock must be dropped before waking the stopper */
+	raw_spin_unlock_irqrestore(&src_rq->lock, flags);
+
+	/*
+	 * Using our custom active load balance callback so that
+	 * the push_task is really pulled onto this CPU.
+	 */
+	if (active_balance) {
+		wts = (struct walt_task_struct *) p->android_vendor_data1;
+		trace_walt_active_load_balance(p, src_cpu, dst_cpu, wts);
+		stop_one_cpu_nowait(src_cpu, walt_lb_active_migration,
+				    src_rq, &src_rq->active_balance_work);
+		return 0; /* we did not pull any task here */
+	}
+
+	if (!pulled_task)
+		return 0;
+
+	raw_spin_lock_irqsave(&dst_rq->lock, flags);
+	walt_attach_task(p, dst_rq);
+	raw_spin_unlock_irqrestore(&dst_rq->lock, flags);
+
+	return 1; /* we pulled 1 task */
+}
+
+static int walt_lb_find_busiest_similar_cap_cpu(int dst_cpu, const cpumask_t *src_mask)
+{
+	int i;
+	int busiest_cpu = -1;
+	int busiest_nr = 1; /* we need at least 2 */
+	unsigned long util, busiest_util = 0;
+	struct walt_rq *wrq;
+
+	for_each_cpu(i, src_mask) {
+		wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+		trace_walt_lb_cpu_util(i, wrq);
+
+		if (cpu_rq(i)->cfs.h_nr_running < 2)
+			continue;
+
+		util = cpu_util(i);
+		if (util < busiest_util)
+			continue;
+
+		busiest_nr = cpu_rq(i)->cfs.h_nr_running;
+		busiest_util = util;
+		busiest_cpu = i;
+	}
+
+	return busiest_cpu;
+}
+
+#define SMALL_TASK_THRESHOLD	102
+static int walt_lb_find_busiest_higher_cap_cpu(int dst_cpu, const cpumask_t *src_mask)
+{
+	int i;
+	int busiest_cpu = -1;
+	int busiest_nr = 1; /* we need at least 2 */
+	unsigned long util, busiest_util = 0;
+	unsigned long total_capacity = 0, total_util = 0, total_nr = 0;
+	int total_cpus = 0;
+	struct walt_rq *wrq;
+
+	for_each_cpu(i, src_mask) {
+
+		if (!cpu_active(i))
+			continue;
+
+		wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+		trace_walt_lb_cpu_util(i, wrq);
+
+		util = cpu_util(i);
+		total_cpus += 1;
+		total_util += util;
+		total_capacity += capacity_orig_of(i);
+		total_nr += cpu_rq(i)->cfs.h_nr_running;
+
+		if (cpu_rq(i)->cfs.h_nr_running < 2)
+			continue;
+
+		if (cpu_rq(i)->cfs.h_nr_running == 2 &&
+			task_util(cpu_rq(i)->curr) < SMALL_TASK_THRESHOLD)
+			continue;
+
+		/*
+		 * During rotation, two silver fmax tasks get
+		 * placed on gold/prime and the CPU may not be
+		 * overutilized, but for rotation we have to
+		 * spread out.
+		 */
+		if (!walt_rotation_enabled && !cpu_overutilized(i))
+			continue;
+
+		if (util < busiest_util)
+			continue;
+
+		busiest_nr = cpu_rq(i)->cfs.h_nr_running;
+		busiest_util = util;
+		busiest_cpu = i;
+	}
+
+	/*
+	 * Don't allow migrating to lower cluster unless this high
+	 * capacity cluster is sufficiently loaded.
+	 */
+	if (!walt_rotation_enabled) {
+		if (total_nr <= total_cpus || total_util * 1280 < total_capacity * 1024)
+			busiest_cpu = -1;
+	}
+
+	return busiest_cpu;
+}
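+
+/*
+ * Illustrative reading of the load gate above: total_util * 1280 <
+ * total_capacity * 1024 means the cluster runs below ~80% of its combined
+ * capacity; e.g. four CPUs of capacity 1024 need an aggregate utilization
+ * above ~3277 before tasks are handed down to a lower-capacity cluster.
+ */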
+
+static int walt_lb_find_busiest_lower_cap_cpu(int dst_cpu, const cpumask_t *src_mask)
+{
+	int i;
+	int busiest_cpu = -1;
+	int busiest_nr = 1; /* we need at least 2 */
+	unsigned long util, busiest_util = 0;
+	unsigned long total_capacity = 0, total_util = 0, total_nr = 0;
+	int total_cpus = 0;
+	int busy_nr_big_tasks = 0;
+	struct walt_rq *wrq;
+
+	/*
+	 * A higher capacity CPU is looking at a lower capacity
+	 * cluster. Active balance and big tasks are in play;
+	 * other than that, it is very much the same as above.
+	 * We don't really need this as a separate block and will
+	 * refactor it after final testing is done.
+	 */
+	for_each_cpu(i, src_mask) {
+		wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+
+		if (!cpu_active(i))
+			continue;
+
+		trace_walt_lb_cpu_util(i, wrq);
+
+		util = cpu_util(i);
+		total_cpus += 1;
+		total_util += util;
+		total_capacity += capacity_orig_of(i);
+		total_nr += cpu_rq(i)->cfs.h_nr_running;
+
+		/*
+		 * No point in selecting this CPU as busy, as
+		 * active balance is in progress.
+		 */
+		if (cpu_rq(i)->active_balance)
+			continue;
+
+		if (cpu_rq(i)->cfs.h_nr_running < 2 && !wrq->walt_stats.nr_big_tasks)
+			continue;
+
+		if (!walt_rotation_enabled && !cpu_overutilized(i))
+			continue;
+
+		if (util < busiest_util)
+			continue;
+
+		busiest_nr = cpu_rq(i)->cfs.h_nr_running;
+		busiest_util = util;
+		busiest_cpu = i;
+		busy_nr_big_tasks = wrq->walt_stats.nr_big_tasks;
+	}
+
+	if (!walt_rotation_enabled && !busy_nr_big_tasks) {
+		if (total_nr <= total_cpus || total_util * 1280 < total_capacity * 1024)
+			busiest_cpu = -1;
+	}
+
+	return busiest_cpu;
+}
+
+static int walt_lb_find_busiest_cpu(int dst_cpu, const cpumask_t *src_mask)
+{
+	int fsrc_cpu = cpumask_first(src_mask);
+	int busiest_cpu;
+
+	if (capacity_orig_of(dst_cpu) == capacity_orig_of(fsrc_cpu))
+		busiest_cpu = walt_lb_find_busiest_similar_cap_cpu(dst_cpu,
+								src_mask);
+	else if (capacity_orig_of(dst_cpu) < capacity_orig_of(fsrc_cpu))
+		busiest_cpu = walt_lb_find_busiest_lower_cap_cpu(dst_cpu,
+								src_mask);
+	else
+		busiest_cpu = walt_lb_find_busiest_higher_cap_cpu(dst_cpu,
+								src_mask);
+
+	return busiest_cpu;
+}
+
+static DEFINE_RAW_SPINLOCK(walt_lb_migration_lock);
+static void walt_lb_tick(void *unused, struct rq *rq)
+{
+	int prev_cpu = rq->cpu, new_cpu, ret;
+	struct task_struct *p = rq->curr;
+	unsigned long flags;
+	struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
+	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
+
+	if (!rq->misfit_task_load)
+		return;
+
+	if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
+		return;
+
+	raw_spin_lock_irqsave(&walt_lb_migration_lock, flags);
+
+	if (walt_rotation_enabled) {
+		walt_lb_check_for_rotation(rq);
+		goto out_unlock;
+	}
+
+	rcu_read_lock();
+	new_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, 0, 1);
+	rcu_read_unlock();
+
+	if (new_cpu < 0 || same_cluster(new_cpu, prev_cpu))
+		goto out_unlock;
+
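+	/*
+	 * Set up active balance so the stopper can push the running
+	 * misfit task from this CPU to new_cpu.
+	 */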
+	raw_spin_lock(&rq->lock);
+	if (rq->active_balance) {
+		raw_spin_unlock(&rq->lock);
+		goto out_unlock;
+	}
+	rq->active_balance = 1;
+	rq->push_cpu = new_cpu;
+	get_task_struct(p);
+	wrq->push_task = p;
+	raw_spin_unlock(&rq->lock);
+
+	mark_reserved(new_cpu);
+	raw_spin_unlock_irqrestore(&walt_lb_migration_lock, flags);
+
+	trace_walt_active_load_balance(p, prev_cpu, new_cpu, wts);
+	ret = stop_one_cpu_nowait(prev_cpu,
+			walt_lb_active_migration, rq,
+			&rq->active_balance_work);
+	if (!ret)
+		clear_reserved(new_cpu);
+	else
+		wake_up_if_idle(new_cpu);
+
+	return;
+
+out_unlock:
+	raw_spin_unlock_irqrestore(&walt_lb_migration_lock, flags);
+}
+
+static void walt_newidle_balance(void *unused, struct rq *this_rq,
+				 struct rq_flags *rf, int *pulled_task,
+				 int *done)
+{
+	int this_cpu = this_rq->cpu;
+	struct walt_rq *wrq = (struct walt_rq *) this_rq->android_vendor_data1;
+	int order_index = wrq->cluster->id;
+	int cluster = 0;
+	int busy_cpu;
+
+	if (unlikely(!cpu_array))
+		return;
+
+	/*
+	 * Newly idle load balancing is handled entirely here, so
+	 * set done to make the caller skip its own load balance.
+	 */
+	*done = 1;
+	*pulled_task = 0;
+
+	/*
+	 * This CPU is about to enter idle, so clear the
+	 * misfit_task_load and mark the idle stamp.
+	 */
+	this_rq->misfit_task_load = 0;
+	this_rq->idle_stamp = rq_clock(this_rq);
+
+	if (!cpu_active(this_cpu))
+		return;
+
+	if (!READ_ONCE(this_rq->rd->overload))
+		return;
+
+	rq_unpin_lock(this_rq, rf);
+	raw_spin_unlock(&this_rq->lock);
+
+	/*
+	 * Careful: the rq lock has been dropped and must be
+	 * re-acquired before returning. While it is dropped,
+	 * tasks can be queued on this rq remotely, so keep
+	 * checking nr_running and bail out if it goes non-zero.
+	 */
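+	/*
+	 * Walk the clusters in this CPU's preference order and
+	 * stop at the first one that yields a busy CPU.
+	 */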
+	do {
+		busy_cpu = walt_lb_find_busiest_cpu(this_cpu,
+				&cpu_array[order_index][cluster]);
+
+		/* We got the busy/source CPU here. */
+		if (busy_cpu != -1 || this_rq->nr_running > 0)
+			break;
+
+	} while (++cluster < num_sched_clusters);
+
+	/* sanity checks before attempting the pull */
+	if (busy_cpu == -1 || this_rq->nr_running > 0 || (busy_cpu == this_cpu))
+		goto out;
+
+	*pulled_task = walt_lb_pull_tasks(this_cpu, busy_cpu);
+
+out:
+	raw_spin_lock(&this_rq->lock);
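+	/*
+	 * While the rq lock was dropped, a task could have been
+	 * enqueued on this CPU; treat that as a successful pull.
+	 */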
+	if (this_rq->cfs.h_nr_running && !*pulled_task)
+		*pulled_task = 1;
+
+	/* Is there a task of a high priority class? */
+	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+		*pulled_task = -1;
+
+	/* reset the idle time stamp if we pulled any task */
+	if (*pulled_task)
+		this_rq->idle_stamp = 0;
+
+	rq_repin_lock(this_rq, rf);
+
+	trace_walt_newidle_balance(this_cpu, busy_cpu, *pulled_task);
+}
+
+static void walt_find_busiest_queue(void *unused, int dst_cpu,
+				    struct sched_group *group,
+				    struct cpumask *env_cpus,
+				    struct rq **busiest, int *done)
+{
+	int fsrc_cpu = group_first_cpu(group);
+	int busiest_cpu = -1;
+	struct cpumask src_mask;
+
+	*done = 1;
+	*busiest = NULL;
+
+	cpumask_and(&src_mask, sched_group_span(group), env_cpus);
+
+	/*
+	 * Same cluster means there will be only one CPU in the
+	 * busy group, so just select it.
+	 */
+	if (same_cluster(dst_cpu, fsrc_cpu)) {
+		busiest_cpu = fsrc_cpu;
+		goto done;
+	}
+
+	/*
+	 * Allow inter-cluster migrations only if the source group
+	 * is sufficiently loaded; the upstream load balancer is a
+	 * bit more generous here.
+	 *
+	 * Re-use the same policy as the newly idle load balance
+	 * path.
+	 */
+	busiest_cpu = walt_lb_find_busiest_cpu(dst_cpu, &src_mask);
+done:
+	if (busiest_cpu != -1)
+		*busiest = cpu_rq(busiest_cpu);
+
+	trace_walt_find_busiest_queue(dst_cpu, busiest_cpu, src_mask.bits[0]);
+}
+
+static void walt_migrate_queued_task(void *unused, struct rq *rq,
+				     struct rq_flags *rf,
+				     struct task_struct *p,
+				     int new_cpu, int *detached)
+{
+	/*
+	 * WALT expects both the source and destination rq locks
+	 * to be held when set_task_cpu() is called on a queued
+	 * task, hence this detach hook. Unpin the lock before
+	 * detaching and repin it afterwards to keep lockdep happy.
+	 */
+	BUG_ON(!rf);
+
+	rq_unpin_lock(rq, rf);
+	walt_detach_task(p, rq, cpu_rq(new_cpu));
+	rq_repin_lock(rq, rf);
+
+	*detached = 1;
+}
+
+/*
+ * We only decide whether a nohz balance kick is needed. The
+ * first CPU in nohz.idle comes out of idle and load balances
+ * on behalf of every idle CPU, so adding another hook to pick
+ * which CPU to kick is not useful; since we only get to kick
+ * once, it is generally impossible to decide which CPU should
+ * come out of idle.
+ */
+static void walt_nohz_balancer_kick(void *unused, struct rq *rq,
+				    unsigned int *flags, int *done)
+{
+	*done = 1;
+
+	/*
+	 * The tick path migration takes care of misfit tasks,
+	 * so check for nr_running >= 2 here.
+	 */
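+	/*
+	 * NOHZ_KICK_MASK requests both a stats update and a full
+	 * balance from the idle balancer.
+	 */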
+	if (rq->nr_running >= 2 && cpu_overutilized(rq->cpu)) {
+		*flags = NOHZ_KICK_MASK;
+		trace_walt_nohz_balance_kick(rq);
+	}
+}
+
+static void walt_can_migrate_task(void *unused, struct task_struct *p,
+				  int dst_cpu, int *can_migrate)
+{
+	bool to_lower;
+
+	to_lower = capacity_orig_of(dst_cpu) < capacity_orig_of(task_cpu(p));
+
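+	/* Let WALT's core policy veto the migration if it rejects this move. */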
+	if (_walt_can_migrate_task(p, dst_cpu, to_lower))
+		return;
+
+	*can_migrate = 0;
+}
+
+/*
+ * When WALT becomes a module, this init will be called from
+ * another file, so there is no need to define module_init().
+ */
+void walt_lb_init(void)
+{
+	/*
+	 * Any task movement outside task placement is considered
+	 * load balancing, so the tick path and rotation code is
+	 * moved here. We also use our custom active load balance
+	 * stopper function instead of adding hooks to
+	 * active_load_balance_cpu_stop() in fair.c.
+	 */
+	walt_lb_rotate_work_init();
+
+	register_trace_android_rvh_migrate_queued_task(walt_migrate_queued_task, NULL);
+	register_trace_android_rvh_sched_nohz_balancer_kick(walt_nohz_balancer_kick, NULL);
+	register_trace_android_rvh_can_migrate_task(walt_can_migrate_task, NULL);
+	register_trace_android_rvh_find_busiest_queue(walt_find_busiest_queue, NULL);
+	register_trace_android_rvh_sched_newidle_balance(walt_newidle_balance, NULL);
+
+	/*
+	 * TODO:
+	 * The scheduler tick is not a restricted hook, so multiple
+	 * entities can register for it. WALT will have only one
+	 * hook, which will call our load balancer function later.
+	 */
+	register_trace_android_vh_scheduler_tick(walt_lb_tick, NULL);
+}

+ 90 - 0
kernel/sched/walt/walt_rt.c

@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <trace/hooks/sched.h>
+
+#include "walt.h"
+#include "trace.h"
+
+static void rt_energy_aware_wake_cpu(void *unused, struct task_struct *task,
+				struct cpumask *lowest_mask, int ret, int *best_cpu)
+{
+	int cpu;
+	unsigned long util, best_cpu_util = ULONG_MAX;
+	unsigned long best_cpu_util_cum = ULONG_MAX;
+	unsigned long util_cum;
+	unsigned long tutil = task_util(task);
+	unsigned int best_idle_exit_latency = UINT_MAX;
+	unsigned int cpu_idle_exit_latency = UINT_MAX;
+	bool boost_on_big = rt_boost_on_big();
+	int cluster;
+	int order_index = (boost_on_big && num_sched_clusters > 1) ? 1 : 0;
+
+	if (!ret)
+		return; /* No targets found */
+
+	rcu_read_lock();
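+	/*
+	 * Walk the clusters in the order chosen above
+	 * (rt_boost_on_big() biases the walk towards bigger CPUs)
+	 * and stop at the first cluster that yields a candidate.
+	 */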
+	for (cluster = 0; cluster < num_sched_clusters; cluster++) {
+		for_each_cpu_and(cpu, lowest_mask, &cpu_array[order_index][cluster]) {
+			trace_sched_cpu_util(cpu);
+
+			if (!cpu_active(cpu))
+				continue;
+
+			if (sched_cpu_high_irqload(cpu))
+				continue;
+
+			if (__cpu_overutilized(cpu, tutil))
+				continue;
+
+			util = cpu_util(cpu);
+
+			/* Find the least loaded CPU */
+			if (util > best_cpu_util)
+				continue;
+
+			/*
+			 * If the previous CPU has the same load, keep
+			 * it as best_cpu.
+			 */
+			if (best_cpu_util == util && *best_cpu == task_cpu(task))
+				continue;
+
+			/*
+			 * If the candidate CPU is the previous CPU,
+			 * select it. Otherwise, if its load equals
+			 * best_cpu's and it is in a shallower C-state,
+			 * select it. If all of the above are equal,
+			 * select the CPU with the least cumulative
+			 * window demand.
+			 */
+			cpu_idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(cpu));
+
+			util_cum = cpu_util_cum(cpu, 0);
+			if (cpu != task_cpu(task) && best_cpu_util == util) {
+				if (best_idle_exit_latency < cpu_idle_exit_latency)
+					continue;
+
+				if (best_idle_exit_latency == cpu_idle_exit_latency &&
+						best_cpu_util_cum < util_cum)
+					continue;
+			}
+
+			best_idle_exit_latency = cpu_idle_exit_latency;
+			best_cpu_util_cum = util_cum;
+			best_cpu_util = util;
+			*best_cpu = cpu;
+		}
+
+		if (*best_cpu != -1)
+			break;
+	}
+
+	rcu_read_unlock();
+}
+
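+/*
+ * Register the RT wake-up CPU selection with the scheduler's
+ * restricted vendor hook for find_lowest_rq().
+ */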
+void walt_rt_init(void)
+{
+	register_trace_android_rvh_find_lowest_rq(rt_energy_aware_wake_cpu, NULL);
+}

+ 0 - 2
kernel/trace/Makefile

@@ -101,5 +101,3 @@ obj-$(CONFIG_IPC_LOGGING) += qcom_ipc_logging.o
 qcom_ipc_logging-y := ipc_logging.o  ipc_logging_debug.o
 
 libftrace-y := ftrace.o
-
-obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += preemptirq_long.o

+ 1 - 0
modules.list.msm.lahaina

@@ -81,3 +81,4 @@ memory_dump_v2.ko
 llcc-qcom.ko
 qcom_edac.ko
 kryo_arm64_edac.ko
+qcom-cpufreq-hw.ko