// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
 * Copyright (c) 2022-2024, Qualcomm Innovation Center, Inc. All rights reserved.
 */

#include <trace/hooks/sched.h>
#include <trace/hooks/binder.h>

#include "walt.h"
#include "trace.h"

#include <../../../drivers/android/binder_internal.h>
#include "../../../drivers/android/binder_trace.h"

static void create_util_to_cost_pd(struct em_perf_domain *pd)
{
	int util, cpu = cpumask_first(to_cpumask(pd->cpus));
	unsigned long fmax;
	unsigned long scale_cpu;
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu);
	struct walt_sched_cluster *cluster = wrq->cluster;

	fmax = (u64)pd->table[pd->nr_perf_states - 1].frequency;
	scale_cpu = arch_scale_cpu_capacity(cpu);

	for (util = 0; util < 1024; util++) {
		int j;
		int f = (fmax * util) / scale_cpu;
		struct em_perf_state *ps = &pd->table[0];

		for (j = 0; j < pd->nr_perf_states; j++) {
			ps = &pd->table[j];
			if (ps->frequency >= f)
				break;
		}
		cluster->util_to_cost[util] = ps->cost;
	}
}

void create_util_to_cost(void)
{
	struct perf_domain *pd;
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;

	rcu_read_lock();
	pd = rcu_dereference(rd->pd);
	for (; pd; pd = pd->next)
		create_util_to_cost_pd(pd->em_pd);
	rcu_read_unlock();
}

DECLARE_PER_CPU(unsigned long, gov_last_util);

/* Migration margins */
unsigned int sched_capacity_margin_up[WALT_NR_CPUS] = {
	[0 ... WALT_NR_CPUS-1] = 1078 /* ~5% margin */
};
unsigned int sched_capacity_margin_down[WALT_NR_CPUS] = {
	[0 ... WALT_NR_CPUS-1] = 1205 /* ~15% margin */
};

/* Migration margins for topapp */
unsigned int sched_capacity_margin_early_up[WALT_NR_CPUS] = {
	[0 ... WALT_NR_CPUS-1] = 1078 /* ~5% margin */
};
unsigned int sched_capacity_margin_early_down[WALT_NR_CPUS] = {
	[0 ... WALT_NR_CPUS-1] = 1205 /* ~15% margin */
};

static inline bool bias_to_this_cpu(struct task_struct *p, int cpu, int start_cpu)
{
	bool base_test = cpumask_test_cpu(cpu, p->cpus_ptr) && cpu_active(cpu);
	bool start_cap_test = !check_for_higher_capacity(start_cpu, cpu);

	return base_test && start_cap_test;
}

static inline bool task_demand_fits(struct task_struct *p, int dst_cpu)
{
	if (is_max_possible_cluster_cpu(dst_cpu))
		return true;

	if (!task_in_related_thread_group(p) && p->prio >= 124 &&
	    !is_min_possible_cluster_cpu(dst_cpu) &&
	    !is_max_possible_cluster_cpu(dst_cpu)) {
		/* a non-topapp low prio task fits on gold */
		return true;
	}

	return task_fits_capacity(p, dst_cpu);
}

struct find_best_target_env {
	bool is_rtg;
	int need_idle;
	int fastpath;
	int start_cpu;
	int order_index;
	int end_index;
	bool strict_max;
	int skip_cpu;
	u64 prs[8];
};

/*
 * cpu_util_without: compute cpu utilization without any contributions from *p
 * @cpu: the CPU whose utilization is requested
 * @p: the task whose utilization should be discounted
 *
 * The utilization of a CPU is defined by the utilization of tasks currently
 * enqueued on that CPU as well as tasks which are currently sleeping after an
 * execution on that CPU.
 *
 * This method returns the utilization of the specified CPU by discounting the
 * utilization of the specified task, whenever the task is currently
 * contributing to the CPU utilization.
 */
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
	unsigned int util;

	/*
	 * WALT does not decay idle tasks in the same manner
	 * as PELT, so it makes little sense to subtract task
	 * utilization from cpu utilization. Instead just use
	 * cpu_util for this case.
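	 * A waking task has already been dequeued, so under WALT its demand
	 * is no longer part of any CPU's cumulative runnable average and
	 * there is nothing to subtract.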
	 */
	if (likely(READ_ONCE(p->__state) == TASK_WAKING))
		return cpu_util(cpu);

	/* Task has no contribution or is new */
	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
		return cpu_util(cpu);

	util = max_t(long, cpu_util(cpu) - task_util(p), 0);

	/*
	 * Utilization (estimated) can exceed the CPU capacity, thus let's
	 * clamp to the maximum CPU capacity to ensure consistency with
	 * the cpu_util call.
	 */
	return min_t(unsigned long, util, capacity_orig_of(cpu));
}

static inline bool walt_task_skip_min_cpu(struct task_struct *p)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;

	return (sched_boost_type != CONSERVATIVE_BOOST) && walt_get_rtg_status(p) &&
		(wts->unfilter || walt_pipeline_low_latency_task(p));
}

static inline bool walt_is_many_wakeup(int sibling_count_hint)
{
	return sibling_count_hint >= sysctl_sched_many_wakeup_threshold;
}

static inline bool walt_target_ok(int target_cpu, int order_index)
{
	return !((order_index != num_sched_clusters - 1) &&
		 (cpumask_weight(&cpu_array[order_index][0]) == 1) &&
		 (target_cpu == cpumask_first(&cpu_array[order_index][0])));
}

#define MIN_UTIL_FOR_ENERGY_EVAL	52
static void walt_get_indicies(struct task_struct *p, int *order_index,
			      int *end_index, int per_task_boost,
			      bool is_uclamp_boosted, bool *energy_eval_needed)
{
	*order_index = 0;
	*end_index = 0;

	if (num_sched_clusters <= 1)
		return;

	if (per_task_boost > TASK_BOOST_ON_MID) {
		*order_index = num_sched_clusters - 1;
		*energy_eval_needed = false;
		return;
	}

	if (is_full_throttle_boost()) {
		*energy_eval_needed = false;
		*order_index = num_sched_clusters - 1;

		*end_index = num_sched_clusters - 2;
		for (; *end_index >= 0; (*end_index)--)
			if (task_demand_fits(p,
				cpumask_first(&cpu_array[*order_index][*end_index])))
				break;
		return;
	}

	if (is_uclamp_boosted || per_task_boost ||
	    task_boost_policy(p) == SCHED_BOOST_ON_BIG ||
	    walt_task_skip_min_cpu(p)) {
		*energy_eval_needed = false;
		*order_index = 1;
		*end_index = max(0, num_sched_clusters - 3);
		if (sysctl_sched_asymcap_boost) {
			(*end_index)++;
			return;
		}
	}

	for (; *order_index < num_sched_clusters - 1; (*order_index)++) {
		if (task_demand_fits(p, cpumask_first(&cpu_array[*order_index][0])))
			break;
	}

	if (*order_index == 0 &&
	    (task_util(p) >= MIN_UTIL_FOR_ENERGY_EVAL) &&
	    !(p->in_iowait && task_in_related_thread_group(p)) &&
	    !walt_get_rtg_status(p) &&
	    !(sched_boost_type == CONSERVATIVE_BOOST && task_sched_boost(p)) &&
	    !sysctl_sched_suppress_region2)
		*end_index = 1;

	if (p->in_iowait && task_in_related_thread_group(p))
		*energy_eval_needed = false;
}

enum fastpaths {
	NONE = 0,
	SYNC_WAKEUP,
	PREV_CPU_FASTPATH,
	CLUSTER_PACKING_FASTPATH,
	PIPELINE_FASTPATH,
};

static inline bool is_complex_sibling_idle(int cpu)
{
	if (cpu_l2_sibling[cpu] != -1)
		return available_idle_cpu(cpu_l2_sibling[cpu]);

	return false;
}

static inline bool walt_should_reject_fbt_cpu(struct walt_rq *wrq, struct task_struct *p,
					      int cpu, int order_index,
					      struct find_best_target_env *fbt_env)
{
	if (!cpu_active(cpu))
		return true;

	if (cpu_halted(cpu))
		return true;

	if (order_index != 0 && cpu_partial_halted(cpu))
		return true;

	/*
	 * This CPU is the target of an active migration that's
	 * yet to complete. Avoid placing another task on it.
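	 * The reservation is cleared again once that migration completes.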
	 */
	if (is_reserved(cpu))
		return true;

	if (sched_cpu_high_irqload(cpu))
		return true;

	if (fbt_env->skip_cpu == cpu)
		return true;

	if (wrq->num_mvp_tasks > 0 && per_task_boost(p) != TASK_BOOST_STRICT_MAX)
		return true;

	return false;
}

bool select_prev_cpu_fastpath(int prev_cpu, int start_cpu, int order_index,
			      struct task_struct *p)
{
	struct walt_rq *prev_wrq = &per_cpu(walt_rq, prev_cpu);
	struct walt_rq *start_wrq = &per_cpu(walt_rq, start_cpu);
	bool valid_part_haltable_prev_cpu = false, valid_prev_cpu = false;

	if (!cpu_active(prev_cpu))
		return false;

	if (!available_idle_cpu(prev_cpu))
		return false;

	if (!cpumask_test_cpu(prev_cpu, p->cpus_ptr))
		return false;

	if (cpu_halted(prev_cpu))
		return false;

	if (is_reserved(prev_cpu))
		return false;

	valid_part_haltable_prev_cpu = cpumask_test_cpu(prev_cpu, &part_haltable_cpus) &&
		((order_index == 0 && cpu_partial_halted(prev_cpu)) ||
		 (order_index == 1 && !cpu_partial_halted(prev_cpu)));

	valid_prev_cpu = (prev_wrq->cluster->id == start_wrq->cluster->id);

	if (!(valid_part_haltable_prev_cpu || valid_prev_cpu))
		return false;

	return true;
}

#define DIRE_STRAITS_PREV_NR_LIMIT 10

static void walt_find_best_target(struct sched_domain *sd, cpumask_t *candidates,
				  struct task_struct *p,
				  struct find_best_target_env *fbt_env)
{
	unsigned long min_task_util = uclamp_task_util(p);
	long target_max_spare_cap = 0;
	unsigned long best_idle_cuml_util = ULONG_MAX;
	unsigned int min_exit_latency = UINT_MAX;
	int i, start_cpu;
	long spare_wake_cap, most_spare_wake_cap = 0;
	int most_spare_cap_cpu = -1;
	int least_nr_cpu = -1;
	unsigned int cpu_rq_runnable_cnt = UINT_MAX;
	int prev_cpu = task_cpu(p);
	int order_index = fbt_env->order_index, end_index = fbt_env->end_index;
	int stop_index = INT_MAX;
	int cluster;
	unsigned int target_nr_rtg_high_prio = UINT_MAX;
	bool rtg_high_prio_task = task_rtg_high_prio(p);
	cpumask_t visit_cpus;
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
	int packing_cpu;
	long most_spare_wake_cap_target_clusters = LONG_MIN;
	int most_spare_cap_target_cluster_cpu = -1;

	/* Find start CPU based on boost value */
	start_cpu = fbt_env->start_cpu;

	/*
	 * For I/O tasks that are worth placing on a higher capacity cluster,
	 * stop the search at the end of the higher capacity cluster(s).
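	 * stop_index caps how far down the cluster list the search below goes
	 * once a fallback (most spare capacity) CPU has been found.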
	 */
	if (order_index > 0 && wts->iowaited) {
		stop_index = num_sched_clusters - 2;
		most_spare_wake_cap = LONG_MIN;
	}

	if (fbt_env->strict_max) {
		stop_index = 0;
		most_spare_wake_cap = LONG_MIN;
	}

	/* fast path for packing_cpu */
	packing_cpu = walt_find_and_choose_cluster_packing_cpu(start_cpu, p);
	if (packing_cpu >= 0) {
		fbt_env->fastpath = CLUSTER_PACKING_FASTPATH;
		cpumask_set_cpu(packing_cpu, candidates);
		goto out;
	}

	/* fast path for prev_cpu */
	if (select_prev_cpu_fastpath(prev_cpu, start_cpu, order_index, p)) {
		fbt_env->fastpath = PREV_CPU_FASTPATH;
		cpumask_set_cpu(prev_cpu, candidates);
		goto out;
	}

	for (cluster = 0; cluster < num_sched_clusters; cluster++) {
		int best_idle_cpu_cluster = -1;
		int target_cpu_cluster = -1;
		int this_complex_idle = 0;
		int best_complex_idle = 0;

		target_max_spare_cap = 0;
		min_exit_latency = INT_MAX;
		best_idle_cuml_util = ULONG_MAX;

		cpumask_and(&visit_cpus, p->cpus_ptr, &cpu_array[order_index][cluster]);
		for_each_cpu(i, &visit_cpus) {
			unsigned long capacity_orig = capacity_orig_of(i);
			unsigned long wake_cpu_util, new_cpu_util, new_util_cuml;
			long spare_cap;
			unsigned int idle_exit_latency = UINT_MAX;
			struct walt_rq *wrq = &per_cpu(walt_rq, i);

			trace_sched_cpu_util(i, NULL);

			/* record the prss as we visit cpus in a cluster */
			fbt_env->prs[i] = wrq->prev_runnable_sum +
					  wrq->grp_time.prev_runnable_sum;

			if (walt_should_reject_fbt_cpu(wrq, p, i, order_index, fbt_env))
				continue;

			/*
			 * p's blocked utilization is still accounted for on prev_cpu
			 * so prev_cpu will receive a negative bias due to the double
			 * accounting. However, the blocked utilization may be zero.
			 */
			wake_cpu_util = cpu_util_without(i, p);
			spare_wake_cap = capacity_orig - wake_cpu_util;

			if (spare_wake_cap > most_spare_wake_cap) {
				most_spare_wake_cap = spare_wake_cap;
				most_spare_cap_cpu = i;
			}

			/*
			 * Keep track of the least loaded CPU, which can be used as a
			 * fallback placement core for a BIG rtg task in case all
			 * the cores are busy; this is to avoid the prev_cpu
			 * fallback mechanism.
			 */
			if ((cluster <= end_index) &&
			    (spare_wake_cap > most_spare_wake_cap_target_clusters)) {
				most_spare_wake_cap_target_clusters = spare_wake_cap;
				most_spare_cap_target_cluster_cpu = i;
			}

			/*
			 * Keep track of runnables for each CPU; if none of the
			 * CPUs have spare capacity then use the CPU with the
			 * least number of runnables.
			 */
			if (cpu_rq(i)->nr_running < cpu_rq_runnable_cnt) {
				cpu_rq_runnable_cnt = cpu_rq(i)->nr_running;
				least_nr_cpu = i;
			}

			/*
			 * Ensure minimum capacity to grant the required boost.
			 * The target CPU can be already at a capacity level higher
			 * than the one required to boost the task.
			 */
			new_cpu_util = wake_cpu_util + min_task_util;
			if (new_cpu_util > capacity_orig)
				continue;

			/*
			 * Find an optimal backup IDLE CPU for non latency
			 * sensitive tasks.
			 *
			 * Looking for:
			 * - favoring shallowest idle states
			 *   i.e. avoid waking up deep-idle CPUs
			 *
			 * The following code path is used by non latency
			 * sensitive tasks if IDLE CPUs are available. If at
			 * least one such CPU is available it sets the
			 * best_idle_cpu to the most suitable idle CPU to be
			 * selected.
			 *
			 * If idle CPUs are available, favour these CPUs to
			 * improve performance by spreading tasks.
			 * Indeed, the energy_diff() computed by the caller
			 * will take care to ensure the minimization of energy
			 * consumption without affecting performance.
			 */
			if (available_idle_cpu(i)) {
				idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(i));
				this_complex_idle = is_complex_sibling_idle(i) ?
							1 : 0;
				if (this_complex_idle < best_complex_idle)
					continue;

				/*
				 * Prefer the shallowest idle state over deeper
				 * ones among CPUs of the same capacity.
				 */
				if (idle_exit_latency > min_exit_latency)
					continue;

				new_util_cuml = cpu_util_cum(i);
				if (min_exit_latency == idle_exit_latency &&
				    (best_idle_cpu_cluster == prev_cpu ||
				     (i != prev_cpu &&
				      new_util_cuml > best_idle_cuml_util)))
					continue;

				min_exit_latency = idle_exit_latency;
				best_idle_cuml_util = new_util_cuml;
				best_idle_cpu_cluster = i;
				best_complex_idle = this_complex_idle;
				continue;
			}

			/* skip visiting any more busy CPUs if an idle one was found */
			if (best_idle_cpu_cluster != -1)
				continue;

			/*
			 * Compute the maximum possible capacity we expect
			 * to have available on this CPU once the task is
			 * enqueued here.
			 */
			spare_cap = capacity_orig - new_cpu_util;

			/*
			 * Try to spread the rtg high prio tasks so that they
			 * don't preempt each other. This is an optimistic
			 * check assuming rtg high prio can actually preempt
			 * the current running task with the given vruntime
			 * boost.
			 */
			if (rtg_high_prio_task) {
				if (walt_nr_rtg_high_prio(i) > target_nr_rtg_high_prio)
					continue;

				/* Favor CPUs with maximum spare capacity */
				if (walt_nr_rtg_high_prio(i) == target_nr_rtg_high_prio &&
				    spare_cap < target_max_spare_cap)
					continue;
			} else {
				/* Favor CPUs with maximum spare capacity */
				if (spare_cap < target_max_spare_cap)
					continue;
			}

			target_max_spare_cap = spare_cap;
			target_nr_rtg_high_prio = walt_nr_rtg_high_prio(i);
			target_cpu_cluster = i;
		}

		if (best_idle_cpu_cluster != -1)
			cpumask_set_cpu(best_idle_cpu_cluster, candidates);
		else if (target_cpu_cluster != -1)
			cpumask_set_cpu(target_cpu_cluster, candidates);

		if ((cluster >= end_index) && (!cpumask_empty(candidates)) &&
		    walt_target_ok(target_cpu_cluster, order_index))
			break;

		if (most_spare_cap_cpu != -1 && cluster >= stop_index)
			break;
	}

	/*
	 * We have set idle or target as long as they are valid CPUs.
	 * If we find neither, fall back to most_spare_cap. If there is
	 * no most-spare-cap CPU either, fall back to prev_cpu, provided
	 * that prev_cpu is active and has fewer than
	 * DIRE_STRAITS_PREV_NR_LIMIT runnables; otherwise, fall back to
	 * the CPU with the least number of runnables.
	 */
	if (unlikely(cpumask_empty(candidates))) {
		if (most_spare_cap_cpu != -1)
			cpumask_set_cpu(most_spare_cap_cpu, candidates);
		else if (most_spare_cap_target_cluster_cpu != -1 &&
			 (order_index > 0) && fbt_env->is_rtg)
			cpumask_set_cpu(most_spare_cap_target_cluster_cpu, candidates);
		else if (cpu_active(prev_cpu) &&
			 (cpu_rq(prev_cpu)->nr_running < DIRE_STRAITS_PREV_NR_LIMIT))
			cpumask_set_cpu(prev_cpu, candidates);
		else if (least_nr_cpu != -1)
			cpumask_set_cpu(least_nr_cpu, candidates);
	}

out:
	trace_sched_find_best_target(p, min_task_util, start_cpu,
				     cpumask_bits(candidates)[0], most_spare_cap_cpu,
				     order_index, end_index, fbt_env->skip_cpu,
				     task_on_rq_queued(p), least_nr_cpu,
				     cpu_rq_runnable_cnt,
				     most_spare_cap_target_cluster_cpu);
}

static inline unsigned long cpu_util_next_walt(int cpu, struct task_struct *p, int dst_cpu)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu);
	unsigned long util = wrq->walt_stats.cumulative_runnable_avg_scaled;
	bool queued = task_on_rq_queued(p);

	/*
	 * When the task is queued:
	 * (a) The evaluating CPU (cpu) is the task's current CPU. If the
	 *     task is migrating, discount the task contribution from the
	 *     evaluation cpu.
	 * (b) The evaluating CPU (cpu) is the task's current CPU. If the
	 *     task is NOT migrating, nothing to do. The contribution is
	 *     already present on the evaluation CPU.
	 * (c) The evaluating CPU (cpu) is not the task's current CPU.
	 *     But the task is migrating to the evaluating CPU, so add the
	 *     task contribution to it.
	 * (d) The evaluating CPU (cpu) is neither the current CPU nor
	 *     the destination CPU. Don't care.
	 *
	 * When the task is NOT queued, i.e. waking, its contribution is not
	 * present on any CPU.
	 *
	 * (a) If the evaluating CPU is the destination CPU, add the task
	 *     contribution.
	 * (b) The evaluating CPU is not the destination CPU. Don't care.
	 */
	if (unlikely(queued)) {
		if (task_cpu(p) == cpu) {
			if (dst_cpu != cpu)
				util = max_t(long, util - task_util(p), 0);
		} else if (dst_cpu == cpu) {
			util += task_util(p);
		}
	} else if (dst_cpu == cpu) {
		util += task_util(p);
	}

	return min_t(unsigned long, util, capacity_orig_of(cpu));
}

static inline u64 cpu_util_next_walt_prs(int cpu, struct task_struct *p, int dst_cpu,
					 bool prev_dst_same_cluster, u64 *prs)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
	long util = prs[cpu];

	if (wts->prev_window) {
		if (!prev_dst_same_cluster) {
			/* intercluster migration of non rtg task - mimic fixups */
			util -= wts->prev_window_cpu[cpu];
			if (util < 0)
				util = 0;
			if (cpu == dst_cpu)
				util += wts->prev_window;
		}
	} else {
		if (cpu == dst_cpu)
			util += wts->demand;
	}

	return util;
}

static inline unsigned long get_util_to_cost(int cpu, unsigned long util)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu);

	if (cpu == 0 && util > sysctl_em_inflate_thres)
		return mult_frac(wrq->cluster->util_to_cost[util],
				 sysctl_em_inflate_pct, 100);
	else
		return wrq->cluster->util_to_cost[util];
}

/**
 * walt_em_cpu_energy() - Estimates the energy consumed by the CPUs of a performance domain
 * @pd		: performance domain for which energy has to be estimated
 * @max_util	: highest utilization among CPUs of the domain
 * @sum_util	: sum of the utilization of all CPUs in the domain
 *
 * This function must be used only for CPU devices. There is no validation,
 * i.e. whether the EM is a CPU type and has a cpumask allocated. It is called
 * from the scheduler code quite frequently and that is why there are no
 * checks.
 *
 * Return: the sum of the energy consumed by the CPUs of the domain assuming
 * a capacity state satisfying the max utilization of the domain.
 */
static inline unsigned long walt_em_cpu_energy(struct em_perf_domain *pd,
					       unsigned long max_util, unsigned long sum_util,
					       struct compute_energy_output *output,
					       unsigned int x)
{
	unsigned long scale_cpu, cost;
	int cpu;

	if (!sum_util)
		return 0;

	/*
	 * In order to predict the capacity state, map the utilization of the
	 * most utilized CPU of the performance domain to a requested frequency,
	 * like schedutil.
	 */
	cpu = cpumask_first(to_cpumask(pd->cpus));
	scale_cpu = arch_scale_cpu_capacity(cpu);

	max_util = max_util + (max_util >> 2); /* account for TARGET_LOAD, usually 80 */
	max_util = max(max_util,
		       (arch_scale_freq_capacity(cpu) * scale_cpu) >> SCHED_CAPACITY_SHIFT);

	/*
	 * The capacity of a CPU in the domain at the performance state (ps)
	 * can be computed as:
	 *
	 *             ps->freq * scale_cpu
	 *   ps->cap = --------------------                          (1)
	 *                 cpu_max_freq
	 *
	 * So, ignoring the costs of idle states (which are not available in
	 * the EM), the energy consumed by this CPU at that performance state
	 * is estimated as:
	 *
	 *             ps->power * cpu_util
	 *   cpu_nrg = --------------------                          (2)
	 *                   ps->cap
	 *
	 * since 'cpu_util / ps->cap' represents its percentage of busy time.
	 *
	 *   NOTE: Although the result of this computation actually is in
	 *         units of power, it can be manipulated as an energy value
	 *         over a scheduling period, since it is assumed to be
	 *         constant during that interval.
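	 *
	 *   NOTE: max_util is clamped to 1023 further below because the
	 *         per-cluster util_to_cost[] table built by
	 *         create_util_to_cost() has exactly 1024 entries.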
	 *
	 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
	 * of two terms:
	 *
	 *             ps->power * cpu_max_freq   cpu_util
	 *   cpu_nrg = ------------------------ * ---------          (3)
	 *                    ps->freq            scale_cpu
	 *
	 * The first term is static, and is stored in the em_perf_state struct
	 * as 'ps->cost'.
	 *
	 * Since all CPUs of the domain have the same micro-architecture, they
	 * share the same 'ps->cost', and the same CPU capacity. Hence, the
	 * total energy of the domain (which is the simple sum of the energy of
	 * all of its CPUs) can be factorized as:
	 *
	 *            ps->cost * \Sum cpu_util
	 *   pd_nrg = ------------------------                       (4)
	 *                  scale_cpu
	 */
	if (max_util >= 1024)
		max_util = 1023;

	cost = get_util_to_cost(cpu, max_util);
	if (output) {
		output->cost[x] = cost;
		output->max_util[x] = max_util;
		output->sum_util[x] = sum_util;
	}

	return cost * sum_util / scale_cpu;
}

/*
 * walt_pd_compute_energy(): Estimates the energy that @pd would consume if @p was
 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
 * to compute what would be the energy if we decided to actually migrate that
 * task.
 */
static long walt_pd_compute_energy(struct task_struct *p, int dst_cpu,
				   struct perf_domain *pd, u64 *prs,
				   struct compute_energy_output *output, unsigned int x)
{
	struct cpumask *pd_mask = perf_domain_span(pd);
	unsigned long max_util = 0, sum_util = 0;
	int cpu;
	unsigned long cpu_util;
	bool prev_dst_same_cluster = false;

	if (same_cluster(task_cpu(p), dst_cpu))
		prev_dst_same_cluster = true;

	/*
	 * The capacity state of CPUs of the current rd can be driven by CPUs
	 * of another rd if they belong to the same pd. So, account for the
	 * utilization of these CPUs too by masking pd with cpu_online_mask
	 * instead of the rd span.
	 *
	 * If an entire pd is outside of the current rd, it will not appear in
	 * its pd list and will not be accounted by compute_energy().
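	 *
	 * sum_util below is built from the WALT cumulative runnable averages,
	 * while max_util comes from the per-window runnable sums (prs) that
	 * were sampled in walt_find_best_target().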
	 */
	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
		sum_util += cpu_util_next_walt(cpu, p, dst_cpu);
		cpu_util = cpu_util_next_walt_prs(cpu, p, dst_cpu,
						  prev_dst_same_cluster, prs);
		max_util = max(max_util, cpu_util);
	}

	max_util = scale_time_to_util(max_util);
	if (output)
		output->cluster_first_cpu[x] = cpumask_first(pd_mask);

	return walt_em_cpu_energy(pd->em_pd, max_util, sum_util, output, x);
}

static inline long walt_compute_energy(struct task_struct *p, int dst_cpu,
				       struct perf_domain *pd, cpumask_t *candidates,
				       u64 *prs, struct compute_energy_output *output)
{
	long energy = 0;
	unsigned int x = 0;

	for (; pd; pd = pd->next) {
		struct cpumask *pd_mask = perf_domain_span(pd);

		if (cpumask_intersects(candidates, pd_mask) ||
		    cpumask_test_cpu(task_cpu(p), pd_mask)) {
			energy += walt_pd_compute_energy(p, dst_cpu, pd, prs, output, x);
			x++;
		}
	}

	return energy;
}

static inline int wake_to_idle(struct task_struct *p)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
	struct walt_task_struct *cur_wts =
		(struct walt_task_struct *) current->android_vendor_data1;

	return (cur_wts->wake_up_idle || wts->wake_up_idle);
}

/* return true if cpu should be chosen over best_energy_cpu */
static inline bool select_cpu_same_energy(int cpu, int best_cpu, int prev_cpu)
{
	bool new_cpu_is_idle = available_idle_cpu(cpu);
	bool best_cpu_is_idle = available_idle_cpu(best_cpu);

	if (check_for_higher_capacity(cpu, best_cpu))
		return false;

	if (check_for_higher_capacity(best_cpu, cpu))
		return true;

	if (best_cpu_is_idle && walt_get_idle_exit_latency(cpu_rq(best_cpu)) <= 1)
		return false;

	if (new_cpu_is_idle && walt_get_idle_exit_latency(cpu_rq(cpu)) <= 1)
		return true;

	if (best_cpu_is_idle && !new_cpu_is_idle)
		return false;

	if (new_cpu_is_idle && !best_cpu_is_idle)
		return true;

	if (best_cpu == prev_cpu)
		return false;

	if (cpu == prev_cpu)
		return true;

	if (best_cpu_is_idle && new_cpu_is_idle)
		return false;

	if (cpu_util(best_cpu) <= cpu_util(cpu))
		return false;

	return true;
}

static inline unsigned int capacity_spare_of(int cpu)
{
	return capacity_orig_of(cpu) - cpu_util(cpu);
}

static DEFINE_PER_CPU(cpumask_t, energy_cpus);

int walt_find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync,
				   int sibling_count_hint)
{
	unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
	struct root_domain *rd = cpu_rq(cpumask_first(cpu_active_mask))->rd;
	int weight, cpu = smp_processor_id(), best_energy_cpu = prev_cpu;
	struct perf_domain *pd;
	unsigned long cur_energy;
	cpumask_t *candidates;
	bool is_rtg, curr_is_rtg;
	struct find_best_target_env fbt_env;
	bool need_idle = wake_to_idle(p);
	u64 start_t = 0;
	int delta = 0;
	int task_boost = per_task_boost(p);
	bool uclamp_boost = walt_uclamp_boosted(p);
	int start_cpu = 0, order_index, end_index;
	int first_cpu;
	bool energy_eval_needed = true;
	struct compute_energy_output output;
	struct walt_task_struct *wts;
	int pipeline_cpu;

	if (walt_is_many_wakeup(sibling_count_hint) && prev_cpu != cpu &&
	    cpumask_test_cpu(prev_cpu, p->cpus_ptr))
		return prev_cpu;

	if (unlikely(!cpu_array))
		return prev_cpu;

	/* Pre-select a set of candidate CPUs.
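	 * walt_find_best_target() nominates at most one CPU per visited
	 * cluster, plus a single fallback CPU in the dire-straits cases.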
	 */
	candidates = this_cpu_ptr(&energy_cpus);
	cpumask_clear(candidates);

	wts = (struct walt_task_struct *) p->android_vendor_data1;
	pipeline_cpu = wts->pipeline_cpu;
	if ((wts->low_latency & WALT_LOW_LATENCY_MASK) &&
	    (pipeline_cpu != -1) &&
	    walt_task_skip_min_cpu(p) &&
	    cpumask_test_cpu(pipeline_cpu, p->cpus_ptr) &&
	    cpu_active(pipeline_cpu) &&
	    !cpu_halted(pipeline_cpu)) {
		if ((p == cpu_rq(pipeline_cpu)->curr) ||
		    !walt_pipeline_low_latency_task(cpu_rq(pipeline_cpu)->curr)) {
			best_energy_cpu = pipeline_cpu;
			fbt_env.fastpath = PIPELINE_FASTPATH;
			goto out;
		}
	}

	walt_get_indicies(p, &order_index, &end_index, task_boost, uclamp_boost,
			  &energy_eval_needed);
	start_cpu = cpumask_first(&cpu_array[order_index][0]);

	is_rtg = task_in_related_thread_group(p);
	curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr);

	if (trace_sched_task_util_enabled())
		start_t = sched_clock();

	rcu_read_lock();

	need_idle |= uclamp_latency_sensitive(p);

	fbt_env.fastpath = 0;
	fbt_env.need_idle = need_idle;

	if (sync && (need_idle || (is_rtg && curr_is_rtg)))
		sync = 0;

	if (sysctl_sched_sync_hint_enable && sync &&
	    bias_to_this_cpu(p, cpu, start_cpu) && !cpu_halted(cpu)) {
		best_energy_cpu = cpu;
		fbt_env.fastpath = SYNC_WAKEUP;
		goto unlock;
	}

	/* if symmetrical system, default to upstream behavior */
	pd = rcu_dereference(rd->pd);
	if (!pd)
		goto fail;

	fbt_env.is_rtg = is_rtg;
	fbt_env.start_cpu = start_cpu;
	fbt_env.order_index = order_index;
	fbt_env.end_index = end_index;
	fbt_env.strict_max = is_rtg && (task_boost == TASK_BOOST_STRICT_MAX);
	fbt_env.skip_cpu = walt_is_many_wakeup(sibling_count_hint) ? cpu : -1;

	walt_find_best_target(NULL, candidates, p, &fbt_env);

	/* Bail out if no candidate was found. */
	weight = cpumask_weight(candidates);
	if (!weight)
		goto unlock;

	first_cpu = cpumask_first(candidates);

	if (fbt_env.fastpath == CLUSTER_PACKING_FASTPATH) {
		best_energy_cpu = first_cpu;
		goto unlock;
	}

	if (weight == 1) {
		if (available_idle_cpu(first_cpu) || first_cpu == prev_cpu) {
			best_energy_cpu = first_cpu;
			goto unlock;
		}
	}

	if (need_idle && available_idle_cpu(first_cpu)) {
		best_energy_cpu = first_cpu;
		goto unlock;
	}

	if (!energy_eval_needed) {
		int max_spare_cpu = first_cpu;

		for_each_cpu(cpu, candidates) {
			if (capacity_spare_of(max_spare_cpu) < capacity_spare_of(cpu))
				max_spare_cpu = cpu;
		}
		best_energy_cpu = max_spare_cpu;
		goto unlock;
	}

	if (READ_ONCE(p->__state) == TASK_WAKING)
		delta = task_util(p);

	if (cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
	    !__cpu_overutilized(prev_cpu, delta)) {
		if (trace_sched_compute_energy_enabled()) {
			memset(&output, 0, sizeof(output));
			prev_energy = walt_compute_energy(p, prev_cpu, pd, candidates,
							  fbt_env.prs, &output);
		} else {
			prev_energy = walt_compute_energy(p, prev_cpu, pd, candidates,
							  fbt_env.prs, NULL);
		}
		best_energy = prev_energy;
		trace_sched_compute_energy(p, prev_cpu, prev_energy, 0, 0, 0, &output);
	} else {
		prev_energy = best_energy = ULONG_MAX;
		if (weight == 1) {
			best_energy_cpu = first_cpu;
			goto unlock;
		}
	}

	/* Select the best candidate energy-wise.
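	 * prev_cpu was already evaluated (or ruled out) above, so it is
	 * skipped in the loop below.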
	 */
	for_each_cpu(cpu, candidates) {
		if (cpu == prev_cpu)
			continue;

		if (trace_sched_compute_energy_enabled()) {
			memset(&output, 0, sizeof(output));
			cur_energy = walt_compute_energy(p, cpu, pd, candidates,
							 fbt_env.prs, &output);
		} else {
			cur_energy = walt_compute_energy(p, cpu, pd, candidates,
							 fbt_env.prs, NULL);
		}

		if (cur_energy < best_energy) {
			best_energy = cur_energy;
			best_energy_cpu = cpu;
		} else if (cur_energy == best_energy) {
			if (select_cpu_same_energy(cpu, best_energy_cpu, prev_cpu)) {
				best_energy = cur_energy;
				best_energy_cpu = cpu;
			}
		}
		trace_sched_compute_energy(p, cpu, cur_energy, prev_energy,
					   best_energy, best_energy_cpu, &output);
	}

	/*
	 * Pick the prev CPU if the best energy CPU can't save at least
	 * ~3% (prev_energy >> 5) of the energy used by prev_cpu.
	 */
	if (!(available_idle_cpu(best_energy_cpu) &&
	      walt_get_idle_exit_latency(cpu_rq(best_energy_cpu)) <= 1) &&
	    (prev_energy != ULONG_MAX) && (best_energy_cpu != prev_cpu) &&
	    ((prev_energy - best_energy) <= prev_energy >> 5) &&
	    !check_for_higher_capacity(prev_cpu, start_cpu))
		best_energy_cpu = prev_cpu;

unlock:
	rcu_read_unlock();

out:
	if (best_energy_cpu < 0 || best_energy_cpu >= WALT_NR_CPUS)
		best_energy_cpu = prev_cpu;

	trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu, sync,
			      fbt_env.need_idle, fbt_env.fastpath, start_t, uclamp_boost,
			      start_cpu);

	return best_energy_cpu;

fail:
	rcu_read_unlock();
	return -1;
}

static void walt_select_task_rq_fair(void *unused, struct task_struct *p, int prev_cpu,
				     int sd_flag, int wake_flags, int *target_cpu)
{
	int sync;
	int sibling_count_hint;

	if (unlikely(walt_disabled))
		return;

	sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
	sibling_count_hint = p->wake_q_count;
	p->wake_q_count = 0;

	*target_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, sync, sibling_count_hint);
}

static void walt_binder_low_latency_set(void *unused, struct task_struct *task,
					bool sync, struct binder_proc *proc)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) task->android_vendor_data1;

	if (unlikely(walt_disabled))
		return;

	if (task && ((task_in_related_thread_group(current) &&
		      task->group_leader->prio < MAX_RT_PRIO) ||
		     (current->group_leader->prio < MAX_RT_PRIO &&
		      task_in_related_thread_group(task))))
		wts->low_latency |= WALT_LOW_LATENCY_BINDER;
	else
		/*
		 * Clear the low_latency flag if the criterion above is not met.
		 * This handles the use case where WALT_LOW_LATENCY_BINDER is
		 * set for a binder thread by one task and, before WALT clears
		 * the flag on timer expiry, some other task tries to use the
		 * same binder thread.
		 *
		 * The flag only gets cleared when a binder transaction is
		 * initiated and the above condition to set the flag is not
		 * satisfied.
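		 *
		 * This bit is what walt_binder_low_latency_task() is expected
		 * to check when walt_get_mvp_task_prio() below classifies
		 * binder MVP tasks.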
		 */
		wts->low_latency &= ~WALT_LOW_LATENCY_BINDER;
}

static void binder_set_priority_hook(void *data, struct binder_transaction *bndrtrans,
				     struct task_struct *task)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) task->android_vendor_data1;
	struct walt_task_struct *current_wts =
		(struct walt_task_struct *) current->android_vendor_data1;

	if (unlikely(walt_disabled))
		return;

	if (bndrtrans && bndrtrans->need_reply && current_wts->boost == TASK_BOOST_STRICT_MAX) {
		bndrtrans->android_vendor_data1 = wts->boost;
		wts->boost = TASK_BOOST_STRICT_MAX;
	}
}

static void binder_restore_priority_hook(void *data, struct binder_transaction *bndrtrans,
					 struct task_struct *task)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) task->android_vendor_data1;

	if (unlikely(walt_disabled))
		return;

	if (bndrtrans && wts->boost == TASK_BOOST_STRICT_MAX)
		wts->boost = bndrtrans->android_vendor_data1;
}

/*
 * A higher prio MVP can preempt a lower prio MVP.
 *
 * However, the lower prio MVP's slice will be longer, since we expect them to
 * be the work horses. For example, binders will have higher MVP prio and
 * can preempt long running rtg prio tasks, but binders lose their powers
 * within 3 msec, whereas rtg prio tasks can run longer than that.
 */
int walt_get_mvp_task_prio(struct task_struct *p)
{
	if (walt_procfs_low_latency_task(p) ||
	    walt_pipeline_low_latency_task(p))
		return WALT_LL_PIPE_MVP;

	if (per_task_boost(p) == TASK_BOOST_STRICT_MAX)
		return WALT_TASK_BOOST_MVP;

	if (walt_binder_low_latency_task(p))
		return WALT_BINDER_MVP;

	if (task_rtg_high_prio(p))
		return WALT_RTG_MVP;

	return WALT_NOT_MVP;
}

static inline unsigned int walt_cfs_mvp_task_limit(struct task_struct *p)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;

	/* Binder MVP tasks are high prio but get only a single slice */
	if (wts->mvp_prio == WALT_BINDER_MVP)
		return WALT_MVP_SLICE;

	return WALT_MVP_LIMIT;
}

static void walt_cfs_insert_mvp_task(struct walt_rq *wrq, struct walt_task_struct *wts,
				     bool at_front)
{
	struct list_head *pos;

	list_for_each(pos, &wrq->mvp_tasks) {
		struct walt_task_struct *tmp_wts =
			container_of(pos, struct walt_task_struct, mvp_list);

		if (at_front) {
			if (wts->mvp_prio >= tmp_wts->mvp_prio)
				break;
		} else {
			if (wts->mvp_prio > tmp_wts->mvp_prio)
				break;
		}
	}

	list_add(&wts->mvp_list, pos->prev);
	wrq->num_mvp_tasks++;
}

void walt_cfs_deactivate_mvp_task(struct rq *rq, struct task_struct *p)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;

	list_del_init(&wts->mvp_list);
	wts->mvp_prio = WALT_NOT_MVP;
	wrq->num_mvp_tasks--;
}

/*
 * MVP task runtime update happens here. Three possibilities:
 *
 * de-activated: The MVP consumed its runtime. Non-MVP tasks can preempt.
 * slice expired: The MVP slice is expired and another MVP can preempt.
 * slice not expired: This MVP task can continue to run.
 */
#define MAX_MVP_TIME_NS		500000000ULL
#define MVP_THROTTLE_TIME_NS	100000000ULL

static void walt_cfs_account_mvp_runtime(struct rq *rq, struct task_struct *curr)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts = (struct walt_task_struct *) curr->android_vendor_data1;
	u64 slice;
	unsigned int limit;

	walt_lockdep_assert_rq(rq, NULL);

	/*
	 * RQ clock update happens in the tick path in the scheduler.
	 * Since we drop the lock in the scheduler before calling
	 * into the vendor hook, it is possible that the update flags are
	 * reset by another rq lock and unlock. Do the update here
	 * if required.
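	 * rq->clock must be current here because it is compared against the
	 * MVP arrival and throttle timestamps below.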
	 */
	if (!(rq->clock_update_flags & RQCF_UPDATED))
		update_rq_clock(rq);

	if (wrq->mvp_throttle_time) {
		if ((rq->clock - wrq->mvp_throttle_time) > MVP_THROTTLE_TIME_NS) {
			wrq->skip_mvp = false;
			wrq->mvp_throttle_time = 0;
		}
	} else if (wrq->mvp_arrival_time) {
		if ((rq->clock - wrq->mvp_arrival_time) > MAX_MVP_TIME_NS) {
			wrq->skip_mvp = true;
			wrq->mvp_arrival_time = 0;
			wrq->mvp_throttle_time = rq->clock;
		}
	}

	/*
	 * Continue accounting even in the skip_mvp state if an MVP task is
	 * selected by the scheduler core to run on this CPU.
	 */
	if (curr->se.sum_exec_runtime > wts->sum_exec_snapshot_for_total)
		wts->total_exec = curr->se.sum_exec_runtime - wts->sum_exec_snapshot_for_total;
	else
		wts->total_exec = 0;

	if (curr->se.sum_exec_runtime > wts->sum_exec_snapshot_for_slice)
		slice = curr->se.sum_exec_runtime - wts->sum_exec_snapshot_for_slice;
	else
		slice = 0;

	/* slice is not expired */
	if (slice < WALT_MVP_SLICE)
		return;

	wts->sum_exec_snapshot_for_slice = curr->se.sum_exec_runtime;

	/*
	 * The slice is expired. Check if we have to deactivate the
	 * MVP task, otherwise requeue the task in the list so
	 * that other MVP tasks get a chance.
	 */
	limit = walt_cfs_mvp_task_limit(curr);
	if (wts->total_exec > limit) {
		walt_cfs_deactivate_mvp_task(rq, curr);
		trace_walt_cfs_deactivate_mvp_task(curr, wts, limit);
		return;
	}

	if (wrq->num_mvp_tasks == 1)
		return;

	/* slice expired. re-queue the task */
	list_del(&wts->mvp_list);
	wrq->num_mvp_tasks--;
	walt_cfs_insert_mvp_task(wrq, wts, false);
}

void walt_cfs_enqueue_task(struct rq *rq, struct task_struct *p)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
	int mvp_prio = walt_get_mvp_task_prio(p);

	if (mvp_prio == WALT_NOT_MVP)
		return;

	/*
	 * This can happen during migration or enq/deq for prio/class change.
	 * The task was once MVP but got demoted; it will not be MVP again
	 * until it goes to sleep.
	 */
	if (wts->total_exec > walt_cfs_mvp_task_limit(p))
		return;

	wts->mvp_prio = mvp_prio;
	walt_cfs_insert_mvp_task(wrq, wts, task_on_cpu(rq, p));

	/*
	 * We inserted the task at the appropriate position. Take the
	 * task runtime snapshot. From now onwards we use this point as a
	 * baseline to enforce the slice and demotion.
	 */
	if (!wts->total_exec) { /* queue after sleep */
		wts->sum_exec_snapshot_for_total = p->se.sum_exec_runtime;
		wts->sum_exec_snapshot_for_slice = p->se.sum_exec_runtime;
	}
}

void walt_cfs_dequeue_task(struct rq *rq, struct task_struct *p)
{
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;

	if (!list_empty(&wts->mvp_list) && wts->mvp_list.next)
		walt_cfs_deactivate_mvp_task(rq, p);

	/*
	 * Reset the exec time during sleep so that it starts
	 * from scratch upon next wakeup. total_exec should
	 * be preserved when the task is enq/deq'd while it is on
	 * the runqueue.
	 */
	if (READ_ONCE(p->__state) != TASK_RUNNING)
		wts->total_exec = 0;
}

void walt_cfs_tick(struct rq *rq)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts =
		(struct walt_task_struct *) rq->curr->android_vendor_data1;
	bool skip_mvp;

	if (unlikely(walt_disabled))
		return;

	raw_spin_lock(&rq->__lock);
	if (list_empty(&wts->mvp_list) || (wts->mvp_list.next == NULL))
		goto out;

	/* Reschedule if the RQ's skip_mvp state changes */
	skip_mvp = wrq->skip_mvp;
	walt_cfs_account_mvp_runtime(rq, rq->curr);

	/*
	 * If current is no longer at the head of the MVP queue, or the
	 * skip_mvp state has changed, reschedule so that we can pick any
	 * other task, including MVP tasks.
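	 * Rescheduling is only worthwhile when more than one CFS task is
	 * runnable on this CPU.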
	 */
	if (((skip_mvp != wrq->skip_mvp) ||
	     (wrq->mvp_tasks.next != &wts->mvp_list)) && rq->cfs.h_nr_running > 1)
		resched_curr(rq);

out:
	raw_spin_unlock(&rq->__lock);
}

/*
 * When preempt = false and nopreempt = false, we leave the preemption
 * decision to CFS.
 */
static void walt_cfs_check_preempt_wakeup(void *unused, struct rq *rq, struct task_struct *p,
					  bool *preempt, bool *nopreempt, int wake_flags,
					  struct sched_entity *se, struct sched_entity *pse,
					  int next_buddy_marked, unsigned int granularity)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts_p = (struct walt_task_struct *) p->android_vendor_data1;
	struct task_struct *c = rq->curr;
	struct walt_task_struct *wts_c =
		(struct walt_task_struct *) rq->curr->android_vendor_data1;
	bool resched = false, skip_mvp;
	bool p_is_mvp, curr_is_mvp;

	if (unlikely(walt_disabled))
		return;

	p_is_mvp = !list_empty(&wts_p->mvp_list) && wts_p->mvp_list.next;
	curr_is_mvp = !list_empty(&wts_c->mvp_list) && wts_c->mvp_list.next;

	/*
	 * current is not MVP, so the preemption decision
	 * is simple.
	 */
	if (!curr_is_mvp) {
		if (p_is_mvp && !wrq->skip_mvp)
			goto preempt;
		return; /* CFS decides preemption */
	}

	/*
	 * current is MVP. Update its runtime before deciding the
	 * preemption.
	 */
	skip_mvp = wrq->skip_mvp;
	walt_cfs_account_mvp_runtime(rq, c);
	resched = (skip_mvp != wrq->skip_mvp) ||
		  (wrq->mvp_tasks.next != &wts_c->mvp_list);

	/*
	 * current is no longer eligible to run. It must have been
	 * picked (because of MVP) ahead of other tasks in the CFS
	 * tree, so drive preemption to pick up the next task from
	 * the tree, which also includes picking up the first in
	 * the MVP queue.
	 */
	if (resched)
		goto preempt;

	/* current is the first in the queue, so no preemption */
	*nopreempt = true;
	trace_walt_cfs_mvp_wakeup_nopreempt(c, wts_c, walt_cfs_mvp_task_limit(c));
	return;

preempt:
	*preempt = true;
	trace_walt_cfs_mvp_wakeup_preempt(p, wts_p, walt_cfs_mvp_task_limit(p));
}

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
#endif

extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se);

static void walt_cfs_replace_next_task_fair(void *unused, struct rq *rq,
					    struct task_struct **p,
					    struct sched_entity **se, bool *repick,
					    bool simple, struct task_struct *prev)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts;
	struct task_struct *mvp;
	struct cfs_rq *cfs_rq;

	if (unlikely(walt_disabled))
		return;

	if ((*p) && (*p) != prev && ((*p)->on_cpu == 1 || (*p)->on_rq == 0 ||
				     (*p)->on_rq == TASK_ON_RQ_MIGRATING ||
				     task_thread_info(*p)->cpu != cpu_of(rq)))
		WALT_BUG(WALT_BUG_UPSTREAM, *p,
			 "picked %s(%d) on_cpu=%d on_rq=%d p->cpu=%d cpu_of(rq)=%d kthread=%d\n",
			 (*p)->comm, (*p)->pid, (*p)->on_cpu, (*p)->on_rq,
			 task_thread_info(*p)->cpu, cpu_of(rq), ((*p)->flags & PF_KTHREAD));

	/* RQ is in MVP throttled state */
	if (wrq->skip_mvp)
		return;

	if (list_empty(&wrq->mvp_tasks)) {
		wrq->mvp_arrival_time = 0;
		return;
	}

	/* Return the first task from the MVP queue */
	wts = list_first_entry(&wrq->mvp_tasks, struct walt_task_struct, mvp_list);
	mvp = wts_to_ts(wts);

	*p = mvp;
	*se = &mvp->se;
	*repick = true;

	/* TODO: check with team if it is fine in case clock is not updated */
	/* Mark arrival of MVP task */
	if (!wrq->mvp_arrival_time)
		wrq->mvp_arrival_time = rq->clock;

	if (simple) {
		for_each_sched_entity((*se)) {
			/*
			 * TODO: If CFS_BANDWIDTH is enabled, we might pick
			 * from a throttled cfs_rq
			 */
			cfs_rq = cfs_rq_of(*se);
			set_next_entity(cfs_rq, *se);
		}
	}

	if ((*p) && (*p) != prev && ((*p)->on_cpu == 1 || (*p)->on_rq == 0 ||
				     (*p)->on_rq == TASK_ON_RQ_MIGRATING ||
				     task_thread_info(*p)->cpu != cpu_of(rq)))
		WALT_BUG(WALT_BUG_UPSTREAM, *p,
			 "picked %s(%d) on_cpu=%d on_rq=%d p->cpu=%d cpu_of(rq)=%d kthread=%d\n",
			 (*p)->comm, (*p)->pid, (*p)->on_cpu, (*p)->on_rq,
			 task_thread_info(*p)->cpu, cpu_of(rq), ((*p)->flags & PF_KTHREAD));

	trace_walt_cfs_mvp_pick_next(mvp, wts, walt_cfs_mvp_task_limit(mvp));
}

void walt_cfs_init(void)
{
	register_trace_android_rvh_select_task_rq_fair(walt_select_task_rq_fair, NULL);
	register_trace_android_vh_binder_wakeup_ilocked(walt_binder_low_latency_set, NULL);
	register_trace_android_vh_binder_set_priority(binder_set_priority_hook, NULL);
	register_trace_android_vh_binder_restore_priority(binder_restore_priority_hook, NULL);

	register_trace_android_rvh_check_preempt_wakeup(walt_cfs_check_preempt_wakeup, NULL);
	register_trace_android_rvh_replace_next_task_fair(walt_cfs_replace_next_task_fair, NULL);
}