// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved.
 */

/*
 * Scheduler hook for average runqueue determination
 */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/math64.h>

#include "walt.h"
#include "trace.h"
#include "hyst_qos.h"

struct user_req us_req;
bool qos_reg;

static DEFINE_PER_CPU(u64, nr_prod_sum);
static DEFINE_PER_CPU(u64, last_time);
static DEFINE_PER_CPU(u64, nr_big_prod_sum);
static DEFINE_PER_CPU(u64, nr);
static DEFINE_PER_CPU(u64, nr_max);

static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
static s64 last_get_time;

static DEFINE_PER_CPU(atomic64_t, busy_hyst_end_time) = ATOMIC64_INIT(0);
static DEFINE_PER_CPU(u64, hyst_time);
static DEFINE_PER_CPU(u64, coloc_hyst_busy);
static DEFINE_PER_CPU(u64, coloc_hyst_time);
static DEFINE_PER_CPU(u64, util_hyst_time);

#define NR_THRESHOLD_PCT		40
#define MAX_RTGB_TIME (sysctl_sched_coloc_busy_hyst_max_ms * NSEC_PER_MSEC)

struct sched_avg_stats stats[WALT_NR_CPUS];
unsigned int cstats_util_pct[MAX_CLUSTERS];

/**
 * sched_get_cluster_util_pct
 * @return: the percentage of this cluster that was used in the
 *	    previous window.
 *
 * This routine may be called any number of times as needed during
 * a window, but will always return the same result until window
 * rollover.
 */
unsigned int sched_get_cluster_util_pct(struct walt_sched_cluster *cluster)
{
	unsigned int cluster_util_pct = 0;

	if (cluster->id < MAX_CLUSTERS)
		cluster_util_pct = cstats_util_pct[cluster->id];

	return cluster_util_pct;
}

/**
 * sched_get_nr_running_avg
 * @return: Average nr_running, iowait and nr_big_tasks value since last poll.
 *	    Returns the avg * 100 to return up to two decimal points
 *	    of accuracy.
 *
 * Obtains the average nr_running value since the last poll.
 * This function may not be called concurrently with itself.
 *
 * It is assumed that this function is called at most once per window
 * rollover.
 */
struct sched_avg_stats *sched_get_nr_running_avg(void)
{
	int cpu;
	u64 curr_time = sched_clock();
	u64 period = curr_time - last_get_time;
	u64 tmp_nr, tmp_misfit;
	bool any_hyst_time = false;
	struct walt_sched_cluster *cluster;

	if (unlikely(walt_disabled))
		return NULL;

	if (!period)
		goto done;

	/* read and reset nr_running counts */
	for_each_possible_cpu(cpu) {
		unsigned long flags;
		u64 diff;

		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
		curr_time = sched_clock();
		diff = curr_time - per_cpu(last_time, cpu);
		BUG_ON((s64)diff < 0);

		tmp_nr = per_cpu(nr_prod_sum, cpu);
		tmp_nr += per_cpu(nr, cpu) * diff;
		tmp_nr = div64_u64((tmp_nr * 100), period);

		tmp_misfit = per_cpu(nr_big_prod_sum, cpu);
		tmp_misfit += walt_big_tasks(cpu) * diff;
		tmp_misfit = div64_u64((tmp_misfit * 100), period);

		/*
		 * NR_THRESHOLD_PCT makes sure that a task is only counted
		 * if it ran for at least (100 - NR_THRESHOLD_PCT)%, i.e.
		 * 60%, of the last window, to compensate for any
		 * over-estimation being done.
		 */
		stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT),
						100);
		stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit +
						NR_THRESHOLD_PCT), 100);
		stats[cpu].nr_max = per_cpu(nr_max, cpu);
		stats[cpu].nr_scaled = tmp_nr;

		trace_sched_get_nr_running_avg(cpu, stats[cpu].nr,
				stats[cpu].nr_misfit, stats[cpu].nr_max,
				stats[cpu].nr_scaled);

		per_cpu(last_time, cpu) = curr_time;
		per_cpu(nr_prod_sum, cpu) = 0;
		per_cpu(nr_big_prod_sum, cpu) = 0;
		per_cpu(nr_max, cpu) = per_cpu(nr, cpu);

		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
	}

	/* collect cluster load stats */
	for_each_sched_cluster(cluster) {
		unsigned int num_cpus = cpumask_weight(&cluster->cpus);
		unsigned int sum_util_pct = 0;

		/* load is already scaled, see freq_policy_load/prev_runnable_sum */
		for_each_cpu(cpu, &cluster->cpus) {
			struct rq *rq = cpu_rq(cpu);
			struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));

			/*
			 * compute this cpu's utilization as a % of its
			 * capacity, and sum it across all cpus
			 */
			sum_util_pct +=
				(wrq->util * 100) / arch_scale_cpu_capacity(cpu);
		}

		/* calculate the average per-cpu utilization */
		cstats_util_pct[cluster->id] = sum_util_pct / num_cpus;
	}

	for_each_possible_cpu(cpu) {
		if (per_cpu(coloc_hyst_time, cpu)) {
			any_hyst_time = true;
			break;
		}
	}

	if (any_hyst_time && get_rtgb_active_time() >= MAX_RTGB_TIME)
		sched_update_hyst_times();

	last_get_time = curr_time;

done:
	return &stats[0];
}
EXPORT_SYMBOL_GPL(sched_get_nr_running_avg);
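
/*
 * Illustrative usage sketch (not from the original source): a window-driven
 * consumer such as a hypothetical core-control driver would poll the
 * averages once per window rollover and use stats[cpu].nr as the rounded
 * average runqueue depth, e.g.:
 *
 *	struct sched_avg_stats *st = sched_get_nr_running_avg();
 *	int cpu, need = 0;
 *
 *	if (st)
 *		for_each_possible_cpu(cpu)
 *			need += st[cpu].nr;
 *
 * 'need' then approximates how many CPUs worth of runnable tasks existed in
 * the last window and could be used to size an online CPU mask.
 */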

void sched_update_hyst_times(void)
{
	bool rtgb_active;
	int cpu;
	unsigned long cpu_cap, coloc_busy_pct;

	rtgb_active = is_rtgb_active() &&
			(sched_boost_type != CONSERVATIVE_BOOST) &&
			(get_rtgb_active_time() < MAX_RTGB_TIME);

	for_each_possible_cpu(cpu) {
		cpu_cap = arch_scale_cpu_capacity(cpu);
		coloc_busy_pct = sysctl_sched_coloc_busy_hyst_cpu_busy_pct[cpu];
		per_cpu(hyst_time, cpu) = (BIT(cpu)
				& sysctl_sched_busy_hyst_enable_cpus) ?
				busy_hyst_qos_value : 0;
		per_cpu(coloc_hyst_time, cpu) = ((BIT(cpu)
				& sysctl_sched_coloc_busy_hyst_enable_cpus) &&
				rtgb_active) ?
				sysctl_sched_coloc_busy_hyst_cpu[cpu] : 0;
		per_cpu(coloc_hyst_busy, cpu) = mult_frac(cpu_cap,
							  coloc_busy_pct, 100);
		per_cpu(util_hyst_time, cpu) = (BIT(cpu)
				& sysctl_sched_util_busy_hyst_enable_cpus) ?
				sysctl_sched_util_busy_hyst_cpu[cpu] : 0;
	}
}

#define BUSY_NR_RUN		3
#define BUSY_LOAD_FACTOR	10

static inline void update_busy_hyst_end_time(int cpu, int enq,
				unsigned long prev_nr_run, u64 curr_time)
{
	bool nr_run_trigger = false;
	bool load_trigger = false, coloc_load_trigger = false;
	u64 agg_hyst_time, total_util = 0;
	bool util_load_trigger = false;
	int i;
	bool hyst_trigger, coloc_trigger;
	bool dequeue = (enq < 0);

	if (!per_cpu(hyst_time, cpu) && !per_cpu(coloc_hyst_time, cpu) &&
	    !per_cpu(util_hyst_time, cpu))
		return;

	if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN)
		nr_run_trigger = true;

	if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) >
			capacity_orig_of(cpu))
		load_trigger = true;

	if (dequeue && cpu_util(cpu) > per_cpu(coloc_hyst_busy, cpu))
		coloc_load_trigger = true;

	if (dequeue) {
		for_each_possible_cpu(i) {
			total_util += cpu_util(i);
			if (total_util >= sysctl_sched_util_busy_hyst_cpu_util[cpu]) {
				util_load_trigger = true;
				break;
			}
		}
	}

	coloc_trigger = nr_run_trigger || coloc_load_trigger;
#if IS_ENABLED(CONFIG_SCHED_CONSERVATIVE_BOOST_LPM_BIAS)
	hyst_trigger = nr_run_trigger || load_trigger ||
		       (sched_boost_type == CONSERVATIVE_BOOST);
#else
	hyst_trigger = nr_run_trigger || load_trigger;
#endif

	agg_hyst_time = max(max(hyst_trigger ? per_cpu(hyst_time, cpu) : 0,
				coloc_trigger ? per_cpu(coloc_hyst_time, cpu) : 0),
				util_load_trigger ? per_cpu(util_hyst_time, cpu) : 0);

	if (agg_hyst_time) {
		atomic64_set(&per_cpu(busy_hyst_end_time, cpu),
			     curr_time + agg_hyst_time);
		trace_sched_busy_hyst_time(cpu, agg_hyst_time, prev_nr_run,
					   cpu_util(cpu),
					   per_cpu(hyst_time, cpu),
					   per_cpu(coloc_hyst_time, cpu),
					   per_cpu(util_hyst_time, cpu));
	}
}
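
/*
 * Worked example of the aggregation above (illustrative numbers only):
 * suppose a dequeue occurs with load_trigger set and per_cpu(hyst_time) =
 * 2 ms, while coloc_trigger is clear with per_cpu(coloc_hyst_time) = 5 ms
 * and per_cpu(util_hyst_time) = 0. Then
 *
 *	agg_hyst_time = max(max(2 ms, 0), 0) = 2 ms
 *
 * and busy_hyst_end_time is set to curr_time + 2 ms, so
 * sched_lpm_disallowed_time() below keeps this CPU out of deep idle
 * states for the next 2 ms.
 */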

int sched_busy_hyst_handler(struct ctl_table *table, int write,
			    void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	if (table->maxlen > (sizeof(unsigned int) * num_possible_cpus()))
		table->maxlen = sizeof(unsigned int) * num_possible_cpus();

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		if (!qos_reg) {
			hyst_add_request(&us_req, 0, "PerfLock");
			qos_reg = true;
		}
		hyst_update_request(&us_req, PM_QOS_MIN_LIMIT,
				    sysctl_sched_busy_hyst);
		sched_update_hyst_times();
	}

	return ret;
}

/**
 * sched_update_nr_prod
 * @cpu: The core id of the nr running driver.
 * @enq: enqueue/dequeue/misfit happening on this CPU.
 * @return: N/A
 *
 * Update average with latest nr_running value for CPU
 */
void sched_update_nr_prod(int cpu, int enq)
{
	u64 diff;
	u64 curr_time;
	unsigned long flags, nr_running;

	spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
	nr_running = per_cpu(nr, cpu);
	curr_time = sched_clock();
	diff = curr_time - per_cpu(last_time, cpu);
	BUG_ON((s64)diff < 0);
	per_cpu(last_time, cpu) = curr_time;
	per_cpu(nr, cpu) = cpu_rq(cpu)->nr_running + enq;

	if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
		per_cpu(nr_max, cpu) = per_cpu(nr, cpu);

	/* Don't update hyst time for misfit tasks */
	if (enq)
		update_busy_hyst_end_time(cpu, enq, nr_running, curr_time);

	per_cpu(nr_prod_sum, cpu) += nr_running * diff;
	per_cpu(nr_big_prod_sum, cpu) += walt_big_tasks(cpu) * diff;
	spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}

/*
 * Returns the CPU utilization % in the last window.
 */
unsigned int sched_get_cpu_util_pct(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 util;
	unsigned long capacity, flags;
	unsigned int busy;
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu);

	raw_spin_lock_irqsave(&rq->__lock, flags);

	capacity = capacity_orig_of(cpu);

	util = wrq->prev_runnable_sum + wrq->grp_time.prev_runnable_sum;
	util = scale_time_to_util(util);
	raw_spin_unlock_irqrestore(&rq->__lock, flags);

	util = (util >= capacity) ? capacity : util;
	busy = div64_ul((util * 100), capacity);

	return busy;
}

int sched_lpm_disallowed_time(int cpu, u64 *timeout)
{
	u64 now = sched_clock();
	u64 bias_end_time = atomic64_read(&per_cpu(busy_hyst_end_time, cpu));

	if (unlikely(walt_disabled))
		return -EAGAIN;

	if (unlikely(is_reserved(cpu))) {
		*timeout = 10 * NSEC_PER_MSEC;
		return 0; /* shallowest c-state */
	}

	if (now < bias_end_time) {
		*timeout = bias_end_time - now;
		return 0; /* shallowest c-state */
	}

	return INT_MAX; /* don't care */
}
EXPORT_SYMBOL_GPL(sched_lpm_disallowed_time);
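
/*
 * Illustrative caller sketch (not part of this file): an LPM/cpuidle
 * governor is expected to consult the bias before entering a deep idle
 * state, roughly:
 *
 *	u64 timeout;
 *	int ret = sched_lpm_disallowed_time(cpu, &timeout);
 *
 *	if (ret == 0) {
 *		// restrict this CPU to the shallowest c-state and
 *		// re-evaluate after 'timeout' nanoseconds
 *	} else {
 *		// INT_MAX, or -EAGAIN when WALT is disabled:
 *		// no bias, select the idle state as usual
 *	}
 */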