diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..581ee79550de 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -92,6 +92,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
 
+#ifdef CONFIG_SMP
+extern unsigned int sysctl_sched_pelt_multiplier;
+
+int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
+			  size_t *lenp, loff_t *ppos);
+#endif
+
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 extern unsigned int sysctl_sched_energy_aware;
 int sched_energy_aware_handler(struct ctl_table *table, int write,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1defb8dff43a..304159b392e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4788,7 +4788,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 
 	cfs_rq->throttle_count--;
 	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
+		cfs_rq->throttled_clock_task_time += rq_clock_task_mult(rq) -
 					     cfs_rq->throttled_clock_task;
 
 		/* Add cfs_rq with already running entity in the list */
@@ -4806,7 +4806,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 
 	/* group is entering throttled state, stop time */
 	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		cfs_rq->throttled_clock_task = rq_clock_task_mult(rq);
 		list_del_leaf_cfs_rq(cfs_rq);
 	}
 	cfs_rq->throttle_count++;
@@ -5224,7 +5224,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task_mult(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index bbb0de2219f9..2d2c0fa9812d 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -531,3 +531,45 @@ int update_irq_load_avg(struct rq *rq, u64 running)
 	return ret;
 }
 #endif
+
+DEFINE_PER_CPU(u64, clock_task_mult);
+
+unsigned int sysctl_sched_pelt_multiplier = 1;
+__read_mostly unsigned int sched_pelt_lshift;
+
+int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned int old;
+	int ret;
+
+	mutex_lock(&mutex);
+
+	old = sysctl_sched_pelt_multiplier;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto undo;
+	if (!write)
+		goto done;
+
+	switch (sysctl_sched_pelt_multiplier) {
+	case 1:
+		fallthrough;
+	case 2:
+		fallthrough;
+	case 4:
+		WRITE_ONCE(sched_pelt_lshift,
+			   sysctl_sched_pelt_multiplier >> 1);
+		goto done;
+	default:
+		ret = -EINVAL;
+	}
+
+undo:
+	sysctl_sched_pelt_multiplier = old;
+done:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 45bf08e22207..3f00596def11 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -61,6 +61,8 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 }
 
+extern unsigned int sched_pelt_lshift;
+
 /*
  * The clock_pelt scales the time to reflect the effective amount of
  * computation done during the running delta time but then sync back to
@@ -75,9 +77,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
  */
 static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
 {
+	delta <<= READ_ONCE(sched_pelt_lshift);
+
+	per_cpu(clock_task_mult, rq->cpu) += delta;
+
 	if (unlikely(is_idle_task(rq->curr))) {
 		/* The rq is idle, we can sync to clock_task */
-		rq->clock_pelt = rq_clock_task(rq);
+		rq->clock_pelt = rq_clock_task_mult(rq);
 		return;
 	}
 
@@ -129,7 +135,8 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
 	 * rq's clock_task.
	 */
 	if (util_sum >= divider)
-		rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+		rq->lost_idle_time += rq_clock_task_mult(rq) -
+			rq->clock_pelt;
 }
 
 static inline u64 rq_clock_pelt(struct rq *rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 518da844fe2a..175d5f3ef58e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1193,6 +1193,16 @@ static inline u64 rq_clock_task(struct rq *rq)
 	return rq->clock_task;
 }
 
+DECLARE_PER_CPU(u64, clock_task_mult);
+
+static inline u64 rq_clock_task_mult(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	assert_clock_updated(rq);
+
+	return per_cpu(clock_task_mult, rq->cpu);
+}
+
 /**
  * By default the decay is the default pelt decay period.
  * The decay shift can change the decay period in
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a4006f6558d1..3632cfbc88b0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1829,6 +1829,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+#ifdef CONFIG_SMP
+	{
+		.procname	= "sched_pelt_multiplier",
+		.data		= &sysctl_sched_pelt_multiplier,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_pelt_multiplier,
+	},
+#endif
 #ifdef CONFIG_UCLAMP_TASK
 	{
 		.procname	= "sched_util_clamp_min",
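A minimal userspace sketch of how the new knob behaves (not part of the
patch; it assumes the series is applied and the kernel is built with
CONFIG_SMP=y, and must be run as root). Per the handler above, only 1, 2
and 4 are accepted, each mapping to a PELT clock left-shift of
multiplier >> 1 (0, 1 or 2); any other value fails with -EINVAL and the
previous setting is restored:

/* pelt_mult_test.c: exercise /proc/sys/kernel/sched_pelt_multiplier */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_multiplier(const char *val)
{
	int fd = open("/proc/sys/kernel/sched_pelt_multiplier", O_WRONLY);
	int ret = 0;

	if (fd < 0)
		return -errno;
	if (write(fd, val, strlen(val)) < 0)
		ret = -errno;	/* expect -EINVAL for anything but 1, 2, 4 */
	close(fd);
	return ret;
}

int main(void)
{
	printf("4 -> %d\n", write_multiplier("4"));	/* 0: PELT time now advances 4x faster */
	printf("3 -> %d\n", write_multiplier("3"));	/* -EINVAL: previous value kept */
	printf("1 -> %d\n", write_multiplier("1"));	/* 0: back to the default */
	return 0;
}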