pelt.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Per Entity Load Tracking
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <[email protected]>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <[email protected]>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <[email protected]>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <[email protected]>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <[email protected]>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Move PELT related code from fair.c into this pelt.c file
 * Author: Vincent Guittot <[email protected]>
 */
#include <trace/hooks/sched.h>
/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static u64 decay_load(u64 val, u64 n)
{
        unsigned int local_n;

        if (unlikely(n > LOAD_AVG_PERIOD * 63))
                return 0;

        /* after bounds checking we can collapse to 32-bit */
        local_n = n;

        /*
         * As y^PERIOD = 1/2, we can combine
         *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
         * With a look-up table which covers y^n (n<PERIOD)
         *
         * To achieve constant time decay_load.
         */
        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
                val >>= local_n / LOAD_AVG_PERIOD;
                local_n %= LOAD_AVG_PERIOD;
        }

        val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
        return val;
}
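
/*
 * A quick worked example of the decomposition above (illustrative only,
 * assuming LOAD_AVG_PERIOD == 32 and the standard runnable_avg_yN_inv[]
 * table, i.e. y = 0.5^(1/32) ~= 0.97857):
 *
 *   decay_load(1024, 32)  : 32/32 = 1 half-period -> 1024 >> 1 = 512,
 *                           then * y^0 = 1        -> ~512
 *   decay_load(1024, 100) : 100 = 3*32 + 4        -> 1024 >> 3 = 128,
 *                           then * y^4 ~= 0.917   -> ~117
 *
 * So an arbitrary decay is always one shift plus one table multiply,
 * independent of n.
 */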
static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
{
        u32 c1, c2, c3 = d3; /* y^0 == 1 */

        /*
         * c1 = d1 y^p
         */
        c1 = decay_load((u64)d1, periods);

        /*
         *            p-1
         * c2 = 1024 \Sum y^n
         *            n=1
         *
         *              inf        inf
         *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
         *              n=0        n=p
         */
        c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

        return c1 + c2 + c3;
}
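
/*
 * Illustrative numbers for the c2 identity above (assuming
 * LOAD_AVG_MAX ~= 47742, which approximates 1024 * \Sum_{n>=0} y^n):
 *
 * for p = 2:
 *   closed form : 47742 - decay_load(47742, 2) - 1024
 *               ~= 47742 - 45718 - 1024 ~= 1000
 *   direct sum  : 1024 * y^1 ~= 1002
 *
 * The small difference comes from the integer truncation in decay_load()
 * and in LOAD_AVG_MAX itself.
 */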
/*
 * Accumulate the three separate parts of the sum; d1 the remainder
 * of the last (incomplete) period, d2 the span of full periods and d3
 * the remainder of the (incomplete) current period.
 *
 *           d1          d2           d3
 *           ^           ^            ^
 *           |           |            |
 *         |<->|<----------------->|<--->|
 * ... |---x---|------| ... |------|-----x (now)
 *
 *                           p-1
 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
 *                           n=1
 *
 *    = u y^p +                                 (Step 1)
 *
 *                     p-1
 *      d1 y^p + 1024 \Sum y^n + d3 y^0         (Step 2)
 *                     n=1
 */
static __always_inline u32
accumulate_sum(u64 delta, struct sched_avg *sa,
               unsigned long load, unsigned long runnable, int running)
{
        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
        u64 periods;

        delta += sa->period_contrib;
        periods = delta / 1024; /* A period is 1024us (~1ms) */

        /*
         * Step 1: decay old *_sum if we crossed period boundaries.
         */
        if (periods) {
                sa->load_sum = decay_load(sa->load_sum, periods);
                sa->runnable_sum =
                        decay_load(sa->runnable_sum, periods);
                sa->util_sum = decay_load((u64)(sa->util_sum), periods);

                /*
                 * Step 2
                 */
                delta %= 1024;
                if (load) {
                        /*
                         * This relies on the:
                         *
                         *   if (!load)
                         *           runnable = running = 0;
                         *
                         * clause from ___update_load_sum(); this results in
                         * the below usage of @contrib to disappear entirely,
                         * so no point in calculating it.
                         */
                        contrib = __accumulate_pelt_segments(periods,
                                        1024 - sa->period_contrib, delta);
                }
        }
        sa->period_contrib = delta;

        if (load)
                sa->load_sum += load * contrib;
        if (runnable)
                sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
        if (running)
                sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;

        return periods;
}
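
/*
 * Worked example for accumulate_sum() (illustrative; the numbers assume
 * y = 0.5^(1/32)):
 *
 *   sa->period_contrib = 200, delta = 2000 (~2ms of new time)
 *
 *   delta  += 200        -> 2200
 *   periods = 2200/1024  -> 2 full periods crossed
 *   d1      = 1024 - 200 -> 824 (tail of the old, partly filled period)
 *   d3      = 2200%1024  -> 152 (head of the new, current period)
 *
 *   contrib = 824*y^2 + 1024*y^1 + 152 ~= 789 + 1002 + 152 ~= 1943
 *
 * and sa->period_contrib is left at 152 for the next update.
 */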
/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series. To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our coefficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */
int
___update_load_sum(u64 now, struct sched_avg *sa,
                   unsigned long load, unsigned long runnable, int running)
{
        u64 delta;

        delta = now - sa->last_update_time;
        /*
         * This should only happen when time goes backwards, which it
         * unfortunately does during sched clock init when we swap over to TSC.
         */
        if ((s64)delta < 0) {
                sa->last_update_time = now;
                return 0;
        }

        /*
         * Use 1024ns as the unit of measurement since it's a reasonable
         * approximation of 1us and fast to compute.
         */
        delta >>= 10;
        if (!delta)
                return 0;

        sa->last_update_time += delta << 10;

        trace_android_rvh_update_load_sum(sa, &delta, &sched_pelt_lshift);

        /*
         * running is a subset of runnable (weight) so running can't be set if
         * runnable is clear. But there are some corner cases where the current
         * se has already been dequeued while cfs_rq->curr still points to it.
         * This means that weight can be 0 while running is still set, both for
         * a sched_entity and for a cfs_rq when the latter becomes idle. As an
         * example, this happens during idle_balance() which calls
         * update_blocked_averages().
         *
         * Also see the comment in accumulate_sum().
         */
        if (!load)
                runnable = running = 0;

        /*
         * Now we know we crossed measurement unit boundaries. The *_avg
         * accrues by two steps:
         *
         * Step 1: accumulate *_sum since last_update_time. If we haven't
         * crossed period boundaries, finish.
         */
        if (!accumulate_sum(delta, sa, load, runnable, running))
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(___update_load_sum);
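
/*
 * A small illustration of the unit handling above (illustrative only):
 * if now - last_update_time is 2,100,000ns, then delta >>= 10 gives
 * 2050 "1024ns" units and last_update_time advances by 2050 << 10 =
 * 2,099,200ns; the ~800ns remainder is simply carried into the next
 * update. The function returns 1 only when accumulate_sum() reports
 * that at least one full 1024us period was crossed, which is what tells
 * the caller that ___update_load_avg() is worth calling.
 */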
/*
 * When syncing *_avg with *_sum, we must take into account the current
 * position in the PELT segment otherwise the remaining part of the segment
 * will be considered as idle time whereas it's not yet elapsed and this will
 * generate unwanted oscillation in the range [1002..1024[.
 *
 * The max value of *_sum varies with the position in the time segment and is
 * equal to:
 *
 *   LOAD_AVG_MAX*y + sa->period_contrib
 *
 * which can be simplified into:
 *
 *   LOAD_AVG_MAX - 1024 + sa->period_contrib
 *
 * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
 *
 * The same care must be taken when a sched entity is added, updated or
 * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
 * and the cfs rq, to which they are attached, have the same position in the
 * time segment because they use the same clock. This means that we can use
 * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
 * if it's more convenient.
 */
void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
        u32 divider = get_pelt_divider(sa);

        /*
         * Step 2: update *_avg.
         */
        sa->load_avg = div_u64(load * sa->load_sum, divider);
        sa->runnable_avg = div_u64(sa->runnable_sum, divider);
        WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
EXPORT_SYMBOL_GPL(___update_load_avg);
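
/*
 * Illustrative numbers for the divider above (assuming
 * LOAD_AVG_MAX ~= 47742 and SCHED_CAPACITY_SHIFT == 10):
 *
 *   period_contrib = 152  ->  divider = 47742 - 1024 + 152 = 46870
 *
 * For an entity that has been running 100% of the time, util_sum
 * saturates at roughly divider << SCHED_CAPACITY_SHIFT, so
 * util_avg = util_sum / divider converges towards ~1024, i.e. one full
 * CPU's worth of capacity, regardless of where we are in the current
 * segment.
 */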
/*
 * sched_entity:
 *
 *   task:
 *     se_weight()   = se->load.weight
 *     se_runnable() = !!on_rq
 *
 *   group: [ see update_cfs_group() ]
 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 *     se_runnable() = grq->h_nr_running
 *
 *   runnable_sum = se_runnable() * runnable = grq->runnable_sum
 *   runnable_avg = runnable_sum
 *
 *   load_sum := runnable
 *   load_avg = se_weight(se) * load_sum
 *
 * cfs_rq:
 *
 *   runnable_sum = \Sum se->avg.runnable_sum
 *   runnable_avg = \Sum se->avg.runnable_avg
 *
 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 *   load_avg = \Sum se->avg.load_avg
 */
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
        if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
                ___update_load_avg(&se->avg, se_weight(se));
                trace_pelt_se_tp(se);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__update_load_avg_blocked_se);

int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
                               cfs_rq->curr == se)) {
                ___update_load_avg(&se->avg, se_weight(se));
                cfs_se_util_change(&se->avg);
                trace_pelt_se_tp(se);
                return 1;
        }

        return 0;
}

int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
        if (___update_load_sum(now, &cfs_rq->avg,
                               scale_load_down(cfs_rq->load.weight),
                               cfs_rq->h_nr_running,
                               cfs_rq->curr != NULL)) {
                ___update_load_avg(&cfs_rq->avg, 1);
                trace_pelt_cfs_tp(cfs_rq);
                return 1;
        }

        return 0;
}
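
/*
 * Putting the three helpers above together (a rough sketch, not a
 * complete call chain): for a nice-0 task that is always runnable,
 * __update_load_avg_se() keeps se->avg.load_sum near its maximum, so
 * ___update_load_avg(&se->avg, se_weight(se)) yields a load_avg close
 * to se->load.weight (1024 for nice-0). The cfs_rq side then sees the
 * aggregate of its entities via __update_load_avg_cfs_rq(), whose
 * load_avg approximates \Sum se->avg.load_avg as described in the
 * comment block above.
 */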
/*
 * rt_rq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_sum = util_sum
 *
 *   load_avg and runnable_avg are not supported and meaningless.
 *
 */
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
        if (___update_load_sum(now, &rq->avg_rt,
                               running,
                               running,
                               running)) {
                ___update_load_avg(&rq->avg_rt, 1);
                trace_pelt_rt_tp(rq);
                return 1;
        }

        return 0;
}
/*
 * dl_rq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_sum = util_sum
 *
 *   load_avg and runnable_avg are not supported and meaningless.
 *
 */
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
        if (___update_load_sum(now, &rq->avg_dl,
                               running,
                               running,
                               running)) {
                ___update_load_avg(&rq->avg_dl, 1);
                trace_pelt_dl_tp(rq);
                return 1;
        }

        return 0;
}
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
/*
 * thermal:
 *
 *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
 *
 *   util_avg and runnable_load_avg are not supported and meaningless.
 *
 * Unlike rt/dl utilization tracking, which tracks the time spent by a cpu
 * running a rt/dl task through util_avg, the average thermal pressure is
 * tracked through load_avg. This is because the thermal pressure signal is
 * a time-weighted "delta" capacity, unlike util_avg which is binary.
 *
 *   "delta capacity" = actual capacity -
 *                      capped capacity of a cpu due to a thermal event
 */
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
{
        if (___update_load_sum(now, &rq->avg_thermal,
                               capacity,
                               capacity,
                               capacity)) {
                ___update_load_avg(&rq->avg_thermal, 1);
                trace_pelt_thermal_tp(rq);
                return 1;
        }

        return 0;
}
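
/*
 * Illustrative example of the "delta capacity" signal above (the exact
 * value passed in depends on the caller and the arch hook, so this is
 * only a sketch): if a CPU with an original capacity of 1024 is
 * thermally capped to 800, the pressure fed in as @capacity is
 * 1024 - 800 = 224. While the cap persists, rq->avg_thermal.load_avg
 * ramps towards ~224 with the usual ~32ms PELT half-life, and decays
 * back the same way once the cap is lifted.
 */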
#endif
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
 * irq:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
 *   runnable_sum = util_sum
 *
 *   load_avg and runnable_avg are not supported and meaningless.
 *
 */
int update_irq_load_avg(struct rq *rq, u64 running)
{
        int ret = 0;

        /*
         * We can't use clock_pelt because irq time is not accounted in
         * clock_task. Instead we directly scale the running time to
         * reflect the real amount of computation.
         */
        running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
        running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));

        /*
         * We know the time that has been used by interrupts since the last
         * update, but we don't know when. Let's be pessimistic and assume
         * that the interrupt happened just before the update. This is not so
         * far from reality because the interrupt will most probably wake up
         * a task and trigger an update of the rq clock, during which the
         * metric is updated.
         * We start to decay with normal context time and then we add the
         * interrupt context time.
         * We can safely remove running from rq->clock because
         * rq->clock += delta with delta >= running
         */
        ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
                                 0,
                                 0,
                                 0);
        ret += ___update_load_sum(rq->clock, &rq->avg_irq,
                                  1,
                                  1,
                                  1);

        if (ret) {
                ___update_load_avg(&rq->avg_irq, 1);
                trace_pelt_irq_tp(rq);
        }

        return ret;
}
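
/*
 * A rough numeric sketch of the two-step update above (illustrative; it
 * assumes the CPU runs at full frequency and capacity so the cap_scale()
 * calls are no-ops): if roughly 50us out of every 1000us is spent in
 * interrupt context, the first ___update_load_sum() call decays avg_irq
 * over the ~950us treated as "not irq", the second one accumulates the
 * ~50us as busy time, and rq->avg_irq.util_avg settles around
 * (50/1000) * 1024 ~= 51.
 */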
#endif
__read_mostly unsigned int sched_pelt_lshift;

#ifdef CONFIG_SYSCTL
#include <trace/hooks/sched.h>

static unsigned int sysctl_sched_pelt_multiplier = 1;

int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        static DEFINE_MUTEX(mutex);
        unsigned int old;
        int ret;

        mutex_lock(&mutex);

        old = sysctl_sched_pelt_multiplier;
        ret = proc_dointvec(table, write, buffer, lenp, ppos);
        if (ret)
                goto undo;
        if (!write)
                goto done;

        trace_android_vh_sched_pelt_multiplier(old, sysctl_sched_pelt_multiplier, &ret);
        if (ret)
                goto undo;

        switch (sysctl_sched_pelt_multiplier) {
        case 1:
                fallthrough;
        case 2:
                fallthrough;
        case 4:
                WRITE_ONCE(sched_pelt_lshift,
                           sysctl_sched_pelt_multiplier >> 1);
                goto done;
        default:
                ret = -EINVAL;
        }

undo:
        sysctl_sched_pelt_multiplier = old;
done:
        mutex_unlock(&mutex);

        return ret;
}
static struct ctl_table sched_pelt_sysctls[] = {
        {
                .procname       = "sched_pelt_multiplier",
                .data           = &sysctl_sched_pelt_multiplier,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = sched_pelt_multiplier,
        },
        {}
};

static int __init sched_pelt_sysctl_init(void)
{
        register_sysctl_init("kernel", sched_pelt_sysctls);
        return 0;
}
late_initcall(sched_pelt_sysctl_init);
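
/*
 * Usage sketch for the knob above (illustrative; the actual effect on the
 * PELT time delta depends on the vendor hook that consumes
 * sched_pelt_lshift via trace_android_rvh_update_load_sum()):
 *
 *   # echo 4 > /proc/sys/kernel/sched_pelt_multiplier
 *
 * The handler accepts only 1, 2 or 4 and stores multiplier >> 1, i.e. a
 * left shift of 0, 1 or 2 to be applied to the elapsed time, which would
 * shorten the effective half-life from ~32ms to ~16ms or ~8ms
 * respectively; any other value is rejected with -EINVAL and the old
 * setting is restored.
 */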
#endif