rstat.c

// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        unsigned long flags;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
                return;

        raw_spin_lock_irqsave(cpu_lock, flags);

        /* put @cgrp and all ancestors on the corresponding updated lists */
        while (true) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup *parent = cgroup_parent(cgrp);
                struct cgroup_rstat_cpu *prstatc;

                /*
                 * Both additions and removals are bottom-up. If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                /* Root has no parent to link it to, but mark it busy */
                if (!parent) {
                        rstatc->updated_next = cgrp;
                        break;
                }

                prstatc = cgroup_rstat_cpu(parent, cpu);
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;

                cgrp = parent;
        }

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}
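
/*
 * Illustrative sketch, not part of upstream rstat.c: a controller that keeps
 * a hypothetical per-cpu event counter would bump it locally and then mark
 * the cgroup updated on this cpu, so a later flush knows to visit it. The
 * counter and function names below are made up for the example.
 */
static DEFINE_PER_CPU(u64, example_nr_events);

static void __maybe_unused example_account_event(struct cgroup *cgrp)
{
        /* cheap per-cpu update ... */
        this_cpu_inc(example_nr_events);
        /* ... followed by marking @cgrp dirty on this cpu for rstat */
        cgroup_rstat_updated(cgrp, smp_processor_id());
}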

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
        struct cgroup_rstat_cpu *rstatc;
        struct cgroup *parent;

        if (pos == root)
                return NULL;

        /*
         * We're gonna walk down to the first leaf and visit/remove it. We
         * can pick whatever unvisited node as the starting point.
         */
        if (!pos) {
                pos = root;
                /* return NULL if this subtree is not on-list */
                if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
                        return NULL;
        } else {
                pos = cgroup_parent(pos);
        }

        /* walk down to the first leaf */
        while (true) {
                rstatc = cgroup_rstat_cpu(pos, cpu);
                if (rstatc->updated_children == pos)
                        break;
                pos = rstatc->updated_children;
        }

        /*
         * Unlink @pos from the tree. As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
        parent = cgroup_parent(pos);
        if (parent) {
                struct cgroup_rstat_cpu *prstatc;
                struct cgroup **nextp;

                prstatc = cgroup_rstat_cpu(parent, cpu);
                nextp = &prstatc->updated_children;
                while (*nextp != pos) {
                        struct cgroup_rstat_cpu *nrstatc;

                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }
                *nextp = rstatc->updated_next;
        }

        rstatc->updated_next = NULL;
        return pos;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * __diag_* below are needed to dismiss the missing prototype warning.
 */
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
                  "kfuncs which will be used in BPF programs");
__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
                                     struct cgroup *parent, int cpu)
{
}
__diag_pop();
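
/*
 * Illustrative sketch, not part of this file: a bpf stat collector can attach
 * an fentry program to the hook above so that its own per-cpu state is folded
 * in whenever rstat flushes a cgroup. The sketch would live in a separate BPF
 * object built with libbpf; the program name is made up for the example.
 *
 *      SEC("fentry/bpf_rstat_flush")
 *      int BPF_PROG(example_rstat_flush, struct cgroup *cgrp,
 *                   struct cgroup *parent, int cpu)
 *      {
 *              // Fold this cpu's pending counters for cgrp into the
 *              // collector's aggregated map entry here.
 *              return 0;
 *      }
 */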

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                                       cpu);
                struct cgroup *pos = NULL;
                unsigned long flags;

                /*
                 * The _irqsave() is needed because cgroup_rstat_lock is
                 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
                 * this lock with the _irq() suffix only disables interrupts on
                 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
                 * interrupts on both configurations. The _irqsave() ensures
                 * that interrupts are always disabled and later restored.
                 */
                raw_spin_lock_irqsave(cpu_lock, flags);
                while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
                        struct cgroup_subsys_state *css;

                        cgroup_base_stat_flush(pos, cpu);
                        bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }
                raw_spin_unlock_irqrestore(cpu_lock, flags);

                /* if @may_sleep, play nice and yield if necessary */
                if (may_sleep && (need_resched() ||
                                  spin_needbreak(&cgroup_rstat_lock))) {
                        spin_unlock_irq(&cgroup_rstat_lock);
                        if (!cond_resched())
                                cpu_relax();
                        spin_lock_irq(&cgroup_rstat_lock);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
        unsigned long flags;

        spin_lock_irqsave(&cgroup_rstat_lock, flags);
        cgroup_rstat_flush_locked(cgrp, false);
        spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
        __releases(&cgroup_rstat_lock)
{
        spin_unlock_irq(&cgroup_rstat_lock);
}
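
/*
 * Illustrative sketch, not part of upstream rstat.c: a reader that needs a
 * consistent snapshot of the aggregated base stats flushes under the hold,
 * copies what it needs, and then releases the lock. This mirrors how
 * cgroup_base_stat_cputime_show() below uses the pair; the function name is
 * made up for the example.
 */
static u64 __maybe_unused example_read_exec_runtime(struct cgroup *cgrp)
{
        u64 usage;

        cgroup_rstat_flush_hold(cgrp);
        usage = cgrp->bstat.cputime.sum_exec_runtime;
        cgroup_rstat_flush_release();

        return usage;
}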

int cgroup_rstat_init(struct cgroup *cgrp)
{
        int cpu;

        /* the root cgrp has rstat_cpu preallocated */
        if (!cgrp->rstat_cpu) {
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
                if (!cgrp->rstat_cpu)
                        return -ENOMEM;
        }

        /* ->updated_children list is self terminated */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                rstatc->updated_children = cgrp;
                u64_stats_init(&rstatc->bsync);
        }

        return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
                    WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_base_stat delta;
        unsigned seq;

        /* Root-level stats are sourced from system-wide CPU stats */
        if (!parent)
                return;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                delta = rstatc->bstat;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate percpu delta to global */
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);

        /* propagate global delta to parent (unless that's root) */
        if (cgroup_parent(parent)) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);
        }
}
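
/*
 * Worked example (illustrative, not part of upstream rstat.c) of the delta
 * propagation above, with made-up numbers: if this cpu's rstatc->bstat utime
 * is 700 and rstatc->last_bstat utime is 500, the per-cpu delta is 200, so
 * 200 is added to cgrp->bstat and rstatc->last_bstat is advanced to 700. If
 * cgrp->bstat utime is now 1200 while cgrp->last_bstat utime is 1000, the
 * global delta of 200 is in turn added to parent->bstat and cgrp->last_bstat
 * is advanced to 1200. Because each level only forwards the delta since its
 * last flush, repeated flushes never double count the same time.
 */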

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = get_cpu_ptr(cgrp->rstat_cpu);
        *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
        return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc,
                                                 unsigned long flags)
{
        u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

        switch (index) {
        case CPUTIME_USER:
        case CPUTIME_NICE:
                rstatc->bstat.cputime.utime += delta_exec;
                break;
        case CPUTIME_SYSTEM:
        case CPUTIME_IRQ:
        case CPUTIME_SOFTIRQ:
                rstatc->bstat.cputime.stime += delta_exec;
                break;
#ifdef CONFIG_SCHED_CORE
        case CPUTIME_FORCEIDLE:
                rstatc->bstat.forceidle_sum += delta_exec;
                break;
#endif
        default:
                break;
        }

        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
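
/*
 * Illustrative sketch, not part of upstream rstat.c: an accounting site that
 * wants to charge a nanosecond delta of user time to a cgroup would pick the
 * cputime bucket and let the helper above do the per-cpu accumulation and
 * the rstat "updated" marking. The function name is made up for the example.
 */
static void __maybe_unused example_charge_user_time(struct cgroup *cgrp,
                                                    u64 delta_ns)
{
        /* accumulates into the per-cpu bstat and marks @cgrp updated */
        __cgroup_account_cputime_field(cgrp, CPUTIME_USER, delta_ns);
}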

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
        struct task_cputime *cputime = &bstat->cputime;
        int i;

        memset(bstat, 0, sizeof(*bstat));
        for_each_possible_cpu(i) {
                struct kernel_cpustat kcpustat;
                u64 *cpustat = kcpustat.cpustat;
                u64 user = 0;
                u64 sys = 0;

                kcpustat_cpu_fetch(&kcpustat, i);

                user += cpustat[CPUTIME_USER];
                user += cpustat[CPUTIME_NICE];
                cputime->utime += user;

                sys += cpustat[CPUTIME_SYSTEM];
                sys += cpustat[CPUTIME_IRQ];
                sys += cpustat[CPUTIME_SOFTIRQ];
                cputime->stime += sys;

                cputime->sum_exec_runtime += user;
                cputime->sum_exec_runtime += sys;
                cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
                bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
        u64 forceidle_time;
#endif

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
#ifdef CONFIG_SCHED_CORE
                forceidle_time = cgrp->bstat.forceidle_sum;
#endif
                cgroup_rstat_flush_release();
        } else {
                root_cgroup_cputime(&bstat);
                usage = bstat.cputime.sum_exec_runtime;
                utime = bstat.cputime.utime;
                stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
                forceidle_time = bstat.forceidle_sum;
#endif
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
        do_div(forceidle_time, NSEC_PER_USEC);
#endif

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
        seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
        .owner = THIS_MODULE,
        .set = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                         &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
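
/*
 * Illustrative sketch, not part of this file: with the kfuncs registered
 * above, a sleepable BPF tracing program (in a separate BPF object built
 * with libbpf) can declare them as ksyms and drive rstat itself. The
 * program name below is made up for the example.
 *
 *      void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
 *      void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
 *
 *      SEC("iter.s/cgroup")
 *      int BPF_PROG(example_dump_stats, struct bpf_iter_meta *meta,
 *                   struct cgroup *cgrp)
 *      {
 *              if (!cgrp)
 *                      return 0;
 *              // Make sure collected stats are up to date before dumping.
 *              cgroup_rstat_flush(cgrp);
 *              return 0;
 *      }
 */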