  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /* Copyright (c) 2022 Google */
  3. #include <linux/bpf.h>
  4. #include <linux/btf_ids.h>
  5. #include <linux/cgroup.h>
  6. #include <linux/kernel.h>
  7. #include <linux/seq_file.h>
  8. #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
 *
 * 1. Walk the descendants of a cgroup in pre-order.
 * 2. Walk the descendants of a cgroup in post-order.
 * 3. Walk the ancestors of a cgroup.
 * 4. Show the given cgroup only.
 *
 * For walking descendants, cgroup_iter can walk in either pre-order or
 * post-order. For walking ancestors, the iter walks up from a cgroup to
 * the root.
 *
 * The iter program can terminate the walk early by returning 1. Walk
 * continues if prog returns 0.
 *
 * The prog can check (seq->num == 0) to determine whether this is
 * the first element. The prog may also be passed a NULL cgroup,
 * which means the walk has completed and the prog has a chance to
 * do post-processing, such as outputting an epilogue.
 *
 * Note: the iter_prog is called with cgroup_mutex held.
 *
 * Currently only one session is supported, which means, depending on the
 * volume of data the bpf program intends to send to user space, the number
 * of cgroups that can be walked is limited. For example, given the current
 * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
 * cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can
 * be walked is 512. This is a limitation of cgroup_iter. If the output data
 * is larger than the kernel buffer size, after all data in the kernel buffer
 * is consumed by user space, the subsequent read() syscall will signal
 * EOPNOTSUPP. To work around this, the user may have to update their
 * program to reduce the volume of data sent to output. For example, skip
 * some uninteresting cgroups.
 */
/* Context object handed to the iterator bpf program on each invocation.
 * cgroup is NULL on the final, post-processing call (see
 * __cgroup_iter_seq_show()), hence PTR_TO_BTF_ID_OR_NULL below.
 */
struct bpf_iter__cgroup {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct cgroup *, cgroup);
};
/* Per-seq-file iterator state; allocated by the bpf_iter core according to
 * seq_priv_size and set up in cgroup_iter_seq_init().
 */
struct cgroup_iter_priv {
	struct cgroup_subsys_state *start_css;	/* walk origin; holds a css reference */
	bool visited_all;	/* walk ran to completion in one session */
	bool terminate;		/* prog requested early termination */
	int order;		/* BPF_CGROUP_ITER_* traversal order */
};
  52. static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
  53. {
  54. struct cgroup_iter_priv *p = seq->private;
  55. cgroup_lock();
  56. /* cgroup_iter doesn't support read across multiple sessions. */
  57. if (*pos > 0) {
  58. if (p->visited_all)
  59. return NULL;
  60. /* Haven't visited all, but because cgroup_mutex has dropped,
  61. * return -EOPNOTSUPP to indicate incomplete iteration.
  62. */
  63. return ERR_PTR(-EOPNOTSUPP);
  64. }
  65. ++*pos;
  66. p->terminate = false;
  67. p->visited_all = false;
  68. if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
  69. return css_next_descendant_pre(NULL, p->start_css);
  70. else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
  71. return css_next_descendant_post(NULL, p->start_css);
  72. else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
  73. return p->start_css;
  74. }
  75. static int __cgroup_iter_seq_show(struct seq_file *seq,
  76. struct cgroup_subsys_state *css, int in_stop);
  77. static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
  78. {
  79. struct cgroup_iter_priv *p = seq->private;
  80. cgroup_unlock();
  81. /* pass NULL to the prog for post-processing */
  82. if (!v) {
  83. __cgroup_iter_seq_show(seq, NULL, true);
  84. p->visited_all = true;
  85. }
  86. }
  87. static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  88. {
  89. struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
  90. struct cgroup_iter_priv *p = seq->private;
  91. ++*pos;
  92. if (p->terminate)
  93. return NULL;
  94. if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
  95. return css_next_descendant_pre(curr, p->start_css);
  96. else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
  97. return css_next_descendant_post(curr, p->start_css);
  98. else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
  99. return curr->parent;
  100. else /* BPF_CGROUP_ITER_SELF_ONLY */
  101. return NULL;
  102. }
/* Run the attached bpf prog for one element of the walk.
 *
 * @css: element to show, or NULL for the final post-processing call
 * @in_stop: nonzero when invoked from the ->stop callback
 *
 * Always returns 0 so seq_file keeps going; an early-termination request
 * from the prog is recorded in p->terminate and honored by ->next instead.
 */
static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop)
{
	struct cgroup_iter_priv *p = seq->private;
	struct bpf_iter__cgroup ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	/* cgroup is dead, skip this element */
	if (css && cgroup_is_dead(css->cgroup))
		return 0;

	ctx.meta = &meta;
	ctx.cgroup = css ? css->cgroup : NULL;
	meta.seq = seq;
	/* May return NULL (e.g. no prog for this phase); then nothing runs. */
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog)
		ret = bpf_iter_run_prog(prog, &ctx);

	/* if prog returns > 0, terminate after this element. */
	if (ret != 0)
		p->terminate = true;

	return 0;
}
  125. static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
  126. {
  127. return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
  128. false);
  129. }
/* seq_file callbacks implementing the cgroup walk. */
static const struct seq_operations cgroup_iter_seq_ops = {
	.start = cgroup_iter_seq_start,
	.next = cgroup_iter_seq_next,
	.stop = cgroup_iter_seq_stop,
	.show = cgroup_iter_seq_show,
};
/* BTF id of struct cgroup; resolved into ctx_arg_info at init time. */
BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
  137. static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
  138. {
  139. struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
  140. struct cgroup *cgrp = aux->cgroup.start;
  141. /* bpf_iter_attach_cgroup() has already acquired an extra reference
  142. * for the start cgroup, but the reference may be released after
  143. * cgroup_iter_seq_init(), so acquire another reference for the
  144. * start cgroup.
  145. */
  146. p->start_css = &cgrp->self;
  147. css_get(p->start_css);
  148. p->terminate = false;
  149. p->visited_all = false;
  150. p->order = aux->cgroup.order;
  151. return 0;
  152. }
  153. static void cgroup_iter_seq_fini(void *priv)
  154. {
  155. struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
  156. css_put(p->start_css);
  157. }
/* Glue between the bpf_iter core and the seq_file implementation above. */
static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
	.seq_ops = &cgroup_iter_seq_ops,
	.init_seq_private = cgroup_iter_seq_init,
	.fini_seq_private = cgroup_iter_seq_fini,
	.seq_priv_size = sizeof(struct cgroup_iter_priv),
};
  164. static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
  165. union bpf_iter_link_info *linfo,
  166. struct bpf_iter_aux_info *aux)
  167. {
  168. int fd = linfo->cgroup.cgroup_fd;
  169. u64 id = linfo->cgroup.cgroup_id;
  170. int order = linfo->cgroup.order;
  171. struct cgroup *cgrp;
  172. if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
  173. order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
  174. order != BPF_CGROUP_ITER_ANCESTORS_UP &&
  175. order != BPF_CGROUP_ITER_SELF_ONLY)
  176. return -EINVAL;
  177. if (fd && id)
  178. return -EINVAL;
  179. if (fd)
  180. cgrp = cgroup_v1v2_get_from_fd(fd);
  181. else if (id)
  182. cgrp = cgroup_get_from_id(id);
  183. else /* walk the entire hierarchy by default. */
  184. cgrp = cgroup_get_from_path("/");
  185. if (IS_ERR(cgrp))
  186. return PTR_ERR(cgrp);
  187. aux->cgroup.start = cgrp;
  188. aux->cgroup.order = order;
  189. return 0;
  190. }
/* Release the start-cgroup reference acquired in bpf_iter_attach_cgroup(). */
static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
{
	cgroup_put(aux->cgroup.start);
}
  195. static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
  196. struct seq_file *seq)
  197. {
  198. char *buf;
  199. buf = kzalloc(PATH_MAX, GFP_KERNEL);
  200. if (!buf) {
  201. seq_puts(seq, "cgroup_path:\t<unknown>\n");
  202. goto show_order;
  203. }
  204. /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
  205. * will print nothing.
  206. *
  207. * Path is in the calling process's cgroup namespace.
  208. */
  209. cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
  210. current->nsproxy->cgroup_ns);
  211. seq_printf(seq, "cgroup_path:\t%s\n", buf);
  212. kfree(buf);
  213. show_order:
  214. if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
  215. seq_puts(seq, "order: descendants_pre\n");
  216. else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
  217. seq_puts(seq, "order: descendants_post\n");
  218. else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
  219. seq_puts(seq, "order: ancestors_up\n");
  220. else /* BPF_CGROUP_ITER_SELF_ONLY */
  221. seq_puts(seq, "order: self_only\n");
  222. }
  223. static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
  224. struct bpf_link_info *info)
  225. {
  226. info->iter.cgroup.order = aux->cgroup.order;
  227. info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
  228. return 0;
  229. }
/* Ctx argument signature of the "cgroup" iterator target (meta, cgroup). */
DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
		     struct cgroup *cgroup)
/* Registration record for the "cgroup" iterator target.
 * ctx->cgroup is PTR_TO_BTF_ID_OR_NULL because the prog receives a NULL
 * cgroup on the final post-processing call; its btf_id is filled in by
 * bpf_cgroup_iter_init() from bpf_cgroup_btf_id.
 */
static struct bpf_iter_reg bpf_cgroup_reg_info = {
	.target = "cgroup",
	.feature = BPF_ITER_RESCHED,
	.attach_target = bpf_iter_attach_cgroup,
	.detach_target = bpf_iter_detach_cgroup,
	.show_fdinfo = bpf_iter_cgroup_show_fdinfo,
	.fill_link_info = bpf_iter_cgroup_fill_link_info,
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__cgroup, cgroup),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &cgroup_iter_seq_info,
};
/* Register the "cgroup" iterator target at boot. */
static int __init bpf_cgroup_iter_init(void)
{
	/* Resolve struct cgroup's BTF id for the ctx argument first. */
	bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
	return bpf_iter_reg_target(&bpf_cgroup_reg_info);
}

late_initcall(bpf_cgroup_iter_init);