pids.c
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Process number limiting controller for cgroups.
  4. *
  5. * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
  6. * after a certain limit is reached.
  7. *
  8. * Since it is trivial to hit the task limit without hitting any kmemcg limits
  9. * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
  10. * preventable in the scope of a cgroup hierarchy by allowing resource limiting
  11. * of the number of tasks in a cgroup.
  12. *
  13. * In order to use the `pids` controller, set the maximum number of tasks in
  14. * pids.max (this is not available in the root cgroup for obvious reasons). The
  15. * number of processes currently in the cgroup is given by pids.current.
  16. * Organisational operations are not blocked by cgroup policies, so it is
  17. * possible to have pids.current > pids.max. However, it is not possible to
  18. * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
  19. * would cause a cgroup policy to be violated.
  20. *
  21. * To set a cgroup to have no limit, set pids.max to "max". This is the default
  22. * for all new cgroups (N.B. that PID limits are hierarchical, so the most
  23. * stringent limit in the hierarchy is followed).
  24. *
  25. * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
  26. * a superset of parent/child/pids.current.
  27. *
  28. * Copyright (C) 2015 Aleksa Sarai <[email protected]>
  29. */
  30. #include <linux/kernel.h>
  31. #include <linux/threads.h>
  32. #include <linux/atomic.h>
  33. #include <linux/cgroup.h>
  34. #include <linux/slab.h>
  35. #include <linux/sched/task.h>
/*
 * Sentinel meaning "no limit": one more than the largest representable pid,
 * so a hierarchical charge can never legitimately exceed it.
 */
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
/* String accepted by/printed from pids.max to denote %PIDS_MAX. */
#define PIDS_MAX_STR "max"

/* Per-cgroup state for the pids controller, embedded around its css. */
struct pids_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	atomic64_t counter;	/* current number of pids charged to this level */
	atomic64_t limit;	/* pids.max; %PIDS_MAX when unlimited */
	int64_t watermark;	/* historical maximum of counter (pids.peak) */
	/* Handle for "pids.events" */
	struct cgroup_file events_file;
	/* Number of times fork failed because limit was hit. */
	atomic64_t events_limit;
};
  52. static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
  53. {
  54. return container_of(css, struct pids_cgroup, css);
  55. }
  56. static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
  57. {
  58. return css_pids(pids->css.parent);
  59. }
  60. static struct cgroup_subsys_state *
  61. pids_css_alloc(struct cgroup_subsys_state *parent)
  62. {
  63. struct pids_cgroup *pids;
  64. pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
  65. if (!pids)
  66. return ERR_PTR(-ENOMEM);
  67. atomic64_set(&pids->counter, 0);
  68. atomic64_set(&pids->limit, PIDS_MAX);
  69. atomic64_set(&pids->events_limit, 0);
  70. return &pids->css;
  71. }
  72. static void pids_css_free(struct cgroup_subsys_state *css)
  73. {
  74. kfree(css_pids(css));
  75. }
  76. static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
  77. {
  78. /*
  79. * This is racy, but we don't need perfectly accurate tallying of
  80. * the watermark, and this lets us avoid extra atomic overhead.
  81. */
  82. if (nr_pids > READ_ONCE(p->watermark))
  83. WRITE_ONCE(p->watermark, nr_pids);
  84. }
/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * Subtracts @num from this level's counter only — ancestors are untouched
 * (use pids_uncharge() for a hierarchical uncharge). This function will WARN
 * if the pid count goes under 0, because such a case is a bug in the pids
 * controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
	/*
	 * A negative count (or overflow for that matter) is invalid,
	 * and indicates a bug in the `pids` controller proper.
	 */
	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}
  101. /**
  102. * pids_uncharge - hierarchically uncharge the pid count
  103. * @pids: the pid cgroup state
  104. * @num: the number of pids to uncharge
  105. */
  106. static void pids_uncharge(struct pids_cgroup *pids, int num)
  107. {
  108. struct pids_cgroup *p;
  109. for (p = pids; parent_pids(p); p = parent_pids(p))
  110. pids_cancel(p, num);
  111. }
  112. /**
  113. * pids_charge - hierarchically charge the pid count
  114. * @pids: the pid cgroup state
  115. * @num: the number of pids to charge
  116. *
  117. * This function does *not* follow the pid limit set. It cannot fail and the new
  118. * pid count may exceed the limit. This is only used for reverting failed
  119. * attaches, where there is no other way out than violating the limit.
  120. */
  121. static void pids_charge(struct pids_cgroup *pids, int num)
  122. {
  123. struct pids_cgroup *p;
  124. for (p = pids; parent_pids(p); p = parent_pids(p)) {
  125. int64_t new = atomic64_add_return(num, &p->counter);
  126. pids_update_watermark(p, new);
  127. }
  128. }
/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 *
 * On failure, every level that was already charged is uncharged again, so the
 * counters are left exactly as they were on entry.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p, *q;

	/* Charge bottom-up; the root (no parent) carries no counter. */
	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);
		int64_t limit = atomic64_read(&p->limit);

		/*
		 * Since new is capped to the maximum number of pid_t, if
		 * p->limit is %PIDS_MAX then we know that this test will never
		 * fail.
		 */
		if (new > limit)
			goto revert;

		/*
		 * Not technically accurate if we go over limit somewhere up
		 * the hierarchy, but that's tolerable for the watermark.
		 */
		pids_update_watermark(p, new);
	}
	return 0;

revert:
	/* Undo the charges applied below the level that rejected us ... */
	for (q = pids; q != p; q = parent_pids(q))
		pids_cancel(q, num);
	/* ... and the speculative charge applied at that level itself. */
	pids_cancel(p, num);

	return -EAGAIN;
}
  164. static int pids_can_attach(struct cgroup_taskset *tset)
  165. {
  166. struct task_struct *task;
  167. struct cgroup_subsys_state *dst_css;
  168. cgroup_taskset_for_each(task, dst_css, tset) {
  169. struct pids_cgroup *pids = css_pids(dst_css);
  170. struct cgroup_subsys_state *old_css;
  171. struct pids_cgroup *old_pids;
  172. /*
  173. * No need to pin @old_css between here and cancel_attach()
  174. * because cgroup core protects it from being freed before
  175. * the migration completes or fails.
  176. */
  177. old_css = task_css(task, pids_cgrp_id);
  178. old_pids = css_pids(old_css);
  179. pids_charge(pids, 1);
  180. pids_uncharge(old_pids, 1);
  181. }
  182. return 0;
  183. }
  184. static void pids_cancel_attach(struct cgroup_taskset *tset)
  185. {
  186. struct task_struct *task;
  187. struct cgroup_subsys_state *dst_css;
  188. cgroup_taskset_for_each(task, dst_css, tset) {
  189. struct pids_cgroup *pids = css_pids(dst_css);
  190. struct cgroup_subsys_state *old_css;
  191. struct pids_cgroup *old_pids;
  192. old_css = task_css(task, pids_cgrp_id);
  193. old_pids = css_pids(old_css);
  194. pids_charge(old_pids, 1);
  195. pids_uncharge(pids, 1);
  196. }
  197. }
  198. /*
  199. * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
  200. * on cgroup_threadgroup_change_begin() held by the copy_process().
  201. */
  202. static int pids_can_fork(struct task_struct *task, struct css_set *cset)
  203. {
  204. struct cgroup_subsys_state *css;
  205. struct pids_cgroup *pids;
  206. int err;
  207. if (cset)
  208. css = cset->subsys[pids_cgrp_id];
  209. else
  210. css = task_css_check(current, pids_cgrp_id, true);
  211. pids = css_pids(css);
  212. err = pids_try_charge(pids, 1);
  213. if (err) {
  214. /* Only log the first time events_limit is incremented. */
  215. if (atomic64_inc_return(&pids->events_limit) == 1) {
  216. pr_info("cgroup: fork rejected by pids controller in ");
  217. pr_cont_cgroup_path(css->cgroup);
  218. pr_cont("\n");
  219. }
  220. cgroup_file_notify(&pids->events_file);
  221. }
  222. return err;
  223. }
  224. static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
  225. {
  226. struct cgroup_subsys_state *css;
  227. struct pids_cgroup *pids;
  228. if (cset)
  229. css = cset->subsys[pids_cgrp_id];
  230. else
  231. css = task_css_check(current, pids_cgrp_id, true);
  232. pids = css_pids(css);
  233. pids_uncharge(pids, 1);
  234. }
  235. static void pids_release(struct task_struct *task)
  236. {
  237. struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
  238. pids_uncharge(pids, 1);
  239. }
  240. static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
  241. size_t nbytes, loff_t off)
  242. {
  243. struct cgroup_subsys_state *css = of_css(of);
  244. struct pids_cgroup *pids = css_pids(css);
  245. int64_t limit;
  246. int err;
  247. buf = strstrip(buf);
  248. if (!strcmp(buf, PIDS_MAX_STR)) {
  249. limit = PIDS_MAX;
  250. goto set_limit;
  251. }
  252. err = kstrtoll(buf, 0, &limit);
  253. if (err)
  254. return err;
  255. if (limit < 0 || limit >= PIDS_MAX)
  256. return -EINVAL;
  257. set_limit:
  258. /*
  259. * Limit updates don't need to be mutex'd, since it isn't
  260. * critical that any racing fork()s follow the new limit.
  261. */
  262. atomic64_set(&pids->limit, limit);
  263. return nbytes;
  264. }
  265. static int pids_max_show(struct seq_file *sf, void *v)
  266. {
  267. struct cgroup_subsys_state *css = seq_css(sf);
  268. struct pids_cgroup *pids = css_pids(css);
  269. int64_t limit = atomic64_read(&pids->limit);
  270. if (limit >= PIDS_MAX)
  271. seq_printf(sf, "%s\n", PIDS_MAX_STR);
  272. else
  273. seq_printf(sf, "%lld\n", limit);
  274. return 0;
  275. }
  276. static s64 pids_current_read(struct cgroup_subsys_state *css,
  277. struct cftype *cft)
  278. {
  279. struct pids_cgroup *pids = css_pids(css);
  280. return atomic64_read(&pids->counter);
  281. }
  282. static s64 pids_peak_read(struct cgroup_subsys_state *css,
  283. struct cftype *cft)
  284. {
  285. struct pids_cgroup *pids = css_pids(css);
  286. return READ_ONCE(pids->watermark);
  287. }
  288. static int pids_events_show(struct seq_file *sf, void *v)
  289. {
  290. struct pids_cgroup *pids = css_pids(seq_css(sf));
  291. seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
  292. return 0;
  293. }
/* Control files exposed by the pids controller (both cgroup v1 and v2). */
static struct cftype pids_files[] = {
	/* pids.max: writable limit; "max" means unlimited. */
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	/* pids.current: number of pids currently charged to this cgroup. */
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	/* pids.peak: high-water mark of pids.current. */
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = pids_peak_read,
	},
	/* pids.events: limit-hit counter; notified via events_file. */
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
/*
 * Subsystem descriptor registered with cgroup core. The same cftype table
 * serves both hierarchies, and .threaded allows use in threaded cgroups.
 */
struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc = pids_css_alloc,
	.css_free = pids_css_free,
	.can_attach = pids_can_attach,
	.cancel_attach = pids_cancel_attach,
	.can_fork = pids_can_fork,
	.cancel_fork = pids_cancel_fork,
	.release = pids_release,
	.legacy_cftypes = pids_files,
	.dfl_cftypes = pids_files,
	.threaded = true,
};