cgroup-v1.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. #include "cgroup-internal.h"
  3. #include <linux/ctype.h>
  4. #include <linux/kmod.h>
  5. #include <linux/sort.h>
  6. #include <linux/delay.h>
  7. #include <linux/mm.h>
  8. #include <linux/sched/signal.h>
  9. #include <linux/sched/task.h>
  10. #include <linux/magic.h>
  11. #include <linux/slab.h>
  12. #include <linux/vmalloc.h>
  13. #include <linux/delayacct.h>
  14. #include <linux/pid_namespace.h>
  15. #include <linux/cgroupstats.h>
  16. #include <linux/fs_parser.h>
  17. #include <trace/events/cgroup.h>
  18. #include <trace/hooks/cgroup.h>
  19. /*
  20. * pidlists linger the following amount before being destroyed. The goal
  21. * is avoiding frequent destruction in the middle of consecutive read calls
  22. * Expiring in the middle is a performance problem not a correctness one.
  23. * 1 sec should be enough.
  24. */
  25. #define CGROUP_PIDLIST_DESTROY_DELAY HZ
  26. /* Controllers blocked by the commandline in v1 */
  27. static u16 cgroup_no_v1_mask;
  28. /* disable named v1 mounts */
  29. static bool cgroup_no_v1_named;
  30. /*
  31. * pidlist destructions need to be flushed on cgroup destruction. Use a
  32. * separate workqueue as flush domain.
  33. */
  34. static struct workqueue_struct *cgroup_pidlist_destroy_wq;
  35. /* protects cgroup_subsys->release_agent_path */
  36. static DEFINE_SPINLOCK(release_agent_path_lock);
  37. bool cgroup1_ssid_disabled(int ssid)
  38. {
  39. return cgroup_no_v1_mask & (1 << ssid);
  40. }
  41. /**
  42. * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
  43. * @from: attach to all cgroups of a given task
  44. * @tsk: the task to be attached
  45. *
  46. * Return: %0 on success or a negative errno code on failure
  47. */
  48. int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
  49. {
  50. struct cgroup_root *root;
  51. int retval = 0;
  52. cgroup_lock();
  53. cgroup_attach_lock(true);
  54. for_each_root(root) {
  55. struct cgroup *from_cgrp;
  56. spin_lock_irq(&css_set_lock);
  57. from_cgrp = task_cgroup_from_root(from, root);
  58. spin_unlock_irq(&css_set_lock);
  59. retval = cgroup_attach_task(from_cgrp, tsk, false);
  60. if (retval)
  61. break;
  62. }
  63. cgroup_attach_unlock(true);
  64. cgroup_unlock();
  65. return retval;
  66. }
  67. EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
  68. /**
  69. * cgroup_transfer_tasks - move tasks from one cgroup to another
  70. * @to: cgroup to which the tasks will be moved
  71. * @from: cgroup in which the tasks currently reside
  72. *
  73. * Locking rules between cgroup_post_fork() and the migration path
  74. * guarantee that, if a task is forking while being migrated, the new child
  75. * is guaranteed to be either visible in the source cgroup after the
  76. * parent's migration is complete or put into the target cgroup. No task
  77. * can slip out of migration through forking.
  78. *
  79. * Return: %0 on success or a negative errno code on failure
  80. */
  81. int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
  82. {
  83. DEFINE_CGROUP_MGCTX(mgctx);
  84. struct cgrp_cset_link *link;
  85. struct css_task_iter it;
  86. struct task_struct *task;
  87. int ret;
  88. if (cgroup_on_dfl(to))
  89. return -EINVAL;
  90. ret = cgroup_migrate_vet_dst(to);
  91. if (ret)
  92. return ret;
  93. cgroup_lock();
  94. cgroup_attach_lock(true);
  95. /* all tasks in @from are being moved, all csets are source */
  96. spin_lock_irq(&css_set_lock);
  97. list_for_each_entry(link, &from->cset_links, cset_link)
  98. cgroup_migrate_add_src(link->cset, to, &mgctx);
  99. spin_unlock_irq(&css_set_lock);
  100. ret = cgroup_migrate_prepare_dst(&mgctx);
  101. if (ret)
  102. goto out_err;
  103. /*
  104. * Migrate tasks one-by-one until @from is empty. This fails iff
  105. * ->can_attach() fails.
  106. */
  107. do {
  108. css_task_iter_start(&from->self, 0, &it);
  109. do {
  110. task = css_task_iter_next(&it);
  111. } while (task && (task->flags & PF_EXITING));
  112. if (task)
  113. get_task_struct(task);
  114. css_task_iter_end(&it);
  115. if (task) {
  116. ret = cgroup_migrate(task, false, &mgctx);
  117. if (!ret)
  118. TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
  119. put_task_struct(task);
  120. }
  121. } while (task && !ret);
  122. out_err:
  123. cgroup_migrate_finish(&mgctx);
  124. cgroup_attach_unlock(true);
  125. cgroup_unlock();
  126. return ret;
  127. }
  /*
   * Support for reading the 'tasks' and 'procs' files.
   *
   * A cgroup with *lots* of attached tasks can produce a large amount of
   * data, so a reader may need several read() calls to get all of it.  The
   * information cannot be guaranteed correct unless it is produced entirely
   * atomically.
   */

  /* Selects which of the two pidlist files is being handled */
  enum cgroup_filetype {
  	CGROUP_FILE_PROCS = 0,	/* "cgroup.procs" */
  	CGROUP_FILE_TASKS,	/* "tasks" */
  };
  142. /*
  143. * A pidlist is a list of pids that virtually represents the contents of one
  144. * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
  145. * a pair (one each for procs, tasks) for each pid namespace that's relevant
  146. * to the cgroup.
  147. */
  148. struct cgroup_pidlist {
  149. /*
  150. * used to find which pidlist is wanted. doesn't change as long as
  151. * this particular list stays in the list.
  152. */
  153. struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
  154. /* array of xids */
  155. pid_t *list;
  156. /* how many elements the above list has */
  157. int length;
  158. /* each of these stored in a list by its cgroup */
  159. struct list_head links;
  160. /* pointer to the cgroup we belong to, for list removal purposes */
  161. struct cgroup *owner;
  162. /* for delayed destruction */
  163. struct delayed_work destroy_dwork;
  164. };
  165. /*
  166. * Used to destroy all pidlists lingering waiting for destroy timer. None
  167. * should be left afterwards.
  168. */
  169. void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
  170. {
  171. struct cgroup_pidlist *l, *tmp_l;
  172. mutex_lock(&cgrp->pidlist_mutex);
  173. list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
  174. mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
  175. mutex_unlock(&cgrp->pidlist_mutex);
  176. flush_workqueue(cgroup_pidlist_destroy_wq);
  177. BUG_ON(!list_empty(&cgrp->pidlists));
  178. }
  179. static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
  180. {
  181. struct delayed_work *dwork = to_delayed_work(work);
  182. struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
  183. destroy_dwork);
  184. struct cgroup_pidlist *tofree = NULL;
  185. mutex_lock(&l->owner->pidlist_mutex);
  186. /*
  187. * Destroy iff we didn't get queued again. The state won't change
  188. * as destroy_dwork can only be queued while locked.
  189. */
  190. if (!delayed_work_pending(dwork)) {
  191. list_del(&l->links);
  192. kvfree(l->list);
  193. put_pid_ns(l->key.ns);
  194. tofree = l;
  195. }
  196. mutex_unlock(&l->owner->pidlist_mutex);
  197. kfree(tofree);
  198. }
  /*
   * pidlist_uniq - strip adjacent duplicate entries from an array, in place
   * @list: array of ids; duplicates are only detected when adjacent, so the
   *        caller is expected to have sorted it
   * @length: number of elements in @list
   *
   * Returns the number of elements kept, which then occupy the front of
   * @list.
   */
  static int pidlist_uniq(pid_t *list, int length)
  {
  	int keep = 1;	/* element 0 is always kept */
  	int cur;

  	/* nothing to keep in an empty list */
  	if (length == 0)
  		return 0;

  	/* copy forward each element that differs from its predecessor */
  	for (cur = 1; cur < length; cur++) {
  		if (list[cur] != list[cur - 1])
  			list[keep++] = list[cur];
  	}

  	return keep;
  }
  /*
   * The two pid files - tasks and cgroup.procs - guaranteed that the result
   * is sorted, which forced this whole pidlist fiasco.  As pid order is
   * different per namespace, each namespace needs a differently sorted list,
   * making it impossible to use, for example, a single rbtree of member tasks
   * sorted by task pointer.  As pidlists can be fairly large, allocating one
   * per open file is dangerous, so cgroup had to implement a shared pool of
   * pidlists keyed by cgroup and namespace.
   */

  /*
   * cmppid - three-way comparison of two pid_t values for sort()
   * @a: pointer to the first pid_t
   * @b: pointer to the second pid_t
   *
   * Returns a negative value, zero or a positive value when *@a is less
   * than, equal to or greater than *@b.  The values are compared rather
   * than subtracted so that no pair of inputs can overflow.
   */
  static int cmppid(const void *a, const void *b)
  {
  	const pid_t lhs = *(const pid_t *)a;
  	const pid_t rhs = *(const pid_t *)b;

  	return (lhs > rhs) - (lhs < rhs);
  }
  240. static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
  241. enum cgroup_filetype type)
  242. {
  243. struct cgroup_pidlist *l;
  244. /* don't need task_nsproxy() if we're looking at ourself */
  245. struct pid_namespace *ns = task_active_pid_ns(current);
  246. lockdep_assert_held(&cgrp->pidlist_mutex);
  247. list_for_each_entry(l, &cgrp->pidlists, links)
  248. if (l->key.type == type && l->key.ns == ns)
  249. return l;
  250. return NULL;
  251. }
  252. /*
  253. * find the appropriate pidlist for our purpose (given procs vs tasks)
  254. * returns with the lock on that pidlist already held, and takes care
  255. * of the use count, or returns NULL with no locks held if we're out of
  256. * memory.
  257. */
  258. static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
  259. enum cgroup_filetype type)
  260. {
  261. struct cgroup_pidlist *l;
  262. lockdep_assert_held(&cgrp->pidlist_mutex);
  263. l = cgroup_pidlist_find(cgrp, type);
  264. if (l)
  265. return l;
  266. /* entry not found; create a new one */
  267. l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
  268. if (!l)
  269. return l;
  270. INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
  271. l->key.type = type;
  272. /* don't need task_nsproxy() if we're looking at ourself */
  273. l->key.ns = get_pid_ns(task_active_pid_ns(current));
  274. l->owner = cgrp;
  275. list_add(&l->links, &cgrp->pidlists);
  276. return l;
  277. }
  278. /*
  279. * Load a cgroup's pidarray with either procs' tgids or tasks' pids
  280. */
  281. static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
  282. struct cgroup_pidlist **lp)
  283. {
  284. pid_t *array;
  285. int length;
  286. int pid, n = 0; /* used for populating the array */
  287. struct css_task_iter it;
  288. struct task_struct *tsk;
  289. struct cgroup_pidlist *l;
  290. lockdep_assert_held(&cgrp->pidlist_mutex);
  291. /*
  292. * If cgroup gets more users after we read count, we won't have
  293. * enough space - tough. This race is indistinguishable to the
  294. * caller from the case that the additional cgroup users didn't
  295. * show up until sometime later on.
  296. */
  297. length = cgroup_task_count(cgrp);
  298. array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
  299. if (!array)
  300. return -ENOMEM;
  301. /* now, populate the array */
  302. css_task_iter_start(&cgrp->self, 0, &it);
  303. while ((tsk = css_task_iter_next(&it))) {
  304. if (unlikely(n == length))
  305. break;
  306. /* get tgid or pid for procs or tasks file respectively */
  307. if (type == CGROUP_FILE_PROCS)
  308. pid = task_tgid_vnr(tsk);
  309. else
  310. pid = task_pid_vnr(tsk);
  311. if (pid > 0) /* make sure to only use valid results */
  312. array[n++] = pid;
  313. }
  314. css_task_iter_end(&it);
  315. length = n;
  316. /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
  317. sort(array, length, sizeof(pid_t), cmppid, NULL);
  318. length = pidlist_uniq(array, length);
  319. l = cgroup_pidlist_find_create(cgrp, type);
  320. if (!l) {
  321. kvfree(array);
  322. return -ENOMEM;
  323. }
  324. /* store array, freeing old if necessary */
  325. kvfree(l->list);
  326. l->list = array;
  327. l->length = length;
  328. *lp = l;
  329. return 0;
  330. }
  331. /*
  332. * seq_file methods for the tasks/procs files. The seq_file position is the
  333. * next pid to display; the seq_file iterator is a pointer to the pid
  334. * in the cgroup->l->list array.
  335. */
  336. static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
  337. {
  338. /*
  339. * Initially we receive a position value that corresponds to
  340. * one more than the last pid shown (or 0 on the first call or
  341. * after a seek to the start). Use a binary-search to find the
  342. * next pid to display, if any
  343. */
  344. struct kernfs_open_file *of = s->private;
  345. struct cgroup_file_ctx *ctx = of->priv;
  346. struct cgroup *cgrp = seq_css(s)->cgroup;
  347. struct cgroup_pidlist *l;
  348. enum cgroup_filetype type = seq_cft(s)->private;
  349. int index = 0, pid = *pos;
  350. int *iter, ret;
  351. mutex_lock(&cgrp->pidlist_mutex);
  352. /*
  353. * !NULL @ctx->procs1.pidlist indicates that this isn't the first
  354. * start() after open. If the matching pidlist is around, we can use
  355. * that. Look for it. Note that @ctx->procs1.pidlist can't be used
  356. * directly. It could already have been destroyed.
  357. */
  358. if (ctx->procs1.pidlist)
  359. ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
  360. /*
  361. * Either this is the first start() after open or the matching
  362. * pidlist has been destroyed inbetween. Create a new one.
  363. */
  364. if (!ctx->procs1.pidlist) {
  365. ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
  366. if (ret)
  367. return ERR_PTR(ret);
  368. }
  369. l = ctx->procs1.pidlist;
  370. if (pid) {
  371. int end = l->length;
  372. while (index < end) {
  373. int mid = (index + end) / 2;
  374. if (l->list[mid] == pid) {
  375. index = mid;
  376. break;
  377. } else if (l->list[mid] <= pid)
  378. index = mid + 1;
  379. else
  380. end = mid;
  381. }
  382. }
  383. /* If we're off the end of the array, we're done */
  384. if (index >= l->length)
  385. return NULL;
  386. /* Update the abstract position to be the actual pid that we found */
  387. iter = l->list + index;
  388. *pos = *iter;
  389. return iter;
  390. }
  391. static void cgroup_pidlist_stop(struct seq_file *s, void *v)
  392. {
  393. struct kernfs_open_file *of = s->private;
  394. struct cgroup_file_ctx *ctx = of->priv;
  395. struct cgroup_pidlist *l = ctx->procs1.pidlist;
  396. if (l)
  397. mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
  398. CGROUP_PIDLIST_DESTROY_DELAY);
  399. mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
  400. }
  401. static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
  402. {
  403. struct kernfs_open_file *of = s->private;
  404. struct cgroup_file_ctx *ctx = of->priv;
  405. struct cgroup_pidlist *l = ctx->procs1.pidlist;
  406. pid_t *p = v;
  407. pid_t *end = l->list + l->length;
  408. /*
  409. * Advance to the next pid in the array. If this goes off the
  410. * end, we're done
  411. */
  412. p++;
  413. if (p >= end) {
  414. (*pos)++;
  415. return NULL;
  416. } else {
  417. *pos = *p;
  418. return p;
  419. }
  420. }
  /*
   * cgroup_pidlist_show - seq_file ->show() for the tasks/procs files
   * @s: seq_file to print into
   * @v: iterator, a pointer to the pid to print
   *
   * Prints the pid in decimal followed by a newline.  Always returns 0.
   */
  static int cgroup_pidlist_show(struct seq_file *s, void *v)
  {
  	int *pid = v;

  	seq_printf(s, "%d\n", *pid);
  	return 0;
  }
  426. static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
  427. char *buf, size_t nbytes, loff_t off,
  428. bool threadgroup)
  429. {
  430. struct cgroup *cgrp;
  431. struct task_struct *task;
  432. const struct cred *cred, *tcred;
  433. ssize_t ret;
  434. bool locked;
  435. cgrp = cgroup_kn_lock_live(of->kn, false);
  436. if (!cgrp)
  437. return -ENODEV;
  438. task = cgroup_procs_write_start(buf, threadgroup, &locked, cgrp);
  439. ret = PTR_ERR_OR_ZERO(task);
  440. if (ret)
  441. goto out_unlock;
  442. /*
  443. * Even if we're attaching all tasks in the thread group, we only need
  444. * to check permissions on one of them. Check permissions using the
  445. * credentials from file open to protect against inherited fd attacks.
  446. */
  447. cred = of->file->f_cred;
  448. tcred = get_task_cred(task);
  449. if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
  450. !uid_eq(cred->euid, tcred->uid) &&
  451. !uid_eq(cred->euid, tcred->suid) &&
  452. !ns_capable(tcred->user_ns, CAP_SYS_NICE))
  453. ret = -EACCES;
  454. put_cred(tcred);
  455. if (ret)
  456. goto out_finish;
  457. ret = cgroup_attach_task(cgrp, task, threadgroup);
  458. trace_android_vh_cgroup_set_task(ret, task);
  459. out_finish:
  460. cgroup_procs_write_finish(task, locked);
  461. out_unlock:
  462. cgroup_kn_unlock(of->kn);
  463. return ret ?: nbytes;
  464. }
  465. static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
  466. char *buf, size_t nbytes, loff_t off)
  467. {
  468. return __cgroup1_procs_write(of, buf, nbytes, off, true);
  469. }
  470. static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
  471. char *buf, size_t nbytes, loff_t off)
  472. {
  473. return __cgroup1_procs_write(of, buf, nbytes, off, false);
  474. }
  475. static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
  476. char *buf, size_t nbytes, loff_t off)
  477. {
  478. struct cgroup *cgrp;
  479. struct cgroup_file_ctx *ctx;
  480. BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
  481. /*
  482. * Release agent gets called with all capabilities,
  483. * require capabilities to set release agent.
  484. */
  485. ctx = of->priv;
  486. if ((ctx->ns->user_ns != &init_user_ns) ||
  487. !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
  488. return -EPERM;
  489. cgrp = cgroup_kn_lock_live(of->kn, false);
  490. if (!cgrp)
  491. return -ENODEV;
  492. spin_lock(&release_agent_path_lock);
  493. strlcpy(cgrp->root->release_agent_path, strstrip(buf),
  494. sizeof(cgrp->root->release_agent_path));
  495. spin_unlock(&release_agent_path_lock);
  496. cgroup_kn_unlock(of->kn);
  497. return nbytes;
  498. }
  499. static int cgroup_release_agent_show(struct seq_file *seq, void *v)
  500. {
  501. struct cgroup *cgrp = seq_css(seq)->cgroup;
  502. spin_lock(&release_agent_path_lock);
  503. seq_puts(seq, cgrp->root->release_agent_path);
  504. spin_unlock(&release_agent_path_lock);
  505. seq_putc(seq, '\n');
  506. return 0;
  507. }
  /*
   * cgroup_sane_behavior_show - seq_file ->show() of "cgroup.sane_behavior"
   * @seq: seq_file to print into
   * @v: unused
   *
   * Unconditionally prints "0" and a newline.  Always returns 0.
   */
  static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
  {
  	const char *fixed_output = "0\n";

  	seq_puts(seq, fixed_output);
  	return 0;
  }
  513. static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
  514. struct cftype *cft)
  515. {
  516. return notify_on_release(css->cgroup);
  517. }
  518. static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
  519. struct cftype *cft, u64 val)
  520. {
  521. if (val)
  522. set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
  523. else
  524. clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
  525. return 0;
  526. }
  527. static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
  528. struct cftype *cft)
  529. {
  530. return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  531. }
  532. static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
  533. struct cftype *cft, u64 val)
  534. {
  535. if (val)
  536. set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  537. else
  538. clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  539. return 0;
  540. }
  541. /* cgroup core interface files for the legacy hierarchies */
  542. struct cftype cgroup1_base_files[] = {
  543. {
  544. .name = "cgroup.procs",
  545. .seq_start = cgroup_pidlist_start,
  546. .seq_next = cgroup_pidlist_next,
  547. .seq_stop = cgroup_pidlist_stop,
  548. .seq_show = cgroup_pidlist_show,
  549. .private = CGROUP_FILE_PROCS,
  550. .write = cgroup1_procs_write,
  551. },
  552. {
  553. .name = "cgroup.clone_children",
  554. .read_u64 = cgroup_clone_children_read,
  555. .write_u64 = cgroup_clone_children_write,
  556. },
  557. {
  558. .name = "cgroup.sane_behavior",
  559. .flags = CFTYPE_ONLY_ON_ROOT,
  560. .seq_show = cgroup_sane_behavior_show,
  561. },
  562. {
  563. .name = "tasks",
  564. .seq_start = cgroup_pidlist_start,
  565. .seq_next = cgroup_pidlist_next,
  566. .seq_stop = cgroup_pidlist_stop,
  567. .seq_show = cgroup_pidlist_show,
  568. .private = CGROUP_FILE_TASKS,
  569. .write = cgroup1_tasks_write,
  570. },
  571. {
  572. .name = "notify_on_release",
  573. .read_u64 = cgroup_read_notify_on_release,
  574. .write_u64 = cgroup_write_notify_on_release,
  575. },
  576. {
  577. .name = "release_agent",
  578. .flags = CFTYPE_ONLY_ON_ROOT,
  579. .seq_show = cgroup_release_agent_show,
  580. .write = cgroup_release_agent_write,
  581. .max_write_len = PATH_MAX - 1,
  582. },
  583. { } /* terminate */
  584. };
  585. /* Display information about each subsystem and each hierarchy */
  586. int proc_cgroupstats_show(struct seq_file *m, void *v)
  587. {
  588. struct cgroup_subsys *ss;
  589. int i;
  590. seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
  591. /*
  592. * Grab the subsystems state racily. No need to add avenue to
  593. * cgroup_mutex contention.
  594. */
  595. for_each_subsys(ss, i)
  596. seq_printf(m, "%s\t%d\t%d\t%d\n",
  597. ss->legacy_name, ss->root->hierarchy_id,
  598. atomic_read(&ss->root->nr_cgrps),
  599. cgroup_ssid_enabled(i));
  600. return 0;
  601. }
  602. /**
  603. * cgroupstats_build - build and fill cgroupstats
  604. * @stats: cgroupstats to fill information into
  605. * @dentry: A dentry entry belonging to the cgroup for which stats have
  606. * been requested.
  607. *
  608. * Build and fill cgroupstats so that taskstats can export it to user
  609. * space.
  610. *
  611. * Return: %0 on success or a negative errno code on failure
  612. */
  613. int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
  614. {
  615. struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
  616. struct cgroup *cgrp;
  617. struct css_task_iter it;
  618. struct task_struct *tsk;
  619. /* it should be kernfs_node belonging to cgroupfs and is a directory */
  620. if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
  621. kernfs_type(kn) != KERNFS_DIR)
  622. return -EINVAL;
  623. /*
  624. * We aren't being called from kernfs and there's no guarantee on
  625. * @kn->priv's validity. For this and css_tryget_online_from_dir(),
  626. * @kn->priv is RCU safe. Let's do the RCU dancing.
  627. */
  628. rcu_read_lock();
  629. cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
  630. if (!cgrp || !cgroup_tryget(cgrp)) {
  631. rcu_read_unlock();
  632. return -ENOENT;
  633. }
  634. rcu_read_unlock();
  635. css_task_iter_start(&cgrp->self, 0, &it);
  636. while ((tsk = css_task_iter_next(&it))) {
  637. switch (READ_ONCE(tsk->__state)) {
  638. case TASK_RUNNING:
  639. stats->nr_running++;
  640. break;
  641. case TASK_INTERRUPTIBLE:
  642. stats->nr_sleeping++;
  643. break;
  644. case TASK_UNINTERRUPTIBLE:
  645. stats->nr_uninterruptible++;
  646. break;
  647. case TASK_STOPPED:
  648. stats->nr_stopped++;
  649. break;
  650. default:
  651. if (tsk->in_iowait)
  652. stats->nr_io_wait++;
  653. break;
  654. }
  655. }
  656. css_task_iter_end(&it);
  657. cgroup_put(cgrp);
  658. return 0;
  659. }
  660. void cgroup1_check_for_release(struct cgroup *cgrp)
  661. {
  662. if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
  663. !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
  664. schedule_work(&cgrp->release_agent_work);
  665. }
  666. /*
  667. * Notify userspace when a cgroup is released, by running the
  668. * configured release agent with the name of the cgroup (path
  669. * relative to the root of cgroup file system) as the argument.
  670. *
  671. * Most likely, this user command will try to rmdir this cgroup.
  672. *
  673. * This races with the possibility that some other task will be
  674. * attached to this cgroup before it is removed, or that some other
  675. * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
  676. * The presumed 'rmdir' will fail quietly if this cgroup is no longer
  677. * unused, and this cgroup will be reprieved from its death sentence,
  678. * to continue to serve a useful existence. Next time it's released,
  679. * we will get notified again, if it still has 'notify_on_release' set.
  680. *
  681. * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
  682. * means only wait until the task is successfully execve()'d. The
  683. * separate release agent task is forked by call_usermodehelper(),
  684. * then control in this thread returns here, without waiting for the
  685. * release agent task. We don't bother to wait because the caller of
  686. * this routine has no use for the exit status of the release agent
  687. * task, so no sense holding our caller up for that.
  688. */
  689. void cgroup1_release_agent(struct work_struct *work)
  690. {
  691. struct cgroup *cgrp =
  692. container_of(work, struct cgroup, release_agent_work);
  693. char *pathbuf, *agentbuf;
  694. char *argv[3], *envp[3];
  695. int ret;
  696. /* snoop agent path and exit early if empty */
  697. if (!cgrp->root->release_agent_path[0])
  698. return;
  699. /* prepare argument buffers */
  700. pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
  701. agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
  702. if (!pathbuf || !agentbuf)
  703. goto out_free;
  704. spin_lock(&release_agent_path_lock);
  705. strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
  706. spin_unlock(&release_agent_path_lock);
  707. if (!agentbuf[0])
  708. goto out_free;
  709. ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
  710. if (ret < 0 || ret >= PATH_MAX)
  711. goto out_free;
  712. argv[0] = agentbuf;
  713. argv[1] = pathbuf;
  714. argv[2] = NULL;
  715. /* minimal command environment */
  716. envp[0] = "HOME=/";
  717. envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  718. envp[2] = NULL;
  719. call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
  720. out_free:
  721. kfree(agentbuf);
  722. kfree(pathbuf);
  723. }
  724. /*
  725. * cgroup_rename - Only allow simple rename of directories in place.
  726. */
  727. static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
  728. const char *new_name_str)
  729. {
  730. struct cgroup *cgrp = kn->priv;
  731. int ret;
  732. /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
  733. if (strchr(new_name_str, '\n'))
  734. return -EINVAL;
  735. if (kernfs_type(kn) != KERNFS_DIR)
  736. return -ENOTDIR;
  737. if (kn->parent != new_parent)
  738. return -EIO;
  739. /*
  740. * We're gonna grab cgroup_mutex which nests outside kernfs
  741. * active_ref. kernfs_rename() doesn't require active_ref
  742. * protection. Break them before grabbing cgroup_mutex.
  743. */
  744. kernfs_break_active_protection(new_parent);
  745. kernfs_break_active_protection(kn);
  746. cgroup_lock();
  747. ret = kernfs_rename(kn, new_parent, new_name_str);
  748. if (!ret)
  749. TRACE_CGROUP_PATH(rename, cgrp);
  750. cgroup_unlock();
  751. kernfs_unbreak_active_protection(kn);
  752. kernfs_unbreak_active_protection(new_parent);
  753. return ret;
  754. }
  755. static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
  756. {
  757. struct cgroup_root *root = cgroup_root_from_kf(kf_root);
  758. struct cgroup_subsys *ss;
  759. int ssid;
  760. for_each_subsys(ss, ssid)
  761. if (root->subsys_mask & (1 << ssid))
  762. seq_show_option(seq, ss->legacy_name, NULL);
  763. if (root->flags & CGRP_ROOT_NOPREFIX)
  764. seq_puts(seq, ",noprefix");
  765. if (root->flags & CGRP_ROOT_XATTR)
  766. seq_puts(seq, ",xattr");
  767. if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
  768. seq_puts(seq, ",cpuset_v2_mode");
  769. if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
  770. seq_puts(seq, ",favordynmods");
  771. spin_lock(&release_agent_path_lock);
  772. if (strlen(root->release_agent_path))
  773. seq_show_option(seq, "release_agent",
  774. root->release_agent_path);
  775. spin_unlock(&release_agent_path_lock);
  776. if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
  777. seq_puts(seq, ",clone_children");
  778. if (strlen(root->name))
  779. seq_show_option(seq, "name", root->name);
  780. return 0;
  781. }
/*
 * Tokens for the cgroup v1 mount options.  Each value is paired with
 * its option string in cgroup1_fs_parameters[] and dispatched on in
 * cgroup1_parse_param().
 */
enum cgroup1_param {
	Opt_all,		/* "all" */
	Opt_clone_children,	/* "clone_children" */
	Opt_cpuset_v2_mode,	/* "cpuset_v2_mode" */
	Opt_name,		/* "name=<string>" */
	Opt_none,		/* "none" */
	Opt_noprefix,		/* "noprefix" */
	Opt_release_agent,	/* "release_agent=<string>" */
	Opt_xattr,		/* "xattr" */
	Opt_favordynmods,	/* "favordynmods" */
	Opt_nofavordynmods,	/* "nofavordynmods" */
};
/*
 * Mount option table handed to fs_parse() by cgroup1_parse_param().
 * Flag entries take no value; "name" and "release_agent" take a string.
 * Keys that are not listed here are not matched by this table;
 * cgroup1_parse_param() then tries them as a mount source and as a
 * controller legacy name.
 */
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
	fsparam_flag ("all", Opt_all),
	fsparam_flag ("clone_children", Opt_clone_children),
	fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
	fsparam_string("name", Opt_name),
	fsparam_flag ("none", Opt_none),
	fsparam_flag ("noprefix", Opt_noprefix),
	fsparam_string("release_agent", Opt_release_agent),
	fsparam_flag ("xattr", Opt_xattr),
	fsparam_flag ("favordynmods", Opt_favordynmods),
	fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
	{}	/* empty terminating entry */
};
  807. int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
  808. {
  809. struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  810. struct cgroup_subsys *ss;
  811. struct fs_parse_result result;
  812. int opt, i;
  813. opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
  814. if (opt == -ENOPARAM) {
  815. int ret;
  816. ret = vfs_parse_fs_param_source(fc, param);
  817. if (ret != -ENOPARAM)
  818. return ret;
  819. for_each_subsys(ss, i) {
  820. if (strcmp(param->key, ss->legacy_name))
  821. continue;
  822. if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
  823. return invalfc(fc, "Disabled controller '%s'",
  824. param->key);
  825. ctx->subsys_mask |= (1 << i);
  826. return 0;
  827. }
  828. return invalfc(fc, "Unknown subsys name '%s'", param->key);
  829. }
  830. if (opt < 0)
  831. return opt;
  832. switch (opt) {
  833. case Opt_none:
  834. /* Explicitly have no subsystems */
  835. ctx->none = true;
  836. break;
  837. case Opt_all:
  838. ctx->all_ss = true;
  839. break;
  840. case Opt_noprefix:
  841. ctx->flags |= CGRP_ROOT_NOPREFIX;
  842. break;
  843. case Opt_clone_children:
  844. ctx->cpuset_clone_children = true;
  845. break;
  846. case Opt_cpuset_v2_mode:
  847. ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
  848. break;
  849. case Opt_xattr:
  850. ctx->flags |= CGRP_ROOT_XATTR;
  851. break;
  852. case Opt_favordynmods:
  853. ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
  854. break;
  855. case Opt_nofavordynmods:
  856. ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
  857. break;
  858. case Opt_release_agent:
  859. /* Specifying two release agents is forbidden */
  860. if (ctx->release_agent)
  861. return invalfc(fc, "release_agent respecified");
  862. /*
  863. * Release agent gets called with all capabilities,
  864. * require capabilities to set release agent.
  865. */
  866. if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
  867. return invalfc(fc, "Setting release_agent not allowed");
  868. ctx->release_agent = param->string;
  869. param->string = NULL;
  870. break;
  871. case Opt_name:
  872. /* blocked by boot param? */
  873. if (cgroup_no_v1_named)
  874. return -ENOENT;
  875. /* Can't specify an empty name */
  876. if (!param->size)
  877. return invalfc(fc, "Empty name");
  878. if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
  879. return invalfc(fc, "Name too long");
  880. /* Must match [\w.-]+ */
  881. for (i = 0; i < param->size; i++) {
  882. char c = param->string[i];
  883. if (isalnum(c))
  884. continue;
  885. if ((c == '.') || (c == '-') || (c == '_'))
  886. continue;
  887. return invalfc(fc, "Invalid name");
  888. }
  889. /* Specifying two names is forbidden */
  890. if (ctx->name)
  891. return invalfc(fc, "name respecified");
  892. ctx->name = param->string;
  893. param->string = NULL;
  894. break;
  895. }
  896. return 0;
  897. }
  898. static int check_cgroupfs_options(struct fs_context *fc)
  899. {
  900. struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  901. u16 mask = U16_MAX;
  902. u16 enabled = 0;
  903. struct cgroup_subsys *ss;
  904. int i;
  905. #ifdef CONFIG_CPUSETS
  906. mask = ~((u16)1 << cpuset_cgrp_id);
  907. #endif
  908. for_each_subsys(ss, i)
  909. if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
  910. enabled |= 1 << i;
  911. ctx->subsys_mask &= enabled;
  912. /*
  913. * In absence of 'none', 'name=' and subsystem name options,
  914. * let's default to 'all'.
  915. */
  916. if (!ctx->subsys_mask && !ctx->none && !ctx->name)
  917. ctx->all_ss = true;
  918. if (ctx->all_ss) {
  919. /* Mutually exclusive option 'all' + subsystem name */
  920. if (ctx->subsys_mask)
  921. return invalfc(fc, "subsys name conflicts with all");
  922. /* 'all' => select all the subsystems */
  923. ctx->subsys_mask = enabled;
  924. }
  925. /*
  926. * We either have to specify by name or by subsystems. (So all
  927. * empty hierarchies must have a name).
  928. */
  929. if (!ctx->subsys_mask && !ctx->name)
  930. return invalfc(fc, "Need name or subsystem set");
  931. /*
  932. * Option noprefix was introduced just for backward compatibility
  933. * with the old cpuset, so we allow noprefix only if mounting just
  934. * the cpuset subsystem.
  935. */
  936. if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
  937. return invalfc(fc, "noprefix used incorrectly");
  938. /* Can't specify "none" and some subsystems */
  939. if (ctx->subsys_mask && ctx->none)
  940. return invalfc(fc, "none used incorrectly");
  941. return 0;
  942. }
  943. int cgroup1_reconfigure(struct fs_context *fc)
  944. {
  945. struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  946. struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
  947. struct cgroup_root *root = cgroup_root_from_kf(kf_root);
  948. int ret = 0;
  949. u16 added_mask, removed_mask;
  950. cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  951. /* See what subsystems are wanted */
  952. ret = check_cgroupfs_options(fc);
  953. if (ret)
  954. goto out_unlock;
  955. if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
  956. pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
  957. task_tgid_nr(current), current->comm);
  958. added_mask = ctx->subsys_mask & ~root->subsys_mask;
  959. removed_mask = root->subsys_mask & ~ctx->subsys_mask;
  960. /* Don't allow flags or name to change at remount */
  961. if ((ctx->flags ^ root->flags) ||
  962. (ctx->name && strcmp(ctx->name, root->name))) {
  963. errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
  964. ctx->flags, ctx->name ?: "", root->flags, root->name);
  965. ret = -EINVAL;
  966. goto out_unlock;
  967. }
  968. /* remounting is not allowed for populated hierarchies */
  969. if (!list_empty(&root->cgrp.self.children)) {
  970. ret = -EBUSY;
  971. goto out_unlock;
  972. }
  973. ret = rebind_subsystems(root, added_mask);
  974. if (ret)
  975. goto out_unlock;
  976. WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
  977. if (ctx->release_agent) {
  978. spin_lock(&release_agent_path_lock);
  979. strcpy(root->release_agent_path, ctx->release_agent);
  980. spin_unlock(&release_agent_path_lock);
  981. }
  982. trace_cgroup_remount(root);
  983. out_unlock:
  984. cgroup_unlock();
  985. return ret;
  986. }
  987. struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
  988. .rename = cgroup1_rename,
  989. .show_options = cgroup1_show_options,
  990. .mkdir = cgroup_mkdir,
  991. .rmdir = cgroup_rmdir,
  992. .show_path = cgroup_show_path,
  993. };
  994. /*
  995. * The guts of cgroup1 mount - find or create cgroup_root to use.
  996. * Called with cgroup_mutex held; returns 0 on success, -E... on
  997. * error and positive - in case when the candidate is busy dying.
  998. * On success it stashes a reference to cgroup_root into given
  999. * cgroup_fs_context; that reference is *NOT* counting towards the
  1000. * cgroup_root refcount.
  1001. */
  1002. static int cgroup1_root_to_use(struct fs_context *fc)
  1003. {
  1004. struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  1005. struct cgroup_root *root;
  1006. struct cgroup_subsys *ss;
  1007. int i, ret;
  1008. /* First find the desired set of subsystems */
  1009. ret = check_cgroupfs_options(fc);
  1010. if (ret)
  1011. return ret;
  1012. /*
  1013. * Destruction of cgroup root is asynchronous, so subsystems may
  1014. * still be dying after the previous unmount. Let's drain the
  1015. * dying subsystems. We just need to ensure that the ones
  1016. * unmounted previously finish dying and don't care about new ones
  1017. * starting. Testing ref liveliness is good enough.
  1018. */
  1019. for_each_subsys(ss, i) {
  1020. if (!(ctx->subsys_mask & (1 << i)) ||
  1021. ss->root == &cgrp_dfl_root)
  1022. continue;
  1023. if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
  1024. return 1; /* restart */
  1025. cgroup_put(&ss->root->cgrp);
  1026. }
  1027. for_each_root(root) {
  1028. bool name_match = false;
  1029. if (root == &cgrp_dfl_root)
  1030. continue;
  1031. /*
  1032. * If we asked for a name then it must match. Also, if
  1033. * name matches but sybsys_mask doesn't, we should fail.
  1034. * Remember whether name matched.
  1035. */
  1036. if (ctx->name) {
  1037. if (strcmp(ctx->name, root->name))
  1038. continue;
  1039. name_match = true;
  1040. }
  1041. /*
  1042. * If we asked for subsystems (or explicitly for no
  1043. * subsystems) then they must match.
  1044. */
  1045. if ((ctx->subsys_mask || ctx->none) &&
  1046. (ctx->subsys_mask != root->subsys_mask)) {
  1047. if (!name_match)
  1048. continue;
  1049. return -EBUSY;
  1050. }
  1051. if (root->flags ^ ctx->flags)
  1052. pr_warn("new mount options do not match the existing superblock, will be ignored\n");
  1053. ctx->root = root;
  1054. return 0;
  1055. }
  1056. /*
  1057. * No such thing, create a new one. name= matching without subsys
  1058. * specification is allowed for already existing hierarchies but we
  1059. * can't create new one without subsys specification.
  1060. */
  1061. if (!ctx->subsys_mask && !ctx->none)
  1062. return invalfc(fc, "No subsys list or none specified");
  1063. /* Hierarchies may only be created in the initial cgroup namespace. */
  1064. if (ctx->ns != &init_cgroup_ns)
  1065. return -EPERM;
  1066. root = kzalloc(sizeof(*root), GFP_KERNEL);
  1067. if (!root)
  1068. return -ENOMEM;
  1069. ctx->root = root;
  1070. init_cgroup_root(ctx);
  1071. ret = cgroup_setup_root(root, ctx->subsys_mask);
  1072. if (!ret)
  1073. cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
  1074. else
  1075. cgroup_free_root(root);
  1076. return ret;
  1077. }
  1078. int cgroup1_get_tree(struct fs_context *fc)
  1079. {
  1080. struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
  1081. int ret;
  1082. /* Check if the caller has permission to mount. */
  1083. if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
  1084. return -EPERM;
  1085. cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  1086. ret = cgroup1_root_to_use(fc);
  1087. if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
  1088. ret = 1; /* restart */
  1089. cgroup_unlock();
  1090. if (!ret)
  1091. ret = cgroup_do_get_tree(fc);
  1092. if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
  1093. fc_drop_locked(fc);
  1094. ret = 1;
  1095. }
  1096. if (unlikely(ret > 0)) {
  1097. msleep(10);
  1098. return restart_syscall();
  1099. }
  1100. return ret;
  1101. }
  1102. static int __init cgroup1_wq_init(void)
  1103. {
  1104. /*
  1105. * Used to destroy pidlists and separate to serve as flush domain.
  1106. * Cap @max_active to 1 too.
  1107. */
  1108. cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
  1109. 0, 1);
  1110. BUG_ON(!cgroup_pidlist_destroy_wq);
  1111. return 0;
  1112. }
  1113. core_initcall(cgroup1_wq_init);
  1114. static int __init cgroup_no_v1(char *str)
  1115. {
  1116. struct cgroup_subsys *ss;
  1117. char *token;
  1118. int i;
  1119. while ((token = strsep(&str, ",")) != NULL) {
  1120. if (!*token)
  1121. continue;
  1122. if (!strcmp(token, "all")) {
  1123. cgroup_no_v1_mask = U16_MAX;
  1124. continue;
  1125. }
  1126. if (!strcmp(token, "named")) {
  1127. cgroup_no_v1_named = true;
  1128. continue;
  1129. }
  1130. for_each_subsys(ss, i) {
  1131. if (strcmp(token, ss->name) &&
  1132. strcmp(token, ss->legacy_name))
  1133. continue;
  1134. cgroup_no_v1_mask |= 1 << i;
  1135. }
  1136. }
  1137. return 1;
  1138. }
  1139. __setup("cgroup_no_v1=", cgroup_no_v1);