hugetlb_cgroup.c

/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <[email protected]>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
                                     bool rsvd)
{
        if (rsvd)
                return &h_cg->rsvd_hugepage[idx];
        return &h_cg->hugepage[idx];
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
        return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
        return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
        return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
        return hugetlb_cgroup_from_css(h_cg->css.parent);
}

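/*
 * Return true if any per-hstate fault counter in this cgroup still holds a
 * charge.
 */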
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
        struct hstate *h;

        for_each_hstate(h) {
                if (page_counter_read(
                            hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
                        return true;
        }
        return false;
}

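/*
 * Initialize the fault and reservation counters for every hstate, parenting
 * them to the corresponding counters of @parent_h_cgroup (if any) and setting
 * each limit to PAGE_COUNTER_MAX rounded down to a whole number of huge pages.
 */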
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
                                struct hugetlb_cgroup *parent_h_cgroup)
{
        int idx;

        for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
                struct page_counter *fault_parent = NULL;
                struct page_counter *rsvd_parent = NULL;
                unsigned long limit;
                int ret;

                if (parent_h_cgroup) {
                        fault_parent = hugetlb_cgroup_counter_from_cgroup(
                                parent_h_cgroup, idx);
                        rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
                                parent_h_cgroup, idx);
                }
                page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
                                                                     idx),
                                  fault_parent);
                page_counter_init(
                        hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
                        rsvd_parent);

                limit = round_down(PAGE_COUNTER_MAX,
                                   pages_per_huge_page(&hstates[idx]));

                ret = page_counter_set_max(
                        hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
                        limit);
                VM_BUG_ON(ret);
                ret = page_counter_set_max(
                        hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
                        limit);
                VM_BUG_ON(ret);
        }
}

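/* Free the per-node statistics arrays and the hugetlb_cgroup itself. */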
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
        int node;

        for_each_node(node)
                kfree(h_cgroup->nodeinfo[node]);
        kfree(h_cgroup);
}

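/*
 * Allocate a new hugetlb_cgroup (including its per-node usage array) when a
 * cgroup is created; the first cgroup allocated, which has no parent, becomes
 * the root.
 */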
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
        struct hugetlb_cgroup *h_cgroup;
        int node;

        h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
                           GFP_KERNEL);
        if (!h_cgroup)
                return ERR_PTR(-ENOMEM);

        if (!parent_h_cgroup)
                root_h_cgroup = h_cgroup;

        /*
         * TODO: this routine can waste much memory for nodes which will
         * never be onlined. It's better to use a memory hotplug callback
         * function.
         */
        for_each_node(node) {
                /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
                int node_to_alloc =
                        node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
                h_cgroup->nodeinfo[node] =
                        kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
                                     GFP_KERNEL, node_to_alloc);
                if (!h_cgroup->nodeinfo[node])
                        goto fail_alloc_nodeinfo;
        }

        hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
        return &h_cgroup->css;

fail_alloc_nodeinfo:
        hugetlb_cgroup_free(h_cgroup);
        return ERR_PTR(-ENOMEM);
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
        hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved off the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference or test for page activity here. This function
 * cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
                                       struct page *page)
{
        unsigned int nr_pages;
        struct page_counter *counter;
        struct hugetlb_cgroup *page_hcg;
        struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

        page_hcg = hugetlb_cgroup_from_page(page);
        /*
         * We can have pages on the active list without any cgroup, i.e. a
         * hugepage with fewer than 3 pages. We can safely ignore those pages.
         */
        if (!page_hcg || page_hcg != h_cg)
                goto out;

        nr_pages = compound_nr(page);
        if (!parent) {
                parent = root_h_cgroup;
                /* root has no limit */
                page_counter_charge(&parent->hugepage[idx], nr_pages);
        }
        counter = &h_cg->hugepage[idx];
        /* Take the pages off the local counter */
        page_counter_cancel(counter, nr_pages);

        set_hugetlb_cgroup(page, parent);
out:
        return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
        struct hstate *h;
        struct page *page;

        do {
                for_each_hstate(h) {
                        spin_lock_irq(&hugetlb_lock);
                        list_for_each_entry(page, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);

                        spin_unlock_irq(&hugetlb_lock);
                }
                cond_resched();
        } while (hugetlb_cgroup_have_usage(h_cg));
}

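/*
 * Record a hugetlb memory event: bump the local count for this cgroup, then
 * the hierarchical count for this cgroup and every ancestor below the root,
 * notifying the corresponding events files.
 */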
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
                                 enum hugetlb_memory_event event)
{
        atomic_long_inc(&hugetlb->events_local[idx][event]);
        cgroup_file_notify(&hugetlb->events_local_file[idx]);

        do {
                atomic_long_inc(&hugetlb->events[idx][event]);
                cgroup_file_notify(&hugetlb->events_file[idx]);
        } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
                 !hugetlb_cgroup_is_root(hugetlb));
}

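/*
 * Charge @nr_pages of hstate @idx to the current task's hugetlb cgroup,
 * against either the fault counter or, if @rsvd is set, the reservation
 * counter. The cgroup is returned through @ptr so the caller can later
 * commit the charge to a page or uncharge it.
 */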
static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                          struct hugetlb_cgroup **ptr,
                                          bool rsvd)
{
        int ret = 0;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = NULL;

        if (hugetlb_cgroup_disabled())
                goto done;
        /*
         * We don't charge any cgroup if the compound page has fewer
         * than 3 pages.
         */
        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                goto done;
again:
        rcu_read_lock();
        h_cg = hugetlb_cgroup_from_task(current);
        if (!css_tryget(&h_cg->css)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        if (!page_counter_try_charge(
                    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
                    nr_pages, &counter)) {
                ret = -ENOMEM;
                hugetlb_event(h_cg, idx, HUGETLB_MAX);
                css_put(&h_cg->css);
                goto done;
        }
        /* Reservations take a reference to the css because they do not get
         * reparented.
         */
        if (!rsvd)
                css_put(&h_cg->css);
done:
        *ptr = h_cg;
        return ret;
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                 struct hugetlb_cgroup **ptr)
{
        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                      struct hugetlb_cgroup **ptr)
{
        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}

/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                           struct hugetlb_cgroup *h_cg,
                                           struct page *page, bool rsvd)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        __set_hugetlb_cgroup(page, h_cg, rsvd);
        if (!rsvd) {
                unsigned long usage =
                        h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
                /*
                 * This write is not atomic due to fetching usage and writing
                 * to it, but that's fine because we call this with
                 * hugetlb_lock held anyway.
                 */
                WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
                           usage + nr_pages);
        }
}

void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                  struct hugetlb_cgroup *h_cg,
                                  struct page *page)
{
        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
}

void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
                                       struct hugetlb_cgroup *h_cg,
                                       struct page *page)
{
        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
}

/*
 * Should be called with hugetlb_lock held
 */
static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
                                           struct page *page, bool rsvd)
{
        struct hugetlb_cgroup *h_cg;

        if (hugetlb_cgroup_disabled())
                return;
        lockdep_assert_held(&hugetlb_lock);
        h_cg = __hugetlb_cgroup_from_page(page, rsvd);
        if (unlikely(!h_cg))
                return;
        __set_hugetlb_cgroup(page, NULL, rsvd);

        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
                                                                   rsvd),
                              nr_pages);

        if (rsvd)
                css_put(&h_cg->css);
        else {
                unsigned long usage =
                        h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
                /*
                 * This write is not atomic due to fetching usage and writing
                 * to it, but that's fine because we call this with
                 * hugetlb_lock held anyway.
                 */
                WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
                           usage - nr_pages);
        }
}

void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
                                  struct page *page)
{
        __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
}

void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
                                       struct page *page)
{
        __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
}

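/*
 * Uncharge @nr_pages directly from a cgroup's counter (no page involved);
 * for reservation charges this also drops the css reference taken at charge
 * time.
 */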
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                             struct hugetlb_cgroup *h_cg,
                                             bool rsvd)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
                return;

        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
                                                                   rsvd),
                              nr_pages);

        if (rsvd)
                css_put(&h_cg->css);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                    struct hugetlb_cgroup *h_cg)
{
        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                         struct hugetlb_cgroup *h_cg)
{
        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}

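/*
 * Uncharge the reservation counter referenced by @resv for the huge pages
 * covering [start, end) and drop the css reference held by the resv_map.
 */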
void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
                                     unsigned long end)
{
        if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
            !resv->css)
                return;

        page_counter_uncharge(resv->reservation_counter,
                              (end - start) * resv->pages_per_hpage);
        css_put(resv->css);
}

void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                         struct file_region *rg,
                                         unsigned long nr_pages,
                                         bool region_del)
{
        if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
                return;

        if (rg->reservation_counter && resv->pages_per_hpage &&
            !resv->reservation_counter) {
                page_counter_uncharge(rg->reservation_counter,
                                      nr_pages * resv->pages_per_hpage);
                /*
                 * Only do css_put(rg->css) when we delete the entire region
                 * because one file_region must hold exactly one css reference.
                 */
                if (region_del)
                        css_put(rg->css);
        }
}

enum {
        RES_USAGE,
        RES_RSVD_USAGE,
        RES_LIMIT,
        RES_RSVD_LIMIT,
        RES_MAX_USAGE,
        RES_RSVD_MAX_USAGE,
        RES_FAILCNT,
        RES_RSVD_FAILCNT,
};

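/*
 * Show per-node hugetlb usage. On the legacy (v1) hierarchy a
 * non-hierarchical total is printed first; the hierarchical per-node totals
 * are gathered by walking the css descendants of this cgroup.
 */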
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
        int nid;
        struct cftype *cft = seq_cft(seq);
        int idx = MEMFILE_IDX(cft->private);
        bool legacy = MEMFILE_ATTR(cft->private);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
        struct cgroup_subsys_state *css;
        unsigned long usage;

        if (legacy) {
                /* Add up usage across all nodes for the non-hierarchical total. */
                usage = 0;
                for_each_node_state(nid, N_MEMORY)
                        usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
                seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

                /* Simply print the per-node usage for the non-hierarchical total. */
                for_each_node_state(nid, N_MEMORY)
                        seq_printf(seq, " N%d=%lu", nid,
                                   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
                                           PAGE_SIZE);
                seq_putc(seq, '\n');
        }

        /*
         * The hierarchical total is pretty much the value recorded by the
         * counter, so use that.
         */
        seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
                   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

        /*
         * For each node, traverse the css tree to obtain the hierarchical
         * node usage.
         */
        for_each_node_state(nid, N_MEMORY) {
                usage = 0;
                rcu_read_lock();
                css_for_each_descendant_pre(css, &h_cg->css) {
                        usage += READ_ONCE(hugetlb_cgroup_from_css(css)
                                                   ->nodeinfo[nid]
                                                   ->usage[idx]);
                }
                rcu_read_unlock();
                seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
        }

        seq_putc(seq, '\n');

        return 0;
}

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
                                   struct cftype *cft)
{
        struct page_counter *counter;
        struct page_counter *rsvd_counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

        counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_RSVD_USAGE:
                return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->max * PAGE_SIZE;
        case RES_RSVD_LIMIT:
                return (u64)rsvd_counter->max * PAGE_SIZE;
        case RES_MAX_USAGE:
                return (u64)counter->watermark * PAGE_SIZE;
        case RES_RSVD_MAX_USAGE:
                return (u64)rsvd_counter->watermark * PAGE_SIZE;
        case RES_FAILCNT:
                return counter->failcnt;
        case RES_RSVD_FAILCNT:
                return rsvd_counter->failcnt;
        default:
                BUG();
        }
}

static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
        int idx;
        u64 val;
        struct cftype *cft = seq_cft(seq);
        unsigned long limit;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);
        counter = &h_cg->hugepage[idx];
        limit = round_down(PAGE_COUNTER_MAX,
                           pages_per_huge_page(&hstates[idx]));

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_RSVD_USAGE:
                counter = &h_cg->rsvd_hugepage[idx];
                fallthrough;
        case RES_USAGE:
                val = (u64)page_counter_read(counter);
                seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        case RES_RSVD_LIMIT:
                counter = &h_cg->rsvd_hugepage[idx];
                fallthrough;
        case RES_LIMIT:
                val = (u64)counter->max;
                if (val == limit)
                        seq_puts(seq, "max\n");
                else
                        seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        default:
                BUG();
        }

        return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off,
                                    const char *max)
{
        int ret, idx;
        unsigned long nr_pages;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
        bool rsvd = false;

        if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
                return -EINVAL;

        buf = strstrip(buf);
        ret = page_counter_memparse(buf, max, &nr_pages);
        if (ret)
                return ret;

        idx = MEMFILE_IDX(of_cft(of)->private);
        nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_RSVD_LIMIT:
                rsvd = true;
                fallthrough;
        case RES_LIMIT:
                mutex_lock(&hugetlb_limit_mutex);
                ret = page_counter_set_max(
                        __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
                        nr_pages);
                mutex_unlock(&hugetlb_limit_mutex);
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        int ret = 0;
        struct page_counter *counter, *rsvd_counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

        counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_MAX_USAGE:
                page_counter_reset_watermark(counter);
                break;
        case RES_RSVD_MAX_USAGE:
                page_counter_reset_watermark(rsvd_counter);
                break;
        case RES_FAILCNT:
                counter->failcnt = 0;
                break;
        case RES_RSVD_FAILCNT:
                rsvd_counter->failcnt = 0;
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

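/* Format a huge page size as a human-readable KB/MB/GB string. */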
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
        if (hsize >= SZ_1G)
                snprintf(buf, size, "%luGB", hsize / SZ_1G);
        else if (hsize >= SZ_1M)
                snprintf(buf, size, "%luMB", hsize / SZ_1M);
        else
                snprintf(buf, size, "%luKB", hsize / SZ_1K);
        return buf;
}

static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
        int idx;
        long max;
        struct cftype *cft = seq_cft(seq);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);

        if (local)
                max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
        else
                max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

        seq_printf(seq, "max %lu\n", max);

        return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, true);
}

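/*
 * Build the per-size cgroup v2 (default hierarchy) control files for hstate
 * @idx: max, rsvd.max, current, rsvd.current, events, events.local and
 * numa_stat.
 */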
static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, sizeof(buf), huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files_dfl[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->write = hugetlb_cgroup_write_dfl;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the reservation limit file */
        cft = &h->cgroup_files_dfl[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->write = hugetlb_cgroup_write_dfl;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the current usage file */
        cft = &h->cgroup_files_dfl[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the current reservation usage file */
        cft = &h->cgroup_files_dfl[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
        cft->seq_show = hugetlb_cgroup_read_u64_max;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the events file */
        cft = &h->cgroup_files_dfl[4];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_events_show;
        cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the events.local file */
        cft = &h->cgroup_files_dfl[5];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_events_local_show;
        cft->file_offset = offsetof(struct hugetlb_cgroup,
                                    events_local_file[idx]);
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* Add the numa stat file */
        cft = &h->cgroup_files_dfl[6];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
        cft->private = MEMFILE_PRIVATE(idx, 0);
        cft->seq_show = hugetlb_cgroup_read_numa_stat;
        cft->flags = CFTYPE_NOT_ON_ROOT;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files_dfl[7];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
                                       h->cgroup_files_dfl));
}

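/*
 * Build the per-size cgroup v1 (legacy hierarchy) control files for hstate
 * @idx: limit, usage, max usage and failcnt (plus their .rsvd variants) and
 * numa_stat.
 */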
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
        char buf[32];
        struct cftype *cft;
        struct hstate *h = &hstates[idx];

        /* format the size */
        mem_fmt(buf, sizeof(buf), huge_page_size(h));

        /* Add the limit file */
        cft = &h->cgroup_files_legacy[0];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
        cft->read_u64 = hugetlb_cgroup_read_u64;
        cft->write = hugetlb_cgroup_write_legacy;

        /* Add the reservation limit file */
        cft = &h->cgroup_files_legacy[1];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
        cft->read_u64 = hugetlb_cgroup_read_u64;
        cft->write = hugetlb_cgroup_write_legacy;

        /* Add the usage file */
        cft = &h->cgroup_files_legacy[2];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the reservation usage file */
        cft = &h->cgroup_files_legacy[3];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the MAX usage file */
        cft = &h->cgroup_files_legacy[4];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the MAX reservation usage file */
        cft = &h->cgroup_files_legacy[5];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the failcnt file */
        cft = &h->cgroup_files_legacy[6];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the reservation failcnt file */
        cft = &h->cgroup_files_legacy[7];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
        cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
        cft->write = hugetlb_cgroup_reset;
        cft->read_u64 = hugetlb_cgroup_read_u64;

        /* Add the numa stat file */
        cft = &h->cgroup_files_legacy[8];
        snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
        cft->private = MEMFILE_PRIVATE(idx, 1);
        cft->seq_show = hugetlb_cgroup_read_numa_stat;

        /* NULL terminate the last cft */
        cft = &h->cgroup_files_legacy[9];
        memset(cft, 0, sizeof(*cft));

        WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
                                          h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
        __hugetlb_cgroup_file_dfl_init(idx);
        __hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
{
        struct hstate *h;

        for_each_hstate(h) {
                /*
                 * Add cgroup control files only if the huge page consists
                 * of more than two normal pages. This is because we use
                 * page[2].private for storing cgroup details.
                 */
                if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
                        __hugetlb_cgroup_file_init(hstate_index(h));
        }
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
        struct hugetlb_cgroup *h_cg;
        struct hugetlb_cgroup *h_cg_rsvd;
        struct hstate *h = page_hstate(oldhpage);

        if (hugetlb_cgroup_disabled())
                return;

        spin_lock_irq(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_page(oldhpage);
        h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
        set_hugetlb_cgroup(oldhpage, NULL);
        set_hugetlb_cgroup_rsvd(oldhpage, NULL);

        /* move the h_cg details to new cgroup */
        set_hugetlb_cgroup(newhpage, h_cg);
        set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
        list_move(&newhpage->lru, &h->hugepage_activelist);
        spin_unlock_irq(&hugetlb_lock);
        return;
}

static struct cftype hugetlb_files[] = {
        {} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
        .css_alloc      = hugetlb_cgroup_css_alloc,
        .css_offline    = hugetlb_cgroup_css_offline,
        .css_free       = hugetlb_cgroup_css_free,
        .dfl_cftypes    = hugetlb_files,
        .legacy_cftypes = hugetlb_files,
};