workingset.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Workingset detection
  4. *
  5. * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
  6. */
  7. #include <linux/memcontrol.h>
  8. #include <linux/mm_inline.h>
  9. #include <linux/writeback.h>
  10. #include <linux/shmem_fs.h>
  11. #include <linux/pagemap.h>
  12. #include <linux/atomic.h>
  13. #include <linux/module.h>
  14. #include <linux/swap.h>
  15. #include <linux/dax.h>
  16. #include <linux/fs.h>
  17. #include <linux/mm.h>
  18. /*
  19. * Double CLOCK lists
  20. *
  21. * Per node, two clock lists are maintained for file pages: the
  22. * inactive and the active list. Freshly faulted pages start out at
  23. * the head of the inactive list and page reclaim scans pages from the
  24. * tail. Pages that are accessed multiple times on the inactive list
  25. * are promoted to the active list, to protect them from reclaim,
  26. * whereas active pages are demoted to the inactive list when the
  27. * active list grows too big.
  28. *
  29. *   fault ------------------------+
  30. *                                 |
  31. *              +--------------+   |            +-------------+
  32. *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
  33. *              +--------------+                +-------------+    |
  34. *                     |                                           |
  35. *                     +-------------- promotion ------------------+
  36. *
  37. *
  38. * Access frequency and refault distance
  39. *
  40. * A workload is thrashing when its pages are frequently used but they
  41. * are evicted from the inactive list every time before another access
  42. * would have promoted them to the active list.
  43. *
  44. * In cases where the average access distance between thrashing pages
  45. * is bigger than the size of memory there is nothing that can be
  46. * done - the thrashing set could never fit into memory under any
  47. * circumstance.
  48. *
  49. * However, the average access distance could be bigger than the
  50. * inactive list, yet smaller than the size of memory. In this case,
  51. * the set could fit into memory if it weren't for the currently
  52. * active pages - which may be used more, hopefully less frequently:
  53. *
  54. *      +-memory available to cache-+
  55. *      |                           |
  56. *      +-inactive------+-active----+
  57. *  a b | c d e f g h i | J K L M N |
  58. *      +---------------+-----------+
  59. *
  60. * It is prohibitively expensive to accurately track access frequency
  61. * of pages. But a reasonable approximation can be made to measure
  62. * thrashing on the inactive list, after which refaulting pages can be
  63. * activated optimistically to compete with the existing active pages.
  64. *
  65. * Approximating inactive page access frequency - Observations:
  66. *
  67. * 1. When a page is accessed for the first time, it is added to the
  68. * head of the inactive list, slides every existing inactive page
  69. * towards the tail by one slot, and pushes the current tail page
  70. * out of memory.
  71. *
  72. * 2. When a page is accessed for the second time, it is promoted to
  73. * the active list, shrinking the inactive list by one slot. This
  74. * also slides all inactive pages that were faulted into the cache
  75. * more recently than the activated page towards the tail of the
  76. * inactive list.
  77. *
  78. * Thus:
  79. *
  80. * 1. The sum of evictions and activations between any two points in
  81. * time indicates the minimum number of inactive pages accessed in
  82. * between.
  83. *
  84. * 2. Moving one inactive page N page slots towards the tail of the
  85. * list requires at least N inactive page accesses.
  86. *
  87. * Combining these:
  88. *
  89. * 1. When a page is finally evicted from memory, the number of
  90. * inactive pages accessed while the page was in cache is at least
  91. * the number of page slots on the inactive list.
  92. *
  93. * 2. In addition, measuring the sum of evictions and activations (E)
  94. * at the time of a page's eviction, and comparing it to another
  95. * reading (R) at the time the page faults back into memory tells
  96. * the minimum number of accesses while the page was not cached.
  97. * This is called the refault distance.
  98. *
  99. * Because the first access of the page was the fault and the second
  100. * access the refault, we combine the in-cache distance with the
  101. * out-of-cache distance to get the complete minimum access distance
  102. * of this page:
  103. *
  104. * NR_inactive + (R - E)
  105. *
  106. * And knowing the minimum access distance of a page, we can easily
  107. * tell if the page would be able to stay in cache assuming all page
  108. * slots in the cache were available:
  109. *
  110. * NR_inactive + (R - E) <= NR_inactive + NR_active
  111. *
  112. * which can be further simplified to
  113. *
  114. * (R - E) <= NR_active
  115. *
  116. * Put into words, the refault distance (out-of-cache) can be seen as
  117. * a deficit in inactive list space (in-cache). If the inactive list
  118. * had (R - E) more page slots, the page would not have been evicted
  119. * in between accesses, but activated instead. And on a full system,
  120. * the only thing eating into inactive list space is active pages.
  121. *
  122. *
  123. * Refaulting inactive pages
  124. *
  125. * All that is known about the active list is that the pages have been
  126. * accessed more than once in the past. This means that at any given
  127. * time there is actually a good chance that pages on the active list
  128. * are no longer in active use.
  129. *
  130. * So when a refault distance of (R - E) is observed and there are at
  131. * least (R - E) active pages, the refaulting page is activated
  132. * optimistically in the hope that (R - E) active pages are actually
  133. * used less frequently than the refaulting page - or even not used at
  134. * all anymore.
  135. *
  136. * That means if inactive cache is refaulting with a suitable refault
  137. * distance, we assume the cache workingset is transitioning and put
  138. * pressure on the current active list.
  139. *
  140. * If this is wrong and demotion kicks in, the pages which are truly
  141. * used more frequently will be reactivated while the less frequently
  142. * used ones will be evicted from memory.
  143. *
  144. * But if this is right, the stale pages will be pushed out of memory
  145. * and the used pages get to stay in cache.
  146. *
  147. * Refaulting active pages
  148. *
  149. * If on the other hand the refaulting pages have recently been
  150. * deactivated, it means that the active list is no longer protecting
  151. * actively used cache from reclaim. The cache is NOT transitioning to
  152. * a different workingset; the existing workingset is thrashing in the
  153. * space allocated to the page cache.
  154. *
  155. *
  156. * Implementation
  157. *
  158. * For each node's LRU lists, a counter for inactive evictions and
  159. * activations is maintained (node->nonresident_age).
  160. *
  161. * On eviction, a snapshot of this counter (along with some bits to
  162. * identify the node) is stored in the now empty page cache
  163. * slot of the evicted page. This is called a shadow entry.
  164. *
  165. * On cache misses for which there are shadow entries, an eligible
  166. * refault distance will immediately activate the refaulting page.
  167. */
  168. #define WORKINGSET_SHIFT 1
  169. #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
  170. WORKINGSET_SHIFT + NODES_SHIFT + \
  171. MEM_CGROUP_ID_SHIFT)
  172. #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
  173. /*
  174. * Eviction timestamps need to be able to cover the full range of
  175. * actionable refaults. However, bits are tight in the xarray
  176. * entry, and after storing the identifier for the lruvec there might
  177. * not be enough left to represent every single actionable refault. In
  178. * that case, we have to sacrifice granularity for distance, and group
  179. * evictions into coarser buckets by shaving off lower timestamp bits.
  180. */
  181. static unsigned int bucket_order __read_mostly;
  182. static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
  183. bool workingset)
  184. {
  185. eviction &= EVICTION_MASK;
  186. eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
  187. eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
  188. eviction = (eviction << WORKINGSET_SHIFT) | workingset;
  189. return xa_mk_value(eviction);
  190. }
  191. static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
  192. unsigned long *evictionp, bool *workingsetp)
  193. {
  194. unsigned long entry = xa_to_value(shadow);
  195. int memcgid, nid;
  196. bool workingset;
  197. workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
  198. entry >>= WORKINGSET_SHIFT;
  199. nid = entry & ((1UL << NODES_SHIFT) - 1);
  200. entry >>= NODES_SHIFT;
  201. memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
  202. entry >>= MEM_CGROUP_ID_SHIFT;
  203. *memcgidp = memcgid;
  204. *pgdat = NODE_DATA(nid);
  205. *evictionp = entry;
  206. *workingsetp = workingset;
  207. }
  208. #ifdef CONFIG_LRU_GEN
  209. static void *lru_gen_eviction(struct folio *folio)
  210. {
  211. int hist;
  212. unsigned long token;
  213. unsigned long min_seq;
  214. struct lruvec *lruvec;
  215. struct lru_gen_folio *lrugen;
  216. int type = folio_is_file_lru(folio);
  217. int delta = folio_nr_pages(folio);
  218. int refs = folio_lru_refs(folio);
  219. int tier = lru_tier_from_refs(refs);
  220. struct mem_cgroup *memcg = folio_memcg(folio);
  221. struct pglist_data *pgdat = folio_pgdat(folio);
  222. BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
  223. lruvec = mem_cgroup_lruvec(memcg, pgdat);
  224. lrugen = &lruvec->lrugen;
  225. min_seq = READ_ONCE(lrugen->min_seq[type]);
  226. token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
  227. hist = lru_hist_from_seq(min_seq);
  228. atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
  229. return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
  230. }
  231. static void lru_gen_refault(struct folio *folio, void *shadow)
  232. {
  233. int hist, tier, refs;
  234. int memcg_id;
  235. bool workingset;
  236. unsigned long token;
  237. unsigned long min_seq;
  238. struct lruvec *lruvec;
  239. struct lru_gen_folio *lrugen;
  240. struct mem_cgroup *memcg;
  241. struct pglist_data *pgdat;
  242. int type = folio_is_file_lru(folio);
  243. int delta = folio_nr_pages(folio);
  244. unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
  245. if (pgdat != folio_pgdat(folio))
  246. return;
  247. rcu_read_lock();
  248. memcg = folio_memcg_rcu(folio);
  249. if (memcg_id != mem_cgroup_id(memcg))
  250. goto unlock;
  251. lruvec = mem_cgroup_lruvec(memcg, pgdat);
  252. lrugen = &lruvec->lrugen;
  253. mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
  254. min_seq = READ_ONCE(lrugen->min_seq[type]);
  255. if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
  256. goto unlock;
  257. hist = lru_hist_from_seq(min_seq);
  258. /* see the comment in folio_lru_refs() */
  259. refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
  260. tier = lru_tier_from_refs(refs);
  261. atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
  262. mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
  263. /*
  264. * Count the following two cases as stalls:
  265. * 1. For pages accessed through page tables, hotter pages pushed out
  266. * hot pages which refaulted immediately.
  267. * 2. For pages accessed multiple times through file descriptors,
  268. * numbers of accesses might have been out of the range.
  269. */
  270. if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
  271. folio_set_workingset(folio);
  272. mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
  273. }
  274. unlock:
  275. rcu_read_unlock();
  276. }
  277. #else /* !CONFIG_LRU_GEN */
  278. static void *lru_gen_eviction(struct folio *folio)
  279. {
  280. return NULL;
  281. }
  282. static void lru_gen_refault(struct folio *folio, void *shadow)
  283. {
  284. }
  285. #endif /* CONFIG_LRU_GEN */
  286. /**
  287. * workingset_age_nonresident - age non-resident entries as LRU ages
  288. * @lruvec: the lruvec that was aged
  289. * @nr_pages: the number of pages to count
  290. *
  291. * As in-memory pages are aged, non-resident pages need to be aged as
  292. * well, in order for the refault distances later on to be comparable
  293. * to the in-memory dimensions. This function allows reclaim and LRU
  294. * operations to drive the non-resident aging along in parallel.
  295. */
  296. void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
  297. {
  298. /*
  299. * Reclaiming a cgroup means reclaiming all its children in a
  300. * round-robin fashion. That means that each cgroup has an LRU
  301. * order that is composed of the LRU orders of its child
  302. * cgroups; and every page has an LRU position not just in the
  303. * cgroup that owns it, but in all of that group's ancestors.
  304. *
  305. * So when the physical inactive list of a leaf cgroup ages,
  306. * the virtual inactive lists of all its parents, including
  307. * the root cgroup's, age as well.
  308. */
  309. do {
  310. atomic_long_add(nr_pages, &lruvec->nonresident_age);
  311. } while ((lruvec = parent_lruvec(lruvec)));
  312. }
  313. /**
  314. * workingset_eviction - note the eviction of a folio from memory
  315. * @target_memcg: the cgroup that is causing the reclaim
  316. * @folio: the folio being evicted
  317. *
  318. * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
  319. * of the evicted @folio so that a later refault can be detected.
  320. */
  321. void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
  322. {
  323. struct pglist_data *pgdat = folio_pgdat(folio);
  324. unsigned long eviction;
  325. struct lruvec *lruvec;
  326. int memcgid;
  327. /* Folio is fully exclusive and pins folio's memory cgroup pointer */
  328. VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
  329. VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
  330. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  331. if (lru_gen_enabled())
  332. return lru_gen_eviction(folio);
  333. lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
  334. /* XXX: target_memcg can be NULL, go through lruvec */
  335. memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
  336. eviction = atomic_long_read(&lruvec->nonresident_age);
  337. eviction >>= bucket_order;
  338. workingset_age_nonresident(lruvec, folio_nr_pages(folio));
  339. return pack_shadow(memcgid, pgdat, eviction,
  340. folio_test_workingset(folio));
  341. }
  342. /**
  343. * workingset_refault - Evaluate the refault of a previously evicted folio.
  344. * @folio: The freshly allocated replacement folio.
  345. * @shadow: Shadow entry of the evicted folio.
  346. *
  347. * Calculates and evaluates the refault distance of the previously
  348. * evicted folio in the context of the node and the memcg whose memory
  349. * pressure caused the eviction.
  350. */
  351. void workingset_refault(struct folio *folio, void *shadow)
  352. {
  353. bool file = folio_is_file_lru(folio);
  354. struct mem_cgroup *eviction_memcg;
  355. struct lruvec *eviction_lruvec;
  356. unsigned long refault_distance;
  357. unsigned long workingset_size;
  358. struct pglist_data *pgdat;
  359. struct mem_cgroup *memcg;
  360. unsigned long eviction;
  361. struct lruvec *lruvec;
  362. unsigned long refault;
  363. bool workingset;
  364. int memcgid;
  365. long nr;
  366. if (lru_gen_enabled()) {
  367. lru_gen_refault(folio, shadow);
  368. return;
  369. }
  370. unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
  371. eviction <<= bucket_order;
  372. rcu_read_lock();
  373. /*
  374. * Look up the memcg associated with the stored ID. It might
  375. * have been deleted since the folio's eviction.
  376. *
  377. * Note that in rare events the ID could have been recycled
  378. * for a new cgroup that refaults a shared folio. This is
  379. * impossible to tell from the available data. However, this
  380. * should be a rare and limited disturbance, and activations
  381. * are always speculative anyway. Ultimately, it's the aging
  382. * algorithm's job to shake out the minimum access frequency
  383. * for the active cache.
  384. *
  385. * XXX: On !CONFIG_MEMCG, this will always return NULL; it
  386. * would be better if the root_mem_cgroup existed in all
  387. * configurations instead.
  388. */
  389. eviction_memcg = mem_cgroup_from_id(memcgid);
  390. if (!mem_cgroup_disabled() && !eviction_memcg)
  391. goto out;
  392. eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
  393. refault = atomic_long_read(&eviction_lruvec->nonresident_age);
  394. /*
  395. * Calculate the refault distance
  396. *
  397. * The unsigned subtraction here gives an accurate distance
  398. * across nonresident_age overflows in most cases. There is a
  399. * special case: usually, shadow entries have a short lifetime
  400. * and are either refaulted or reclaimed along with the inode
  401. * before they get too old. But it is not impossible for the
  402. * nonresident_age to lap a shadow entry in the field, which
  403. * can then result in a false small refault distance, leading
  404. * to a false activation should this old entry actually
  405. * refault again. However, earlier kernels used to deactivate
  406. * unconditionally with *every* reclaim invocation for the
  407. * longest time, so the occasional inappropriate activation
  408. * leading to pressure on the active list is not a problem.
  409. */
  410. refault_distance = (refault - eviction) & EVICTION_MASK;
  411. /*
  412. * The activation decision for this folio is made at the level
  413. * where the eviction occurred, as that is where the LRU order
  414. * during folio reclaim is being determined.
  415. *
  416. * However, the cgroup that will own the folio is the one that
  417. * is actually experiencing the refault event.
  418. */
  419. nr = folio_nr_pages(folio);
  420. memcg = folio_memcg(folio);
  421. lruvec = mem_cgroup_lruvec(memcg, pgdat);
  422. mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
  423. mem_cgroup_flush_stats_delayed();
  424. /*
  425. * Compare the distance to the existing workingset size. We
  426. * don't activate pages that couldn't stay resident even if
  427. * all the memory was available to the workingset. Whether
  428. * workingset competition needs to consider anon or not depends
  429. * on having swap.
  430. */
  431. workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
  432. if (!file) {
  433. workingset_size += lruvec_page_state(eviction_lruvec,
  434. NR_INACTIVE_FILE);
  435. }
  436. if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
  437. workingset_size += lruvec_page_state(eviction_lruvec,
  438. NR_ACTIVE_ANON);
  439. if (file) {
  440. workingset_size += lruvec_page_state(eviction_lruvec,
  441. NR_INACTIVE_ANON);
  442. }
  443. }
  444. if (refault_distance > workingset_size)
  445. goto out;
  446. folio_set_active(folio);
  447. workingset_age_nonresident(lruvec, nr);
  448. mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
  449. /* Folio was active prior to eviction */
  450. if (workingset) {
  451. folio_set_workingset(folio);
  452. /* XXX: Move to lru_cache_add() when it supports new vs putback */
  453. lru_note_cost_folio(folio);
  454. mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
  455. }
  456. out:
  457. rcu_read_unlock();
  458. }
  459. /**
  460. * workingset_activation - note a page activation
  461. * @folio: Folio that is being activated.
  462. */
  463. void workingset_activation(struct folio *folio)
  464. {
  465. struct mem_cgroup *memcg;
  466. rcu_read_lock();
  467. /*
  468. * Filter non-memcg pages here, e.g. unmap can call
  469. * mark_page_accessed() on VDSO pages.
  470. *
  471. * XXX: See workingset_refault() - this should return
  472. * root_mem_cgroup even for !CONFIG_MEMCG.
  473. */
  474. memcg = folio_memcg_rcu(folio);
  475. if (!mem_cgroup_disabled() && !memcg)
  476. goto out;
  477. workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
  478. out:
  479. rcu_read_unlock();
  480. }
  481. /*
  482. * Shadow entries reflect the share of the working set that does not
  483. * fit into memory, so their number depends on the access pattern of
  484. * the workload. In most cases, they will refault or get reclaimed
  485. * along with the inode, but a (malicious) workload that streams
  486. * through files with a total size several times that of available
  487. * memory, while preventing the inodes from being reclaimed, can
  488. * create excessive amounts of shadow nodes. To keep a lid on this,
  489. * track shadow nodes and reclaim them when they grow way past the
  490. * point where they would still be useful.
  491. */
  492. struct list_lru shadow_nodes;
  493. void workingset_update_node(struct xa_node *node)
  494. {
  495. struct address_space *mapping;
  496. /*
  497. * Track non-empty nodes that contain only shadow entries;
  498. * unlink those that contain pages or are being freed.
  499. *
  500. * Avoid acquiring the list_lru lock when the nodes are
  501. * already where they should be. The list_empty() test is safe
  502. * as node->private_list is protected by the i_pages lock.
  503. */
  504. mapping = container_of(node->array, struct address_space, i_pages);
  505. lockdep_assert_held(&mapping->i_pages.xa_lock);
  506. if (node->count && node->count == node->nr_values) {
  507. if (list_empty(&node->private_list)) {
  508. list_lru_add(&shadow_nodes, &node->private_list);
  509. __inc_lruvec_kmem_state(node, WORKINGSET_NODES);
  510. }
  511. } else {
  512. if (!list_empty(&node->private_list)) {
  513. list_lru_del(&shadow_nodes, &node->private_list);
  514. __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
  515. }
  516. }
  517. }
  518. static unsigned long count_shadow_nodes(struct shrinker *shrinker,
  519. struct shrink_control *sc)
  520. {
  521. unsigned long max_nodes;
  522. unsigned long nodes;
  523. unsigned long pages;
  524. nodes = list_lru_shrink_count(&shadow_nodes, sc);
  525. if (!nodes)
  526. return SHRINK_EMPTY;
  527. /*
  528. * Approximate a reasonable limit for the nodes
  529. * containing shadow entries. We don't need to keep more
  530. * shadow entries than possible pages on the active list,
  531. * since refault distances bigger than that are dismissed.
  532. *
  533. * The size of the active list converges toward 100% of
  534. * overall page cache as memory grows, with only a tiny
  535. * inactive list. Assume the total cache size for that.
  536. *
  537. * Nodes might be sparsely populated, with only one shadow
  538. * entry in the extreme case. Obviously, we cannot keep one
  539. * node for every eligible shadow entry, so compromise on a
  540. * worst-case density of 1/8th. Below that, not all eligible
  541. * refaults can be detected anymore.
  542. *
  543. * On 64-bit with 7 xa_nodes per page and 64 slots
  544. * each, this will reclaim shadow entries when they consume
  545. * ~1.8% of available memory:
  546. *
  547. * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
  548. */
  549. #ifdef CONFIG_MEMCG
  550. if (sc->memcg) {
  551. struct lruvec *lruvec;
  552. int i;
  553. lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
  554. for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
  555. pages += lruvec_page_state_local(lruvec,
  556. NR_LRU_BASE + i);
  557. pages += lruvec_page_state_local(
  558. lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
  559. pages += lruvec_page_state_local(
  560. lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
  561. } else
  562. #endif
  563. pages = node_present_pages(sc->nid);
  564. max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
  565. if (nodes <= max_nodes)
  566. return 0;
  567. return nodes - max_nodes;
  568. }
  569. static enum lru_status shadow_lru_isolate(struct list_head *item,
  570. struct list_lru_one *lru,
  571. spinlock_t *lru_lock,
  572. void *arg) __must_hold(lru_lock)
  573. {
  574. struct xa_node *node = container_of(item, struct xa_node, private_list);
  575. struct address_space *mapping;
  576. int ret;
  577. /*
  578. * Page cache insertions and deletions synchronously maintain
  579. * the shadow node LRU under the i_pages lock and the
  580. * lru_lock. Because the page cache tree is emptied before
  581. * the inode can be destroyed, holding the lru_lock pins any
  582. * address_space that has nodes on the LRU.
  583. *
  584. * We can then safely transition to the i_pages lock to
  585. * pin only the address_space of the particular node we want
  586. * to reclaim, take the node off-LRU, and drop the lru_lock.
  587. */
  588. mapping = container_of(node->array, struct address_space, i_pages);
  589. /* Coming from the list, invert the lock order */
  590. if (!xa_trylock(&mapping->i_pages)) {
  591. spin_unlock_irq(lru_lock);
  592. ret = LRU_RETRY;
  593. goto out;
  594. }
  595. if (!spin_trylock(&mapping->host->i_lock)) {
  596. xa_unlock(&mapping->i_pages);
  597. spin_unlock_irq(lru_lock);
  598. ret = LRU_RETRY;
  599. goto out;
  600. }
  601. list_lru_isolate(lru, item);
  602. __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
  603. spin_unlock(lru_lock);
  604. /*
  605. * The nodes should only contain one or more shadow entries,
  606. * no pages, so we expect to be able to remove them all and
  607. * delete and free the empty node afterwards.
  608. */
  609. if (WARN_ON_ONCE(!node->nr_values))
  610. goto out_invalid;
  611. if (WARN_ON_ONCE(node->count != node->nr_values))
  612. goto out_invalid;
  613. xa_delete_node(node, workingset_update_node);
  614. __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
  615. out_invalid:
  616. xa_unlock_irq(&mapping->i_pages);
  617. if (mapping_shrinkable(mapping))
  618. inode_add_lru(mapping->host);
  619. spin_unlock(&mapping->host->i_lock);
  620. ret = LRU_REMOVED_RETRY;
  621. out:
  622. cond_resched();
  623. spin_lock_irq(lru_lock);
  624. return ret;
  625. }
  626. static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
  627. struct shrink_control *sc)
  628. {
  629. /* list_lru lock nests inside the IRQ-safe i_pages lock */
  630. return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
  631. NULL);
  632. }
  633. static struct shrinker workingset_shadow_shrinker = {
  634. .count_objects = count_shadow_nodes,
  635. .scan_objects = scan_shadow_nodes,
  636. .seeks = 0, /* ->count reports only fully expendable nodes */
  637. .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
  638. };
  639. /*
  640. * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
  641. * i_pages lock.
  642. */
  643. static struct lock_class_key shadow_nodes_key;
  644. static int __init workingset_init(void)
  645. {
  646. unsigned int timestamp_bits;
  647. unsigned int max_order;
  648. int ret;
  649. BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
  650. /*
  651. * Calculate the eviction bucket size to cover the longest
  652. * actionable refault distance, which is currently half of
  653. * memory (totalram_pages/2). However, memory hotplug may add
  654. * some more pages at runtime, so keep working with up to
  655. * double the initial memory by using totalram_pages as-is.
  656. */
  657. timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
  658. max_order = fls_long(totalram_pages() - 1);
  659. if (max_order > timestamp_bits)
  660. bucket_order = max_order - timestamp_bits;
  661. pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
  662. timestamp_bits, max_order, bucket_order);
  663. ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
  664. if (ret)
  665. goto err;
  666. ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
  667. &workingset_shadow_shrinker);
  668. if (ret)
  669. goto err_list_lru;
  670. register_shrinker_prepared(&workingset_shadow_shrinker);
  671. return 0;
  672. err_list_lru:
  673. free_prealloced_shrinker(&workingset_shadow_shrinker);
  674. err:
  675. return ret;
  676. }
  677. module_init(workingset_init);