/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or any other
 * ram- or swap-backed folio.
 */
static inline int folio_is_file_lru(struct folio *folio)
{
	return !folio_test_swapbacked(folio);
}

static inline int page_is_file_lru(struct page *page)
{
	return folio_is_file_lru(page_folio(page));
}

static __always_inline void __update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				long nr_pages)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lockdep_assert_held(&lruvec->lru_lock);
	WARN_ON_ONCE(nr_pages != (int)nr_pages);

	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
	__mod_zone_page_state(&pgdat->node_zones[zid],
				NR_ZONE_LRU_BASE + lru, nr_pages);
}

static __always_inline void update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				long nr_pages)
{
	__update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
 * @folio: The folio that was on lru and now has a zero reference.
 */
static __always_inline void __folio_clear_lru_flags(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);

	__folio_clear_lru(folio);

	/* this shouldn't happen, so leave the flags to bad_page() */
	if (folio_test_active(folio) && folio_test_unevictable(folio))
		return;

	__folio_clear_active(folio);
	__folio_clear_unevictable(folio);
}

/**
 * folio_lru_list - Which LRU list should a folio be on?
 * @folio: The folio to test.
 *
 * Return: The LRU list a folio should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list folio_lru_list(struct folio *folio)
{
	enum lru_list lru;

	VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);

	if (folio_test_unevictable(folio))
		return LRU_UNEVICTABLE;

	lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
	if (folio_test_active(folio))
		lru += LRU_ACTIVE;

	return lru;
}
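
/*
 * Illustrative note (not part of the upstream header): with the standard
 * enum lru_list layout, the arithmetic above maps, for example, an active
 * file-backed folio to LRU_ACTIVE_FILE (i.e. LRU_INACTIVE_FILE + LRU_ACTIVE)
 * and an inactive anonymous folio to LRU_INACTIVE_ANON, while an unevictable
 * folio always resolves to LRU_UNEVICTABLE regardless of PG_active.
 */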
#ifdef CONFIG_LRU_GEN

#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);

	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
}
#else
static inline bool lru_gen_enabled(void)
{
	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);

	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
#endif

static inline bool lru_gen_in_fault(void)
{
	return current->in_lru_fault;
}

static inline int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}
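
/*
 * Illustrative note (not part of the upstream header): the sequence counter
 * only grows, while the generation number wraps. Assuming the usual
 * MAX_NR_GENS of 4, seq values 0..7 map to gens 0, 1, 2, 3, 0, 1, 2, 3, so a
 * generation is identified by the low bits of the sequence it was created
 * from.
 */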
static inline int lru_hist_from_seq(unsigned long seq)
{
	return seq % NR_HIST_GENS;
}

static inline int lru_tier_from_refs(int refs)
{
	VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));

	/* see the comment in folio_lru_refs() */
	return order_base_2(refs + 1);
}
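
/*
 * Illustrative note (not part of the upstream header): order_base_2() is a
 * ceiling log2, so the tiers grow exponentially in width: refs 0 maps to
 * tier 0, refs 1 to tier 1, refs 2-3 to tier 2, refs 4-7 to tier 3, and so
 * on, bounded by BIT(LRU_REFS_WIDTH) per the warning above.
 */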
static inline int folio_lru_refs(struct folio *folio)
{
	unsigned long flags = READ_ONCE(folio->flags);
	bool workingset = flags & BIT(PG_workingset);

	/*
	 * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
	 * total number of accesses is N>1, since N=0,1 both map to the first
	 * tier. lru_tier_from_refs() will account for this off-by-one. Also see
	 * the comment on MAX_NR_TIERS.
	 */
	return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}

static inline int folio_lru_gen(struct folio *folio)
{
	unsigned long flags = READ_ONCE(folio->flags);

	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
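
/*
 * Illustrative note (not part of the upstream header): the generation is
 * stored in folio->flags biased by one (see lru_gen_add_folio(), which writes
 * gen + 1 into the LRU_GEN_MASK field), so a cleared field reads back here as
 * -1, meaning the folio is not on a multi-gen LRU list.
 */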
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
	unsigned long max_seq = lruvec->lrugen.max_seq;

	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

	/* see the comment on MIN_NR_GENS */
	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
				       int old_gen, int new_gen)
{
	int type = folio_is_file_lru(folio);
	int zone = folio_zonenum(folio);
	int delta = folio_nr_pages(folio);
	enum lru_list lru = type * LRU_INACTIVE_FILE;
	struct lru_gen_folio *lrugen = &lruvec->lrugen;

	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
	VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

	if (old_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
			   lrugen->nr_pages[old_gen][type][zone] - delta);
	if (new_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
			   lrugen->nr_pages[new_gen][type][zone] + delta);

	/* addition */
	if (old_gen < 0) {
		if (lru_gen_is_active(lruvec, new_gen))
			lru += LRU_ACTIVE;
		__update_lru_size(lruvec, lru, zone, delta);
		return;
	}

	/* deletion */
	if (new_gen < 0) {
		if (lru_gen_is_active(lruvec, old_gen))
			lru += LRU_ACTIVE;
		__update_lru_size(lruvec, lru, zone, -delta);
		return;
	}

	/* promotion */
	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
		__update_lru_size(lruvec, lru, zone, -delta);
		__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
	}

	/* demotion requires isolation, e.g., lru_deactivate_fn() */
	VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	unsigned long seq;
	unsigned long flags;
	int gen = folio_lru_gen(folio);
	int type = folio_is_file_lru(folio);
	int zone = folio_zonenum(folio);
	struct lru_gen_folio *lrugen = &lruvec->lrugen;

	VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);

	if (folio_test_unevictable(folio) || !lrugen->enabled)
		return false;
	/*
	 * There are three common cases for this page:
	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
	 *    migrated, add it to the youngest generation.
	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
	 *    not in swapcache or a dirty page pending writeback, add it to the
	 *    second oldest generation.
	 * 3. Everything else (clean, cold) is added to the oldest generation.
	 */
	if (folio_test_active(folio))
		seq = lrugen->max_seq;
	else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
		 (folio_test_reclaim(folio) &&
		  (folio_test_dirty(folio) || folio_test_writeback(folio))))
		seq = lrugen->min_seq[type] + 1;
	else
		seq = lrugen->min_seq[type];

	gen = lru_gen_from_seq(seq);
	flags = (gen + 1UL) << LRU_GEN_PGOFF;
	/* see the comment on MIN_NR_GENS about PG_active */
	set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);

	lru_gen_update_size(lruvec, folio, -1, gen);
	/* for folio_rotate_reclaimable() */
	if (reclaiming)
		list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
	else
		list_add(&folio->lru, &lrugen->folios[gen][type][zone]);

	return true;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	unsigned long flags;
	int gen = folio_lru_gen(folio);

	if (gen < 0)
		return false;

	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);

	/* for folio_migrate_flags() */
	flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
	flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
	gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

	lru_gen_update_size(lruvec, folio, gen, -1);
	list_del(&folio->lru);

	return true;
}

#else /* !CONFIG_LRU_GEN */

static inline bool lru_gen_enabled(void)
{
	return false;
}

static inline bool lru_gen_in_fault(void)
{
	return false;
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	return false;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	if (lru_gen_add_folio(lruvec, folio, false))
		return;

	update_lru_size(lruvec, lru, folio_zonenum(folio),
			folio_nr_pages(folio));
	if (lru != LRU_UNEVICTABLE)
		list_add(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	lruvec_add_folio(lruvec, page_folio(page));
}

static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	if (lru_gen_add_folio(lruvec, folio, true))
		return;

	update_lru_size(lruvec, lru, folio_zonenum(folio),
			folio_nr_pages(folio));
	/* This is not expected to be used on LRU_UNEVICTABLE */
	list_add_tail(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
	enum lru_list lru = folio_lru_list(folio);

	if (lru_gen_del_folio(lruvec, folio, false))
		return;

	if (lru != LRU_UNEVICTABLE)
		list_del(&folio->lru);
	update_lru_size(lruvec, lru, folio_zonenum(folio),
			-folio_nr_pages(folio));
}

static __always_inline void del_page_from_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	lruvec_del_folio(lruvec, page_folio(page));
}
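
/*
 * Illustrative sketch (not part of the upstream header): callers are expected
 * to hold lruvec->lru_lock around these helpers, as asserted in
 * __update_lru_size(). A minimal, roughly mm/swap.c-style caller might look
 * like the following, assuming the folio reference is already held:
 *
 *	struct lruvec *lruvec = folio_lruvec_lock_irq(folio);
 *
 *	lruvec_add_folio(lruvec, folio);
 *	folio_set_lru(folio);
 *	unlock_page_lruvec_irq(lruvec);
 *
 * folio_lruvec_lock_irq() and unlock_page_lruvec_irq() are the memcg-aware
 * lock helpers; the exact ordering of setting PG_lru vs. the list insertion
 * follows the callers in mm/swap.c rather than anything mandated here.
 */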
#ifdef CONFIG_ANON_VMA_NAME
/*
 * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
 * either keep holding the lock while using the returned pointer or it should
 * raise anon_vma_name refcount before releasing the lock.
 */
extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
extern void anon_vma_name_free(struct kref *kref);

/* mmap_lock should be read-locked */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
	if (anon_name)
		kref_get(&anon_name->kref);
}

static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
	if (anon_name)
		kref_put(&anon_name->kref, anon_vma_name_free);
}

static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
	/* Prevent anon_name refcount saturation early on */
	if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
		anon_vma_name_get(anon_name);
		return anon_name;
	}
	return anon_vma_name_alloc(anon_name->name);
}

static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma)
{
	struct anon_vma_name *anon_name = anon_vma_name(orig_vma);

	if (anon_name)
		new_vma->anon_name = anon_vma_name_reuse(anon_name);
}

static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
	/*
	 * Not using anon_vma_name because it generates a warning if mmap_lock
	 * is not held, which might be the case here.
	 */
	if (!vma->vm_file)
		anon_vma_name_put(vma->anon_name);
}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
				    struct anon_vma_name *anon_name2)
{
	if (anon_name1 == anon_name2)
		return true;

	return anon_name1 && anon_name2 &&
	       !strcmp(anon_name1->name, anon_name2->name);
}
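
/*
 * Illustrative note (not part of the upstream header): these names normally
 * originate from userspace via prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 * start, len, (unsigned long)name), which attaches a refcounted
 * anon_vma_name to the matching anonymous VMAs; the helpers above handle
 * sharing and comparison of those names during VMA duplication and merging.
 */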
#else /* CONFIG_ANON_VMA_NAME */
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	return NULL;
}

static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	return NULL;
}

static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
				    struct anon_vma_name *anon_name2)
{
	return true;
}

#endif /* CONFIG_ANON_VMA_NAME */

static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
	atomic_set(&mm->tlb_flush_pending, 0);
}

static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
	atomic_inc(&mm->tlb_flush_pending);
	/*
	 * The only time this value is relevant is when there are indeed pages
	 * to flush. And we'll only flush pages after changing them, which
	 * requires the PTL.
	 *
	 * So the ordering here is:
	 *
	 *	atomic_inc(&mm->tlb_flush_pending);
	 *	spin_lock(&ptl);
	 *	...
	 *	set_pte_at();
	 *	spin_unlock(&ptl);
	 *
	 *	spin_lock(&ptl)
	 *	mm_tlb_flush_pending();
	 *	....
	 *	spin_unlock(&ptl);
	 *
	 *	flush_tlb_range();
	 *	atomic_dec(&mm->tlb_flush_pending);
	 *
	 * Where the increment is constrained by the PTL unlock, it thus
	 * ensures that the increment is visible if the PTE modification is
	 * visible. After all, if there is no PTE modification, nobody cares
	 * about TLB flushes either.
	 *
	 * This very much relies on users (mm_tlb_flush_pending() and
	 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
	 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
	 * locks (PPC) the unlock of one doesn't order against the lock of
	 * another PTL.
	 *
	 * The decrement is ordered by the flush_tlb_range(), such that
	 * mm_tlb_flush_pending() will not return false unless all flushes have
	 * completed.
	 */
}

static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
	/*
	 * See inc_tlb_flush_pending().
	 *
	 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
	 * not order against TLB invalidate completion, which is what we need.
	 *
	 * Therefore we must rely on tlb_flush_*() to guarantee order.
	 */
	atomic_dec(&mm->tlb_flush_pending);
}
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
	/*
	 * Must be called after having acquired the PTL; orders against that
	 * PTL's release and therefore ensures that if we observe the modified
	 * PTE we must also observe the increment from inc_tlb_flush_pending().
	 *
	 * That is, it only guarantees to return true if there is a flush
	 * pending for _this_ PTL.
	 */
	return atomic_read(&mm->tlb_flush_pending);
}
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
	/*
	 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
	 * for which there is a TLB flush pending in order to guarantee
	 * we've seen both that PTE modification and the increment.
	 *
	 * (no requirement on actually still holding the PTL, that is irrelevant)
	 */
	return atomic_read(&mm->tlb_flush_pending) > 1;
}
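
/*
 * Illustrative sketch (not part of the upstream header): a hypothetical
 * unmap-style path using the counter above would bracket its PTE changes and
 * TLB flush roughly as follows (the PTL and flush calls are the real APIs;
 * the shape of the path is only an example):
 *
 *	inc_tlb_flush_pending(mm);
 *	spin_lock(ptl);
 *	... clear or modify the PTEs in [start, end) ...
 *	spin_unlock(ptl);
 *	flush_tlb_range(vma, start, end);
 *	dec_tlb_flush_pending(mm);
 *
 * A concurrent reader holding the same PTL can then use
 * mm_tlb_flush_pending(mm) to decide whether it must assume stale TLB
 * entries still exist for the PTEs it just observed.
 */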
/*
 * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
 * replace a none pte. NOTE! This should only be called when *pte is already
 * cleared so we will never accidentally replace something valuable. Meanwhile
 * a none pte also means we are not demoting the pte, so a TLB flush is not
 * needed. E.g., when the pte was cleared, the caller should have taken care
 * of the TLB flush.
 *
 * Must be called with the pgtable lock held so that no other thread will see
 * the none pte; if one does see it, it will fault and serialize at the
 * pgtable lock.
 *
 * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled.
 */
static inline void
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *pte, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	bool arm_uffd_pte = false;

	/* The current status of the pte should be "cleared" before calling */
	WARN_ON_ONCE(!pte_none(*pte));

	if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
		return;

	/* A uffd-wp wr-protected normal pte */
	if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
		arm_uffd_pte = true;

	/*
	 * A uffd-wp wr-protected swap pte. Note: this should even cover an
	 * existing pte marker with uffd-wp bit set.
	 */
	if (unlikely(pte_swp_uffd_wp_any(pteval)))
		arm_uffd_pte = true;

	if (unlikely(arm_uffd_pte))
		set_pte_at(vma->vm_mm, addr, pte,
			   make_pte_marker(PTE_MARKER_UFFD_WP));
#endif
}
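
/*
 * Illustrative sketch (not part of the upstream header): a hypothetical
 * caller clearing a PTE while preserving its uffd-wp state might, under the
 * page table lock, do something like:
 *
 *	pte_t old = ptep_get_and_clear(vma->vm_mm, addr, pte);
 *	... tear down whatever the old PTE mapped ...
 *	pte_install_uffd_wp_if_needed(vma, addr, pte, old);
 *
 * passing the old PTE value so that a uffd-wp marker is re-armed in place of
 * the now-none PTE when the VMA is registered for write-protect tracking.
 */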
static inline bool vma_has_recency(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
		return false;

	if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
		return false;

	return true;
}

#endif