pagewalk.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}
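
/*
 * Call ->pte_entry() for every PTE covering [addr, end). The caller has
 * already mapped the PTE page and, where required, taken the PTE lock.
 */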
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}
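
/*
 * Map the PTE page for this PMD entry and walk it with walk_pte_range_inner().
 * A ->no_vma walk (e.g. kernel or firmware page tables) runs without the PTE
 * lock; otherwise the range is walked under pte_offset_map_lock().
 */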
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}
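
/*
 * Walk a huge page directory (CONFIG_ARCH_HAS_HUGEPD, e.g. on powerpc):
 * each huge PTE in [addr, end) is passed to ->pte_entry() with
 * mm->page_table_lock held. Without a ->pte_entry() callback, or if @addr
 * is not aligned to the huge page size, there is nothing to do.
 */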
#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	int shift = hugepd_shift(*phpd);
	int page_size = 1 << shift;

	if (!ops->pte_entry)
		return 0;

	if (addr & (page_size - 1))
		return 0;

	for (;;) {
		pte_t *pte;

		spin_lock(&walk->mm->page_table_lock);
		pte = hugepte_offset(*phpd, addr, pdshift);
		err = ops->pte_entry(pte, addr, addr + page_size, walk);
		spin_unlock(&walk->mm->page_table_lock);

		if (err)
			break;
		if (addr >= end - page_size)
			break;
		addr += page_size;
	}
	return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	return 0;
}
#endif
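
/*
 * Walk the PMD entries below one PUD entry. Holes are reported through
 * ->pte_hole() at the (un-folded) PMD depth and ->pmd_entry() is called for
 * each populated entry; unless the callback set ACTION_CONTINUE, or there is
 * no ->pte_entry(), trans-huge PMDs are split before descending to the PTE
 * level.
 */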
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
		else
			err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
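
/*
 * Walk the PUD entries below one P4D entry. Mirrors walk_pmd_range():
 * report holes via ->pte_hole(), call ->pud_entry(), and split huge PUDs
 * before descending unless the callback asked to skip the subtree.
 */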
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (is_hugepd(__hugepd(pud_val(*pud))))
			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
		else
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(p4d_val(*p4d))))
			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}
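
/*
 * Top-level loop over the PGD entries, shared by the VMA-based and no-VMA
 * walks. walk->pgd lets a caller walk a page table other than walk->mm->pgd.
 */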
static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(pgd_val(*pgd))))
			err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
		else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
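
/*
 * hugetlb VMAs are walked at hstate granularity: each huge PTE is passed to
 * ->hugetlb_entry(), and missing entries are reported through ->pte_hole().
 */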
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling hole ranges and they don't
	 * want to just ignore any single address range. Such users certainly
	 * define their ->pte_hole() callbacks, so let's delegate them to handle
	 * vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}
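
/*
 * Walk a single VMA (or the part of it inside [start, end)): run
 * ->pre_vma()/->post_vma() around either the hugetlb walk or the regular
 * page-table walk.
 */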
static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (ops->post_vma)
		ops->post_vma(walk);

	return err;
}
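
/*
 * Lock checks for a walk: the mmap_lock must be held as required by
 * ops->walk_lock, and for write-type walks the VMA is write-locked (or
 * asserted to already be) when CONFIG_PER_VMA_LOCK is enabled.
 */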
static inline void process_mm_walk_lock(struct mm_struct *mm,
					enum page_walk_lock walk_lock)
{
	if (walk_lock == PGWALK_RDLOCK)
		mmap_assert_locked(mm);
	else
		mmap_assert_write_locked(mm);
}

static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	switch (walk_lock) {
	case PGWALK_WRLOCK:
		vma_start_write(vma);
		break;
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);
		break;
	case PGWALK_RDLOCK:
		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
		break;
	}
#endif
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up one of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as below:
 *
 *  - 0  : the current entry was handled successfully; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : the current entry was handled successfully; return to the caller
 *         with a caller-specific value.
 *  - <0 : handling the current entry failed; return to the caller with the
 *         error code.
 *
 * Before starting to walk the page tables, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 * because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		    unsigned long end, const struct mm_walk_ops *ops,
		    void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.private = private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			process_vma_walk_lock(vma, ops->walk_lock);
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;

			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
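
/*
 * Usage sketch (illustrative only): one way a caller might count present
 * PTEs in a user address range with walk_page_range(). The names
 * count_pte_entry, count_ops and count_present_ptes are hypothetical;
 * mm_walk_ops, PGWALK_RDLOCK and mmap_read_lock()/unlock() are the real
 * interfaces used above.
 */
#if 0	/* example only, not compiled */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(ptep_get(pte)))
		(*count)++;
	return 0;	/* 0 == keep walking */
}

static const struct mm_walk_ops count_ops = {
	.pte_entry	= count_pte_entry,
	.walk_lock	= PGWALK_RDLOCK,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	mmap_read_lock(mm);	/* PGWALK_RDLOCK expects mmap_lock held for read */
	walk_page_range(mm, start, end, &count_ops, &count);
	mmap_read_unlock(mm);
	return count;
}
#endif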

/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.pgd = pgd,
		.private = private,
		.no_vma = true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_write_locked(walk.mm);

	return walk_pgd_range(start, end, &walk);
}
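
/*
 * Usage sketch (illustrative only): a no-VMA walk, e.g. over the kernel page
 * tables in init_mm. pr_pte_entry, dump_ops and dump_kernel_range are
 * hypothetical names; walk_page_range_novma() expects the target mm's
 * mmap_lock to be held for write, as taken below.
 */
#if 0	/* example only, not compiled */
static int pr_pte_entry(pte_t *pte, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
{
	pr_info("pte at %lx: %s\n", addr,
		pte_present(ptep_get(pte)) ? "present" : "not present");
	return 0;
}

static const struct mm_walk_ops dump_ops = {
	.pte_entry = pr_pte_entry,
};

static void dump_kernel_range(unsigned long start, unsigned long end)
{
	mmap_write_lock(&init_mm);
	walk_page_range_novma(&init_mm, start, end, &dump_ops, NULL, NULL);
	mmap_write_unlock(&init_mm);
}
#endif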

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		  void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = vma->vm_mm,
		.vma = vma,
		.private = private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 * This function can't require that the struct mm_struct::mmap_lock is held,
 * since @mapping may be mapped by multiple processes. Instead
 * @mapping->i_mmap_rwsem must be held. This might have implications in the
 * callbacks, and it's up to the caller to ensure that the
 * struct mm_struct::mmap_lock is not needed.
 *
 * Also this means that a caller can't rely on the struct
 * vm_area_struct::vm_flags to be constant across a call,
 * except for immutable flags. Callers requiring this shouldn't use
 * this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.private = private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
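
/*
 * Usage sketch (illustrative only): a caller could walk every mapping of a
 * file range roughly like this, taking the i_mmap lock that
 * walk_page_mapping() asserts. walk_file_range and my_ops are hypothetical;
 * i_mmap_lock_read()/i_mmap_unlock_read() are the real helpers.
 */
#if 0	/* example only, not compiled */
static void walk_file_range(struct address_space *mapping,
			    pgoff_t first_index, pgoff_t nr,
			    const struct mm_walk_ops *my_ops, void *private)
{
	i_mmap_lock_read(mapping);
	walk_page_mapping(mapping, first_index, nr, my_ops, private);
	i_mmap_unlock_read(mapping);
}
#endif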