hugetlb_vmemmap.c

// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <[email protected]>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
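/*
 * A rough illustration of the savings (assuming x86-64 with a 4 KiB base page
 * and a 64-byte struct page; other configurations differ): a 2 MiB HugeTLB
 * page is described by 512 struct pages, i.e. 8 vmemmap pages. HVO keeps one
 * vmemmap page and remaps the other seven to it, saving 28 KiB per huge page;
 * for a 1 GiB HugeTLB page the vmemmap shrinks from 16 MiB to a single page.
 */
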
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte: called for each lowest-level entry (PTE).
 * @nr_walked: the number of walked PTEs.
 * @reuse_page: the page which is reused for the tail vmemmap pages.
 * @reuse_addr: the virtual address of the @reuse_page page.
 * @vmemmap_pages: the list head of the vmemmap pages that can be freed
 *                 or are mapped from.
 */
struct vmemmap_remap_walk {
        void (*remap_pte)(pte_t *pte, unsigned long addr,
                          struct vmemmap_remap_walk *walk);
        unsigned long nr_walked;
        struct page *reuse_page;
        unsigned long reuse_addr;
        struct list_head *vmemmap_pages;
};

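/*
 * Split a huge PMD mapping of the vmemmap into a PTE-level page table that
 * maps the same underlying pages, so that individual vmemmap pages can be
 * remapped and freed later. Returns 0 if the PMD is not (or is no longer) a
 * leaf entry, or -ENOMEM if the new PTE page table cannot be allocated.
 */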
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        struct page *head;
        pte_t *pgtable;

        spin_lock(&init_mm.page_table_lock);
        head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
        spin_unlock(&init_mm.page_table_lock);

        if (!head)
                return 0;

        pgtable = pte_alloc_one_kernel(&init_mm);
        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(head + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from the buddy allocator must be
                 * able to be treated as independent small pages (as they can
                 * be freed individually).
                 */
                if (!PageReserved(head))
                        split_page(head, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
                              unsigned long end,
                              struct vmemmap_remap_walk *walk)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);

        /*
         * The reuse_page is found 'first' in the table walk before we start
         * remapping (i.e. before calling @walk->remap_pte).
         */
        if (!walk->reuse_page) {
                walk->reuse_page = pte_page(*pte);
                /*
                 * Because the reuse address is part of the range that we are
                 * walking, skip the reuse address range.
                 */
                addr += PAGE_SIZE;
                pte++;
                walk->nr_walked++;
        }

        for (; addr != end; addr += PAGE_SIZE, pte++) {
                walk->remap_pte(pte, addr, walk);
                walk->nr_walked++;
        }
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                int ret;

                ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
                if (ret)
                        return ret;

                next = pmd_addr_end(addr, end);
                vmemmap_pte_range(pmd, addr, next, walk);
        } while (pmd++, addr = next, addr != end);

        return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                int ret;

                next = pud_addr_end(addr, end);
                ret = vmemmap_pmd_range(pud, addr, next, walk);
                if (ret)
                        return ret;
        } while (pud++, addr = next, addr != end);

        return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;

        p4d = p4d_offset(pgd, addr);
        do {
                int ret;

                next = p4d_addr_end(addr, end);
                ret = vmemmap_pud_range(p4d, addr, next, walk);
                if (ret)
                        return ret;
        } while (p4d++, addr = next, addr != end);

        return 0;
}

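/*
 * Walk the kernel page tables for the vmemmap virtual range [@start, @end),
 * splitting huge PMD mappings as needed, and invoke @walk->remap_pte on each
 * PTE. The first walked page is recorded as @walk->reuse_page rather than
 * remapped; the TLB is flushed for the remapped part of the range on return.
 */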
static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        unsigned long addr = start;
        unsigned long next;
        pgd_t *pgd;

        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));

        pgd = pgd_offset_k(addr);
        do {
                int ret;

                next = pgd_addr_end(addr, end);
                ret = vmemmap_p4d_range(pgd, addr, next, walk);
                if (ret)
                        return ret;
        } while (pgd++, addr = next, addr != end);

        /*
         * We only change the mapping of the vmemmap virtual address range
         * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
         * belongs to the range.
         */
        flush_tlb_kernel_range(start + PAGE_SIZE, end);

        return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator, so free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page))
                free_bootmem_page(page);
        else
                __free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru) {
                list_del(&page->lru);
                free_vmemmap_page(page);
        }
}

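/*
 * remap_pte callback used when freeing vmemmap pages: redirect @pte to the
 * shared @walk->reuse_page and queue the page it previously mapped on
 * @walk->vmemmap_pages so it can be freed by the caller.
 */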
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        pte_t entry = mk_pte(walk->reuse_page, pgprot);
        struct page *page = pte_page(*pte);

        list_add_tail(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_pages_check(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 3 struct page
 * structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
        memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

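/*
 * remap_pte callback used when restoring vmemmap pages: take a page from
 * @walk->vmemmap_pages, copy the contents of the shared reuse page into it,
 * reset the leading struct pages so stale metadata is not carried over, and
 * point @pte at the new page with a writable mapping.
 */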
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(*pte) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        /*
         * Makes sure that preceding stores to the page contents become visible
         * before the set_pte_at() write.
         */
        smp_wmb();
        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start: start address of the vmemmap virtual address range that we want
 *         to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *       remap.
 * @reuse: reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                              unsigned long reuse)
{
        int ret;
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte = vmemmap_remap_pte,
                .reuse_addr = reuse,
                .vmemmap_pages = &vmemmap_pages,
        };

        /*
         * In order to make the remapping routine most efficient for huge pages,
         * the vmemmap page table walking routine has the following rules
         * (see more details in vmemmap_pte_range()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   should be contiguous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking which is passed to vmemmap_remap_range().
         * - The @reuse address is the first in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        mmap_read_lock(&init_mm);
        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed. These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte = vmemmap_restore_pte,
                        .reuse_addr = reuse,
                        .vmemmap_pages = &vmemmap_pages,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }
        mmap_read_unlock(&init_mm);

        free_vmemmap_page_list(&vmemmap_pages);

        return ret;
}

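/*
 * Allocate one page for every vmemmap page in [@start, @end) from the node
 * that currently backs @start. On allocation failure, every page allocated
 * so far is freed again and -ENOMEM is returned.
 */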
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   gfp_t gfp_mask, struct list_head *list)
{
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;

        while (nr_pages--) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add_tail(&page->lru, list);
        }

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_pages(page, 0);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to the pages which are from @vmemmap_pages,
 *                       respectively.
 * @start: start address of the vmemmap virtual address range that we want
 *         to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *       remap.
 * @reuse: reuse address.
 * @gfp_mask: GFP flag for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                               unsigned long reuse, gfp_t gfp_mask)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte = vmemmap_restore_pte,
                .reuse_addr = reuse,
                .vmemmap_pages = &vmemmap_pages,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
                return -ENOMEM;

        mmap_read_lock(&init_mm);
        vmemmap_remap_range(reuse, end, &walk);
        mmap_read_unlock(&init_mm);

        return 0;
}

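/*
 * hugetlb_optimize_vmemmap_key gates the HVO-specific code paths elsewhere in
 * the kernel: it is incremented for each HugeTLB page whose vmemmap gets
 * optimized and decremented when the vmemmap is restored (or the optimization
 * fails). The hugetlb_free_vmemmap boot parameter and the
 * vm.hugetlb_optimize_vmemmap sysctl below both toggle vmemmap_optimize_enabled.
 */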
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

/**
 * hugetlb_vmemmap_restore - restore previously optimized (by
 *                           hugetlb_vmemmap_optimize()) vmemmap pages which
 *                           will be reallocated and remapped.
 * @h: struct hstate.
 * @head: the head page whose vmemmap pages will be restored.
 *
 * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
        int ret;
        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!HPageVmemmapOptimized(head))
                return 0;

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) are mapped to are freed to the buddy allocator, and
         * the range is mapped to the page which @vmemmap_reuse is mapped to.
         * When a HugeTLB page is freed to the buddy allocator, previously
         * discarded vmemmap pages must be allocated and remapped.
         */
        ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
                                  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
        if (!ret) {
                ClearHPageVmemmapOptimized(head);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}

/* Return true if the vmemmap of a HugeTLB page can and should be optimized. */
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(h))
                return false;

        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
                pmd_t *pmdp, pmd;
                struct page *vmemmap_page;
                unsigned long vaddr = (unsigned long)head;

                /*
                 * Only the vmemmap page's vmemmap page can be self-hosted.
                 * Walk the page tables to find the backing page of the
                 * vmemmap page.
                 */
                pmdp = pmd_off_k(vaddr);
                /*
                 * The READ_ONCE() is used to stabilize *pmdp in a register or
                 * on the stack so that it will stop changing under the code.
                 * The only concurrent operation where it can be changed is
                 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
                 * operation).
                 */
                pmd = READ_ONCE(*pmdp);
                if (pmd_leaf(pmd))
                        vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
                else
                        vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
                /*
                 * Due to HugeTLB alignment requirements and the vmemmap pages
                 * being at the start of the hotplugged memory region in the
                 * memory_hotplug.memmap_on_memory case, checking whether any
                 * vmemmap page's vmemmap page is marked VmemmapSelfHosted is
                 * sufficient.
                 *
                 * [                  hotplugged memory                  ]
                 * [        section        ][...][        section        ]
                 * [ vmemmap ][              usable memory               ]
                 *   ^   |     |                                        |
                 *   +---+     |                                        |
                 *     ^       |                                        |
                 *     +-------+                                        |
                 *          ^                                           |
                 *          +-------------------------------------------+
                 */
                if (PageVmemmapSelfHosted(vmemmap_page))
                        return false;
        }

        return true;
}

/**
 * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
 * @h: struct hstate.
 * @head: the head page whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @head's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
 * have been optimized.
 */
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize(h, head))
                return;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to, then free the pages
         * which the range [@vmemmap_start, @vmemmap_end) is mapped to.
         */
        if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        else
                SetHPageVmemmapOptimized(head);
}

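/*
 * The vm.hugetlb_optimize_vmemmap sysctl toggles HVO at runtime. It is only
 * registered by hugetlb_vmemmap_init() when at least one hstate can be
 * optimized.
 */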
static struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname = "hugetlb_optimize_vmemmap",
                .data = &vmemmap_optimize_enabled,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dobool,
        },
        { }
};

static int __init hugetlb_vmemmap_init(void)
{
        /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
        BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);

        if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
                const struct hstate *h;

                for_each_hstate(h) {
                        if (hugetlb_vmemmap_optimizable(h)) {
                                register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                                break;
                        }
                }
        }
        return 0;
}
late_initcall(hugetlb_vmemmap_init);