123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577 |
- // SPDX-License-Identifier: GPL-2.0
- /*
- * HugeTLB Vmemmap Optimization (HVO)
- *
- * Copyright (c) 2020, ByteDance. All rights reserved.
- *
- * Author: Muchun Song <[email protected]>
- *
- * See Documentation/mm/vmemmap_dedup.rst
- */
- #define pr_fmt(fmt) "HugeTLB: " fmt
- #include <linux/pgtable.h>
- #include <linux/moduleparam.h>
- #include <linux/bootmem_info.h>
- #include <asm/pgalloc.h>
- #include <asm/tlbflush.h>
- #include "hugetlb_vmemmap.h"
- /**
- * struct vmemmap_remap_walk - walk vmemmap page table
- *
- * @remap_pte: called for each lowest-level entry (PTE).
- * @nr_walked: the number of walked pte.
- * @reuse_page: the page which is reused for the tail vmemmap pages.
- * @reuse_addr: the virtual address of the @reuse_page page.
- * @vmemmap_pages: the list head of the vmemmap pages that can be freed
- * or is mapped from.
- */
- struct vmemmap_remap_walk {
- void (*remap_pte)(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk);
- unsigned long nr_walked;
- struct page *reuse_page;
- unsigned long reuse_addr;
- struct list_head *vmemmap_pages;
- };
- static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
- {
- pmd_t __pmd;
- int i;
- unsigned long addr = start;
- struct page *head;
- pte_t *pgtable;
- spin_lock(&init_mm.page_table_lock);
- head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
- spin_unlock(&init_mm.page_table_lock);
- if (!head)
- return 0;
- pgtable = pte_alloc_one_kernel(&init_mm);
- if (!pgtable)
- return -ENOMEM;
- pmd_populate_kernel(&init_mm, &__pmd, pgtable);
- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
- pte_t entry, *pte;
- pgprot_t pgprot = PAGE_KERNEL;
- entry = mk_pte(head + i, pgprot);
- pte = pte_offset_kernel(&__pmd, addr);
- set_pte_at(&init_mm, addr, pte, entry);
- }
- spin_lock(&init_mm.page_table_lock);
- if (likely(pmd_leaf(*pmd))) {
- /*
- * Higher order allocations from buddy allocator must be able to
- * be treated as indepdenent small pages (as they can be freed
- * individually).
- */
- if (!PageReserved(head))
- split_page(head, get_order(PMD_SIZE));
- /* Make pte visible before pmd. See comment in pmd_install(). */
- smp_wmb();
- pmd_populate_kernel(&init_mm, pmd, pgtable);
- flush_tlb_kernel_range(start, start + PMD_SIZE);
- } else {
- pte_free_kernel(&init_mm, pgtable);
- }
- spin_unlock(&init_mm.page_table_lock);
- return 0;
- }
- static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
- {
- pte_t *pte = pte_offset_kernel(pmd, addr);
- /*
- * The reuse_page is found 'first' in table walk before we start
- * remapping (which is calling @walk->remap_pte).
- */
- if (!walk->reuse_page) {
- walk->reuse_page = pte_page(*pte);
- /*
- * Because the reuse address is part of the range that we are
- * walking, skip the reuse address range.
- */
- addr += PAGE_SIZE;
- pte++;
- walk->nr_walked++;
- }
- for (; addr != end; addr += PAGE_SIZE, pte++) {
- walk->remap_pte(pte, addr, walk);
- walk->nr_walked++;
- }
- }
- static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
- {
- pmd_t *pmd;
- unsigned long next;
- pmd = pmd_offset(pud, addr);
- do {
- int ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
- if (ret)
- return ret;
- next = pmd_addr_end(addr, end);
- vmemmap_pte_range(pmd, addr, next, walk);
- } while (pmd++, addr = next, addr != end);
- return 0;
- }
- static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
- {
- pud_t *pud;
- unsigned long next;
- pud = pud_offset(p4d, addr);
- do {
- int ret;
- next = pud_addr_end(addr, end);
- ret = vmemmap_pmd_range(pud, addr, next, walk);
- if (ret)
- return ret;
- } while (pud++, addr = next, addr != end);
- return 0;
- }
- static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
- {
- p4d_t *p4d;
- unsigned long next;
- p4d = p4d_offset(pgd, addr);
- do {
- int ret;
- next = p4d_addr_end(addr, end);
- ret = vmemmap_pud_range(p4d, addr, next, walk);
- if (ret)
- return ret;
- } while (p4d++, addr = next, addr != end);
- return 0;
- }
- static int vmemmap_remap_range(unsigned long start, unsigned long end,
- struct vmemmap_remap_walk *walk)
- {
- unsigned long addr = start;
- unsigned long next;
- pgd_t *pgd;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
- pgd = pgd_offset_k(addr);
- do {
- int ret;
- next = pgd_addr_end(addr, end);
- ret = vmemmap_p4d_range(pgd, addr, next, walk);
- if (ret)
- return ret;
- } while (pgd++, addr = next, addr != end);
- /*
- * We only change the mapping of the vmemmap virtual address range
- * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
- * belongs to the range.
- */
- flush_tlb_kernel_range(start + PAGE_SIZE, end);
- return 0;
- }
- /*
- * Free a vmemmap page. A vmemmap page can be allocated from the memblock
- * allocator or buddy allocator. If the PG_reserved flag is set, it means
- * that it allocated from the memblock allocator, just free it via the
- * free_bootmem_page(). Otherwise, use __free_page().
- */
- static inline void free_vmemmap_page(struct page *page)
- {
- if (PageReserved(page))
- free_bootmem_page(page);
- else
- __free_page(page);
- }
- /* Free a list of the vmemmap pages */
- static void free_vmemmap_page_list(struct list_head *list)
- {
- struct page *page, *next;
- list_for_each_entry_safe(page, next, list, lru) {
- list_del(&page->lru);
- free_vmemmap_page(page);
- }
- }
- static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
- {
- /*
- * Remap the tail pages as read-only to catch illegal write operation
- * to the tail pages.
- */
- pgprot_t pgprot = PAGE_KERNEL_RO;
- pte_t entry = mk_pte(walk->reuse_page, pgprot);
- struct page *page = pte_page(*pte);
- list_add_tail(&page->lru, walk->vmemmap_pages);
- set_pte_at(&init_mm, addr, pte, entry);
- }
- /*
- * How many struct page structs need to be reset. When we reuse the head
- * struct page, the special metadata (e.g. page->flags or page->mapping)
- * cannot copy to the tail struct page structs. The invalid value will be
- * checked in the free_tail_pages_check(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
- * structs.
- */
- #define NR_RESET_STRUCT_PAGE 3
- static inline void reset_struct_pages(struct page *start)
- {
- struct page *from = start + NR_RESET_STRUCT_PAGE;
- BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
- memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
- }
- static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
- {
- pgprot_t pgprot = PAGE_KERNEL;
- struct page *page;
- void *to;
- BUG_ON(pte_page(*pte) != walk->reuse_page);
- page = list_first_entry(walk->vmemmap_pages, struct page, lru);
- list_del(&page->lru);
- to = page_to_virt(page);
- copy_page(to, (void *)walk->reuse_addr);
- reset_struct_pages(to);
- /*
- * Makes sure that preceding stores to the page contents become visible
- * before the set_pte_at() write.
- */
- smp_wmb();
- set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
- }
- /**
- * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
- * to the page which @reuse is mapped to, then free vmemmap
- * which the range are mapped to.
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- *
- * Return: %0 on success, negative error code otherwise.
- */
- static int vmemmap_remap_free(unsigned long start, unsigned long end,
- unsigned long reuse)
- {
- int ret;
- LIST_HEAD(vmemmap_pages);
- struct vmemmap_remap_walk walk = {
- .remap_pte = vmemmap_remap_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- };
- /*
- * In order to make remapping routine most efficient for the huge pages,
- * the routine of vmemmap page table walking has the following rules
- * (see more details from the vmemmap_pte_range()):
- *
- * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
- * should be continuous.
- * - The @reuse address is part of the range [@reuse, @end) that we are
- * walking which is passed to vmemmap_remap_range().
- * - The @reuse address is the first in the complete range.
- *
- * So we need to make sure that @start and @reuse meet the above rules.
- */
- BUG_ON(start - reuse != PAGE_SIZE);
- mmap_read_lock(&init_mm);
- ret = vmemmap_remap_range(reuse, end, &walk);
- if (ret && walk.nr_walked) {
- end = reuse + walk.nr_walked * PAGE_SIZE;
- /*
- * vmemmap_pages contains pages from the previous
- * vmemmap_remap_range call which failed. These
- * are pages which were removed from the vmemmap.
- * They will be restored in the following call.
- */
- walk = (struct vmemmap_remap_walk) {
- .remap_pte = vmemmap_restore_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- };
- vmemmap_remap_range(reuse, end, &walk);
- }
- mmap_read_unlock(&init_mm);
- free_vmemmap_page_list(&vmemmap_pages);
- return ret;
- }
- static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
- gfp_t gfp_mask, struct list_head *list)
- {
- unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
- int nid = page_to_nid((struct page *)start);
- struct page *page, *next;
- while (nr_pages--) {
- page = alloc_pages_node(nid, gfp_mask, 0);
- if (!page)
- goto out;
- list_add_tail(&page->lru, list);
- }
- return 0;
- out:
- list_for_each_entry_safe(page, next, list, lru)
- __free_pages(page, 0);
- return -ENOMEM;
- }
- /**
- * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
- * to the page which is from the @vmemmap_pages
- * respectively.
- * @start: start address of the vmemmap virtual address range that we want
- * to remap.
- * @end: end address of the vmemmap virtual address range that we want to
- * remap.
- * @reuse: reuse address.
- * @gfp_mask: GFP flag for allocating vmemmap pages.
- *
- * Return: %0 on success, negative error code otherwise.
- */
- static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse, gfp_t gfp_mask)
- {
- LIST_HEAD(vmemmap_pages);
- struct vmemmap_remap_walk walk = {
- .remap_pte = vmemmap_restore_pte,
- .reuse_addr = reuse,
- .vmemmap_pages = &vmemmap_pages,
- };
- /* See the comment in the vmemmap_remap_free(). */
- BUG_ON(start - reuse != PAGE_SIZE);
- if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
- return -ENOMEM;
- mmap_read_lock(&init_mm);
- vmemmap_remap_range(reuse, end, &walk);
- mmap_read_unlock(&init_mm);
- return 0;
- }
- DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
- EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
- static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
- core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
- /**
- * hugetlb_vmemmap_restore - restore previously optimized (by
- * hugetlb_vmemmap_optimize()) vmemmap pages which
- * will be reallocated and remapped.
- * @h: struct hstate.
- * @head: the head page whose vmemmap pages will be restored.
- *
- * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
- * negative error code otherwise.
- */
- int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
- {
- int ret;
- unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
- unsigned long vmemmap_reuse;
- if (!HPageVmemmapOptimized(head))
- return 0;
- vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
- vmemmap_reuse = vmemmap_start;
- vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
- /*
- * The pages which the vmemmap virtual address range [@vmemmap_start,
- * @vmemmap_end) are mapped to are freed to the buddy allocator, and
- * the range is mapped to the page which @vmemmap_reuse is mapped to.
- * When a HugeTLB page is freed to the buddy allocator, previously
- * discarded vmemmap pages must be allocated and remapping.
- */
- ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
- GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
- if (!ret) {
- ClearHPageVmemmapOptimized(head);
- static_branch_dec(&hugetlb_optimize_vmemmap_key);
- }
- return ret;
- }
- /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
- static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
- {
- if (!READ_ONCE(vmemmap_optimize_enabled))
- return false;
- if (!hugetlb_vmemmap_optimizable(h))
- return false;
- if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
- pmd_t *pmdp, pmd;
- struct page *vmemmap_page;
- unsigned long vaddr = (unsigned long)head;
- /*
- * Only the vmemmap page's vmemmap page can be self-hosted.
- * Walking the page tables to find the backing page of the
- * vmemmap page.
- */
- pmdp = pmd_off_k(vaddr);
- /*
- * The READ_ONCE() is used to stabilize *pmdp in a register or
- * on the stack so that it will stop changing under the code.
- * The only concurrent operation where it can be changed is
- * split_vmemmap_huge_pmd() (*pmdp will be stable after this
- * operation).
- */
- pmd = READ_ONCE(*pmdp);
- if (pmd_leaf(pmd))
- vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
- else
- vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
- /*
- * Due to HugeTLB alignment requirements and the vmemmap pages
- * being at the start of the hotplugged memory region in
- * memory_hotplug.memmap_on_memory case. Checking any vmemmap
- * page's vmemmap page if it is marked as VmemmapSelfHosted is
- * sufficient.
- *
- * [ hotplugged memory ]
- * [ section ][...][ section ]
- * [ vmemmap ][ usable memory ]
- * ^ | | |
- * +---+ | |
- * ^ | |
- * +-------+ |
- * ^ |
- * +-------------------------------------------+
- */
- if (PageVmemmapSelfHosted(vmemmap_page))
- return false;
- }
- return true;
- }
- /**
- * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
- * @h: struct hstate.
- * @head: the head page whose vmemmap pages will be optimized.
- *
- * This function only tries to optimize @head's vmemmap pages and does not
- * guarantee that the optimization will succeed after it returns. The caller
- * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
- * have been optimized.
- */
- void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
- {
- unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
- unsigned long vmemmap_reuse;
- if (!vmemmap_should_optimize(h, head))
- return;
- static_branch_inc(&hugetlb_optimize_vmemmap_key);
- vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
- vmemmap_reuse = vmemmap_start;
- vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
- /*
- * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
- * to the page which @vmemmap_reuse is mapped to, then free the pages
- * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
- */
- if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
- static_branch_dec(&hugetlb_optimize_vmemmap_key);
- else
- SetHPageVmemmapOptimized(head);
- }
- static struct ctl_table hugetlb_vmemmap_sysctls[] = {
- {
- .procname = "hugetlb_optimize_vmemmap",
- .data = &vmemmap_optimize_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dobool,
- },
- { }
- };
- static int __init hugetlb_vmemmap_init(void)
- {
- /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
- BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
- if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
- const struct hstate *h;
- for_each_hstate(h) {
- if (hugetlb_vmemmap_optimizable(h)) {
- register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
- break;
- }
- }
- }
- return 0;
- }
- late_initcall(hugetlb_vmemmap_init);
|