// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/stop_machine.h>

#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
#include <asm/firmware.h>

#include <mm/mmu_decl.h>
#include <trace/events/thp.h>

#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
#warning Limited user VSID range means pagetable space is wasted
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * vmemmap is the starting address of the virtual address space where
 * struct pages are allocated for all possible PFNs present on the system
 * including holes and bad memory (hence sparse). These virtual struct
 * pages are stored in sequence in this virtual address space irrespective
 * of the fact whether the corresponding PFN is valid or not. This achieves
 * constant relationship between address of struct page and its PFN.
 *
 * During boot or memory hotplug operation when a new memory section is
 * added, physical memory allocation (including hash table bolting) will
 * be performed for the set of struct pages which are part of the memory
 * section. This saves memory by not allocating struct pages for PFNs
 * which are not valid.
 *
 *              ----------------------------------------------
 *              | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES |
 *              ----------------------------------------------
 *
 *          f000000000000000                   c000000000000000
 * vmemmap  +--------------+                   +--------------+
 *    +     |  page struct | +---------------> |  page struct |
 *    |     +--------------+                   +--------------+
 *    |     |  page struct | +---------------> |  page struct |
 *    |     +--------------+                   +--------------+
 *    |     |  page struct |    +------------> |  page struct |
 *    |     +--------------+    |              +--------------+
 *    |     |  page struct |    |   +--------> |  page struct |
 *    |     +--------------+    |   |          +--------------+
 *    |     |  page struct |    |   |
 *    |     +--------------+    |   |
 *    |     |  page struct |    |   |
 *    |     +--------------+    |   |
 *    |     |  page struct |    |   |
 *    |     +--------------+    |   |
 *    |     |  page struct |    |   |
 *    |     +--------------+    |   |
 *    |     |  page struct | +--+   |
 *    |     +--------------+        |
 *    |     |  page struct | +------+
 *    |     +--------------+
 *    |     |  page struct | No mapping
 *    |     +--------------+
 *    |     |  page struct | No mapping
 *    v     +--------------+
 *
 *              -----------------------------------------
 *              | RELATION BETWEEN STRUCT PAGES AND PFNS |
 *              -----------------------------------------
 *
 * vmemmap  +--------------+                   +---------------+
 *    +     |  page struct | +---------------> |      PFN      |
 *    |     +--------------+                   +---------------+
 *    |     |  page struct | +---------------> |      PFN      |
 *    |     +--------------+                   +---------------+
 *    |     |  page struct | +---------------> |      PFN      |
 *    |     +--------------+                   +---------------+
 *    |     |  page struct | +---------------> |      PFN      |
 *    |     +--------------+                   +---------------+
 *    |     |              |
 *    |     +--------------+
 *    |     |              |
 *    |     +--------------+
 *    |     |              |
 *    |     +--------------+                   +---------------+
 *    |     |  page struct | +---------------> |      PFN      |
 *    |     +--------------+                   +---------------+
 *    |     |              |
 *    |     +--------------+
 *    |     |              |
 *    |     +--------------+                   +---------------+
 *    |     |  page struct | +---------------> |      PFN      |
 *    |     +--------------+                   +---------------+
 *    |     |  page struct | +---------------> |      PFN      |
 *    v     +--------------+                   +---------------+
 */
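
/*
 * Illustrative sketch (not from the original file): because the virtual
 * struct pages above are laid out contiguously from the vmemmap base,
 * pfn <-> struct page conversion is plain pointer arithmetic.  The
 * "example_" helpers below are hypothetical; the generic
 * CONFIG_SPARSEMEM_VMEMMAP pfn_to_page()/page_to_pfn() helpers reduce to
 * essentially this.
 */
static inline struct page *example_pfn_to_page(unsigned long pfn)
{
        /* vmemmap is a virtually contiguous array indexed by PFN */
        return vmemmap + pfn;
}

static inline unsigned long example_page_to_pfn(struct page *page)
{
        return page - vmemmap;
}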

/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
                                           unsigned long page_size,
                                           unsigned long phys)
{
        int rc;

        if ((start + page_size) >= H_VMEMMAP_END) {
                pr_warn("Outside the supported range\n");
                return -1;
        }

        rc = htab_bolt_mapping(start, start + page_size, phys,
                               pgprot_val(PAGE_KERNEL),
                               mmu_vmemmap_psize, mmu_kernel_ssize);
        if (rc < 0) {
                int rc2 = htab_remove_mapping(start, start + page_size,
                                              mmu_vmemmap_psize,
                                              mmu_kernel_ssize);
                BUG_ON(rc2 && (rc2 != -ENOENT));
        }
        return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
                                  unsigned long page_size)
{
        int rc = htab_remove_mapping(start, start + page_size,
                                     mmu_vmemmap_psize,
                                     mmu_kernel_ssize);
        BUG_ON((rc < 0) && (rc != -ENOENT));
        WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page is currently only called by __ioremap.  It adds an
 * entry to the ioremap page table and an entry to the HPT, possibly
 * bolting it.
 */
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
        pgd_t *pgdp;
        p4d_t *p4dp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
        if (slab_is_available()) {
                pgdp = pgd_offset_k(ea);
                p4dp = p4d_offset(pgdp, ea);
                pudp = pud_alloc(&init_mm, p4dp, ea);
                if (!pudp)
                        return -ENOMEM;
                pmdp = pmd_alloc(&init_mm, pudp, ea);
                if (!pmdp)
                        return -ENOMEM;
                ptep = pte_alloc_kernel(pmdp, ea);
                if (!ptep)
                        return -ENOMEM;
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
        } else {
                /*
                 * If the mm subsystem is not fully up, we cannot create a
                 * linux page table entry for this mapping.  Simply bolt an
                 * entry in the hardware page table.
                 */
                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
                                      mmu_io_psize, mmu_kernel_ssize)) {
                        printk(KERN_ERR "Failed to do bolted mapping IO "
                               "memory at %016lx !\n", pa);
                        return -ENOMEM;
                }
        }

        smp_wmb();
        return 0;
}
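
/*
 * Illustrative sketch (not from the original file): an __ioremap-style
 * caller maps a physical range one PAGE_SIZE chunk at a time with
 * hash__map_kernel_page().  The function name and error handling below
 * are hypothetical; they only show the intended calling pattern.
 */
static int __maybe_unused example_map_io_range(unsigned long ea,
                                               unsigned long pa,
                                               unsigned long size,
                                               pgprot_t prot)
{
        unsigned long off;
        int rc;

        for (off = 0; off < size; off += PAGE_SIZE) {
                rc = hash__map_kernel_page(ea + off, pa + off, prot);
                if (rc)
                        return rc;
        }
        return 0;
}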

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                        pmd_t *pmdp, unsigned long clr,
                                        unsigned long set)
{
        __be64 old_be, tmp;
        unsigned long old;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
        assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                and.    %1,%0,%6\n\
                bne-    1b \n\
                andc    %1,%0,%4 \n\
                or      %1,%1,%7\n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
          "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
        : "cc" );

        old = be64_to_cpu(old_be);

        trace_hugepage_update(addr, old, clr, set);
        if (old & H_PAGE_HASHPTE)
                hpte_do_hugepage_flush(mm, addr, pmdp, old);
        return old;
}
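
/*
 * Illustrative sketch (not from the original file): the ldarx/stdcx.
 * sequence above amounts to the C loop below — spin while H_PAGE_BUSY is
 * set, then atomically replace the PMD word with (old & ~clr) | set and
 * return the old value.  example_pmd_update() is hypothetical and skips
 * the cpu_to_be64()/be64_to_cpu() conversions the real assembly performs
 * on the in-memory (big-endian) PTE format.
 */
static inline unsigned long example_pmd_update(pmd_t *pmdp, unsigned long clr,
                                               unsigned long set)
{
        unsigned long *p = (unsigned long *)pmdp;       /* raw PMD word */
        unsigned long old, new;

        for (;;) {
                old = READ_ONCE(*p);
                if (old & H_PAGE_BUSY) {
                        cpu_relax();    /* another CPU owns the entry */
                        continue;
                }
                new = (old & ~clr) | set;
                if (cmpxchg(p, old, new) == old)
                        break;
        }
        return old;
}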

pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                                pmd_t *pmdp)
{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));
        VM_BUG_ON(pmd_devmap(*pmdp));

        pmd = *pmdp;
        pmd_clear(pmdp);
        /*
         * Wait for all pending hash_page to finish. This is needed
         * in case of subpage collapse. When we collapse normal pages
         * to hugepage, we first clear the pmd, then invalidate all
         * the PTE entries. The assumption here is that any low level
         * page fault will see a none pmd and take the slow path that
         * will wait on mmap_lock. But we could very well be in a
         * hash_page with local ptep pointer value. Such a hash page
         * can result in adding new HPTE entries for normal subpages.
         * That means we could be modifying the page content as we
         * copy them to a huge page. So wait for parallel hash_page
         * to finish before invalidating HPTE entries. We can do this
         * by sending an IPI to all the cpus and executing a dummy
         * function there.
         */
        serialize_against_pte_lookup(vma->vm_mm);
        /*
         * Now invalidate the hpte entries in the range
         * covered by pmd. This makes sure we take a
         * fault and will find the pmd as none, which will
         * result in a major fault which takes mmap_lock and
         * hence wait for collapse to complete. Without this
         * the __collapse_huge_page_copy can result in copying
         * the old content.
         */
        flush_hash_table_pmd_range(vma->vm_mm, &pmd, address);
        return pmd;
}
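
/*
 * Illustrative sketch (not from the original file): the "dummy function
 * IPI" described above can be expressed roughly as below — a full barrier
 * followed by a synchronous cross-call to every CPU that may be running
 * this mm, so any hash_page() walker that started with the old ptep has
 * finished by the time the call returns.  The names example_serialize()
 * and example_do_nothing() are hypothetical.
 */
static void example_do_nothing(void *unused)
{
        /* empty on purpose: we only care that the IPI has been handled */
}

static void __maybe_unused example_serialize(struct mm_struct *mm)
{
        smp_mb();       /* order the pmd_clear() before the cross-call */
        smp_call_function_many(mm_cpumask(mm), example_do_nothing, NULL, 1);
}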

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                      pgtable_t pgtable)
{
        pgtable_t *pgtable_slot;

        assert_spin_locked(pmd_lockptr(mm, pmdp));
        /*
         * we store the pgtable in the second half of PMD
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        *pgtable_slot = pgtable;
        /*
         * expose the deposited pgtable to other cpus.
         * before we set the hugepage PTE at pmd level
         * hash fault code looks at the deposited pgtable
         * to store hash index values.
         */
        smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;
        pgtable_t *pgtable_slot;

        assert_spin_locked(pmd_lockptr(mm, pmdp));
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Once we withdraw, mark the entry NULL.
         */
        *pgtable_slot = NULL;
        /*
         * We store HPTE information in the deposited PTE fragment.
         * zero out the content on withdraw.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return pgtable;
}
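
/*
 * Illustrative sketch (not from the original file): the deposit/withdraw
 * helpers above treat the PMD page as two parallel arrays — the first
 * PTRS_PER_PMD slots are the hardware-walked pmd_t entries, the second
 * PTRS_PER_PMD slots hold the deposited pgtable_t pointers.  Assuming
 * sizeof(pmd_t) == sizeof(pgtable_t), the slot that belongs to a given
 * pmdp is found by the same pointer arithmetic the functions above use:
 */
static inline pgtable_t *example_deposit_slot(pmd_t *pmdp)
{
        /* same index as pmdp, but in the second half of the PMD page */
        return (pgtable_t *)pmdp + PTRS_PER_PMD;
}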

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                            pmd_t *pmdp, unsigned long old_pmd)
{
        int ssize;
        unsigned int psize;
        unsigned long vsid;
        unsigned long flags = 0;

        /* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
        psize = get_slice_psize(mm, addr);
        BUG_ON(psize == MMU_PAGE_16M);
#endif
        if (old_pmd & H_PAGE_COMBO)
                psize = MMU_PAGE_4K;
        else
                psize = MMU_PAGE_64K;

        if (!is_kernel_addr(addr)) {
                ssize = user_segment_size(addr);
                vsid = get_user_vsid(&mm->context, addr, ssize);
                WARN_ON(vsid == 0);
        } else {
                vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
                ssize = mmu_kernel_ssize;
        }

        if (mm_is_thread_local(mm))
                flags |= HPTE_LOCAL_UPDATE;

        return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
                                    unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        pgtable_t pgtable;
        unsigned long old;
        pgtable_t *pgtable_slot;

        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
        /*
         * We have pmd == none and we are holding page_table_lock.
         * So we can safely go and clear the pgtable hash
         * index info.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Zero out the old valid bits and hash index details so that
         * the hash fault path does not look at stale values.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return old_pmd;
}

int hash__has_transparent_hugepage(void)
{
        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return 0;
        /*
         * We support THP only if PMD_SIZE is 16MB.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
                return 0;
        /*
         * We need to make sure that we support 16MB hugepage in a segment
         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
         * of 64K.
         */
        /*
         * If we have 64K HPTE, we will be using that by default
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
                return 0;
        /*
         * Ok we only have 4K HPTE
         */
        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_STRICT_KERNEL_RWX

struct change_memory_parms {
        unsigned long start, end, newpp;
        unsigned int step, nr_cpus;
        atomic_t master_cpu;
        atomic_t cpu_counter;
};

// We'd rather this was on the stack but it has to be in the RMO
static struct change_memory_parms chmem_parms;

// And therefore we need a lock to protect it from concurrent use
static DEFINE_MUTEX(chmem_lock);

static void change_memory_range(unsigned long start, unsigned long end,
                                unsigned int step, unsigned long newpp)
{
        unsigned long idx;

        pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
                 start, end, newpp, step);

        for (idx = start; idx < end; idx += step)
                /* Not sure if we can do much with the return value */
                mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
                                                 mmu_kernel_ssize);
}

static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
{
        unsigned long msr, tmp, flags;
        int *p;

        p = &parms->cpu_counter.counter;

        local_irq_save(flags);
        hard_irq_disable();

        asm volatile (
        // Switch to real mode and leave interrupts off
        "mfmsr  %[msr]                  ;"
        "li     %[tmp], %[MSR_IR_DR]    ;"
        "andc   %[tmp], %[msr], %[tmp]  ;"
        "mtmsrd %[tmp]                  ;"

        // Tell the master we are in real mode
        "1:                             "
        "lwarx  %[tmp], 0, %[p]         ;"
        "addic  %[tmp], %[tmp], -1      ;"
        "stwcx. %[tmp], 0, %[p]         ;"
        "bne-   1b                      ;"

        // Spin until the counter goes to zero
        "2:                             ;"
        "lwz    %[tmp], 0(%[p])         ;"
        "cmpwi  %[tmp], 0               ;"
        "bne-   2b                      ;"

        // Switch back to virtual mode
        "mtmsrd %[msr]                  ;"

        : // outputs
          [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
        : // inputs
          [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
        : // clobbers
          "cc", "xer"
        );

        local_irq_restore(flags);

        return 0;
}
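
/*
 * Illustrative sketch (not from the original file): stripped of the
 * real-mode MSR dance, the assembly above is just a counter rendezvous —
 * each secondary atomically decrements cpu_counter and then spins until
 * the master drops it to zero after updating the HPTEs.
 * example_secondary_rendezvous() is hypothetical and stays in virtual
 * mode, which is exactly what the real code must avoid on an LPAR.
 */
static void __maybe_unused example_secondary_rendezvous(struct change_memory_parms *parms)
{
        /* tell the master this CPU has arrived */
        atomic_dec(&parms->cpu_counter);

        /* wait for the master to finish change_memory_range() */
        while (atomic_read(&parms->cpu_counter) != 0)
                cpu_relax();
}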

static int change_memory_range_fn(void *data)
{
        struct change_memory_parms *parms = data;

        // First CPU goes through, all others wait.
        if (atomic_xchg(&parms->master_cpu, 1) == 1)
                return chmem_secondary_loop(parms);

        // Wait for all but one CPU (this one) to call-in
        while (atomic_read(&parms->cpu_counter) > 1)
                barrier();

        change_memory_range(parms->start, parms->end, parms->step, parms->newpp);

        mb();

        // Signal the other CPUs that we're done
        atomic_dec(&parms->cpu_counter);

        return 0;
}

static bool hash__change_memory_range(unsigned long start, unsigned long end,
                                      unsigned long newpp)
{
        unsigned int step, shift;

        shift = mmu_psize_defs[mmu_linear_psize].shift;
        step = 1 << shift;

        start = ALIGN_DOWN(start, step);
        end = ALIGN(end, step); // aligns up

        if (start >= end)
                return false;

        if (firmware_has_feature(FW_FEATURE_LPAR)) {
                mutex_lock(&chmem_lock);

                chmem_parms.start = start;
                chmem_parms.end = end;
                chmem_parms.step = step;
                chmem_parms.newpp = newpp;
                atomic_set(&chmem_parms.master_cpu, 0);

                cpus_read_lock();
                atomic_set(&chmem_parms.cpu_counter, num_online_cpus());

                // Ensure state is consistent before we call the other CPUs
                mb();

                stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
                                        cpu_online_mask);

                cpus_read_unlock();
                mutex_unlock(&chmem_lock);
        } else
                change_memory_range(start, end, step, newpp);

        return true;
}

void hash__mark_rodata_ro(void)
{
        unsigned long start, end, pp;

        start = (unsigned long)_stext;
        end = (unsigned long)__end_rodata;

        pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);

        WARN_ON(!hash__change_memory_range(start, end, pp));
}

void hash__mark_initmem_nx(void)
{
        unsigned long start, end, pp;

        start = (unsigned long)__init_begin;
        end = (unsigned long)__init_end;

        pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);

        WARN_ON(!hash__change_memory_range(start, end, pp));
}
#endif