pgalloc.c

// SPDX-License-Identifier: GPL-2.0
/*
 *  Page table allocation functions
 *
 *    Copyright IBM Corp. 2016
 *    Author(s): Martin Schwidefsky <[email protected]>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_PGSTE

int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
	{
		.procname	= "allocate_pgste",
		.data		= &page_table_allocate_pgste,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO | S_IWUSR,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};

static struct ctl_table page_table_sysctl_dir[] = {
	{
		.procname	= "vm",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= page_table_sysctl,
	},
	{ }
};

static int __init page_table_register_sysctl(void)
{
	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#endif /* CONFIG_PGSTE */
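
/*
 * Illustrative usage note: with the table above registered, the knob shows
 * up as /proc/sys/vm/allocate_pgste and can be toggled with the usual
 * sysctl interfaces, e.g.:
 *
 *	echo 1 > /proc/sys/vm/allocate_pgste
 *
 * which causes new address spaces to allocate their page tables with PGSTEs
 * (the mm_alloc_pgste() path taken by page_table_alloc() below).
 */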

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);

	if (!page)
		return NULL;
	arch_set_page_dat(page, CRST_ALLOC_ORDER);
	return (unsigned long *) page_to_virt(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}

static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		S390_lowcore.user_asce = mm->context.asce;
		__ctl_load(S390_lowcore.user_asce, 7, 7);
	}
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
	}
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
	}

	spin_lock_bh(&mm->page_table_lock);

	/*
	 * This routine gets called with mmap_lock held and there is no
	 * reason to optimize for the case when it is not. However, if that
	 * should ever change, the check below will let us know.
	 */
	VM_BUG_ON(asce_limit != mm->context.asce_limit);

	if (p4d) {
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

#ifdef CONFIG_PGSTE

struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	u64 *table;

	page = alloc_page(GFP_KERNEL);
	if (page) {
		table = (u64 *)page_to_virt(page);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return page;
}

void page_table_free_pgste(struct page *page)
{
	__free_page(page);
}

#endif /* CONFIG_PGSTE */

/*
 * A 2KB-pgtable is either the upper or the lower half of a normal page.
 * The second half of the page may be unused or used as another
 * 2KB-pgtable.
 *
 * Whenever possible the parent page for a new 2KB-pgtable is picked
 * from the list of partially allocated pages mm_context_t::pgtable_list.
 * In case the list is empty a new parent page is allocated and added to
 * the list.
 *
 * When a parent page gets fully allocated it contains 2KB-pgtables in both
 * upper and lower halves and is removed from mm_context_t::pgtable_list.
 *
 * When a 2KB-pgtable is freed from a fully allocated parent page that
 * page turns partially allocated and is added to mm_context_t::pgtable_list.
 *
 * If a 2KB-pgtable is freed from a partially allocated parent page that
 * page turns unused and gets removed from mm_context_t::pgtable_list.
 * Furthermore, the unused parent page is released.
 *
 * As follows from the above, no unallocated or fully allocated parent
 * pages are contained in mm_context_t::pgtable_list.
 *
 * The upper byte (bits 24-31) of the parent page _refcount is used
 * for tracking contained 2KB-pgtables and has the following format:
 *
 *   PP  AA
 * 01234567    upper byte (bits 24-31) of struct page::_refcount
 *   ||  ||
 *   ||  |+--- upper 2KB-pgtable is allocated
 *   ||  +---- lower 2KB-pgtable is allocated
 *   |+------- upper 2KB-pgtable is pending for removal
 *   +-------- lower 2KB-pgtable is pending for removal
 *
 * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
 * using _refcount is possible).
 *
 * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
 * The parent page is either:
 *   - added to mm_context_t::pgtable_list in case the second half of the
 *     parent page is still unallocated;
 *   - removed from mm_context_t::pgtable_list in case both halves of the
 *     parent page are allocated;
 * These operations are protected with mm_context_t::lock.
 *
 * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
 * and the corresponding PP bit is set to 1 in a single atomic operation.
 * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
 * exclusive and may never both be set to 1!
 * The parent page is either:
 *   - added to mm_context_t::pgtable_list in case the second half of the
 *     parent page is still allocated;
 *   - removed from mm_context_t::pgtable_list in case the second half of
 *     the parent page is unallocated;
 * These operations are protected with mm_context_t::lock.
 *
 * It is important to understand that mm_context_t::lock only protects
 * mm_context_t::pgtable_list and AA bits, but not the parent page itself
 * and PP bits.
 *
 * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
 * while both AA bits and the second PP bit are already unset. Then the
 * parent page does not contain any 2KB-pgtable fragment anymore, and it has
 * also been removed from mm_context_t::pgtable_list. It is therefore safe
 * to release the page.
 *
 * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
 * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
 * while the PP bits are never used, nor is such a page added to or removed
 * from mm_context_t::pgtable_list.
 */
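
/*
 * A minimal worked example of the transitions described above. The helper
 * below is an illustrative sketch only: it is not used anywhere in this
 * file and models the upper byte of _refcount with a plain unsigned int.
 * The XOR constants mirror those used by page_table_alloc(),
 * page_table_free() and __tlb_remove_table().
 */
static inline unsigned int pgtable_refcount_byte_example(void)
{
	unsigned int byte = 0x00U; /* fresh parent page, no 2KB-pgtables */

	byte ^= 0x01U;	/* alloc first 2K fragment         -> 0x01, page on list */
	byte ^= 0x02U;	/* alloc second 2K fragment        -> 0x03, page off list */
	byte ^= 0x11U;	/* free first fragment (pending)   -> 0x12, page on list */
	byte ^= 0x10U;	/* first fragment released         -> 0x02 */
	byte ^= 0x22U;	/* free second fragment (pending)  -> 0x20, page off list */
	byte ^= 0x20U;	/* second fragment released        -> 0x00 */
	return byte;	/* 0x00: the parent page holds no fragments and can go */
}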

unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_refcount) >> 24;
			/*
			 * The pending removal bits must also be checked.
			 * Failure to do so might lead to an impossible
			 * value (e.g. 0x13 or 0x23) written to _refcount.
			 * Such values violate the assumption that pending and
			 * allocation bits are mutually exclusive, and the rest
			 * of the code goes off the rails as a result. That
			 * could lead to a whole bunch of races and corruptions.
			 */
			mask = (mask | (mask >> 4)) & 0x03U;
			if (mask != 0x03U) {
				table = (unsigned long *) page_to_virt(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_refcount,
						0x01U << (bit + 24));
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return NULL;
	if (!pgtable_pte_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	arch_set_page_dat(page, 0);
	/* Initialize page table */
	table = (unsigned long *) page_to_virt(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_xor_bits(&page->_refcount, 0x03U << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&page->_refcount, 0x01U << 24);
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		spin_lock_bh(&mm->context.lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}

static void page_table_release_check(struct page *page, void *table,
				     unsigned int half, unsigned int mask)
{
	char msg[128];

	if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
		return;
	snprintf(msg, sizeof(msg),
		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
		 table, half, mask);
	dump_page(page, msg);
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned int mask, bit, half;
	struct page *page;

	page = virt_to_page(table);
	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE * sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		/*
		 * Mark the page for delayed release. The actual release
		 * will happen outside of the critical section from this
		 * function or from __tlb_remove_table().
		 */
		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		if (mask & 0x03U)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		half = 0x01U << bit;
	} else {
		half = 0x03U;
		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
		mask >>= 24;
	}

	page_table_release_check(page, table, half, mask);
	pgtable_pte_page_dtor(page);
	__free_page(page);
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = virt_to_page(table);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) ((unsigned long)table | 0x03U);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE * sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	/*
	 * Mark the page for delayed release. The actual release will happen
	 * outside of the critical section from __tlb_remove_table() or from
	 * page_table_free().
	 */
	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	if (mask & 0x03U)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.lock);
	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = virt_to_page(table);

	switch (half) {
	case 0x00U:	/* pmd, pud, or p4d */
		free_pages((unsigned long)table, CRST_ALLOC_ORDER);
		return;
	case 0x01U:	/* lower 2K of a 4K page table */
	case 0x02U:	/* higher 2K of a 4K page table */
		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		break;
	case 0x03U:	/* 4K page table with pgstes */
		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
		mask >>= 24;
		break;
	}

	page_table_release_check(page, table, half, mask);
	pgtable_pte_page_dtor(page);
	__free_page(page);
}
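
/*
 * Illustrative sketch: page tables are at least 2KB aligned, so bits 0-1 of
 * a table address are always zero, and page_table_free_rcu() reuses them to
 * carry the fragment mask that __tlb_remove_table() decodes above. The two
 * helpers below are hypothetical, are not used anywhere in this file, and
 * only spell out the tag and untag steps.
 */
static inline void *pgtable_tag_example(void *table, unsigned int mask)
{
	/* stash the 2-bit fragment mask in the unused low bits */
	return (void *)((unsigned long) table | (mask & 0x03U));
}

static inline unsigned int pgtable_untag_example(void **table)
{
	unsigned int mask = (unsigned long) *table & 0x03U;

	/* clear the tag again to recover the real table address */
	*table = (void *)((unsigned long) *table ^ mask);
	return mask;
}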

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

static unsigned long *base_pgt_alloc(void)
{
	unsigned long *table;

	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
	if (table)
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
	return table;
}

static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}

static unsigned long *base_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
	if (table)
		crst_table_init(table, val);
	return table;
}

static void base_crst_free(unsigned long *table)
{
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}

#define BASE_ADDR_END_FUNC(NAME, SIZE)					\
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
						   unsigned long end)	\
{									\
	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
									\
	return (next - 1) < (end - 1) ? next : end;			\
}

BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)

static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}

static int base_page_walk(unsigned long *origin, unsigned long addr,
			  unsigned long end, int alloc)
{
	unsigned long *pte, next;

	if (!alloc)
		return 0;
	pte = origin;
	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
	do {
		next = base_page_addr_end(addr, end);
		*pte = base_lra(addr);
	} while (pte++, addr = next, addr < end);
	return 0;
}

static int base_segment_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *ste, next, *table;
	int rc;

	ste = origin;
	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
	do {
		next = base_segment_addr_end(addr, end);
		if (*ste & _SEGMENT_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_pgt_alloc();
			if (!table)
				return -ENOMEM;
			*ste = __pa(table) | _SEGMENT_ENTRY;
		}
		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
		rc = base_page_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_pgt_free(table);
		cond_resched();
	} while (ste++, addr = next, addr < end);
	return 0;
}

static int base_region3_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rtte, next, *table;
	int rc;

	rtte = origin;
	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
	do {
		next = base_region3_addr_end(addr, end);
		if (*rtte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rtte = __pa(table) | _REGION3_ENTRY;
		}
		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
		rc = base_segment_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rtte++, addr = next, addr < end);
	return 0;
}

static int base_region2_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rste, next, *table;
	int rc;

	rste = origin;
	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
	do {
		next = base_region2_addr_end(addr, end);
		if (*rste & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rste = __pa(table) | _REGION2_ENTRY;
		}
		table = __va(*rste & _REGION_ENTRY_ORIGIN);
		rc = base_region3_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rste++, addr = next, addr < end);
	return 0;
}

static int base_region1_walk(unsigned long *origin, unsigned long addr,
			     unsigned long end, int alloc)
{
	unsigned long *rfte, next, *table;
	int rc;

	rfte = origin;
	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
	do {
		next = base_region1_addr_end(addr, end);
		if (*rfte & _REGION_ENTRY_INVALID) {
			if (!alloc)
				continue;
			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!table)
				return -ENOMEM;
			*rfte = __pa(table) | _REGION1_ENTRY;
		}
		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
		rc = base_region2_walk(table, addr, next, alloc);
		if (rc)
			return rc;
		if (!alloc)
			base_crst_free(table);
	} while (rfte++, addr = next, addr < end);
	return 0;
}

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	if (!asce)
		return;
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_SEGMENT:
		base_segment_walk(table, 0, _REGION3_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION3:
		base_region3_walk(table, 0, _REGION2_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION2:
		base_region2_walk(table, 0, _REGION1_SIZE, 0);
		break;
	case _ASCE_TYPE_REGION1:
		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
		break;
	}
	base_crst_free(table);
}

static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. The difference is
 * that the returned asce does not make use of any enhanced DAT features
 * (e.g. large pages). This is required for some I/O functions that pass an
 * asce, e.g. some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 * used for I/O requests. TLB entries that would result from attaching the
 * asce to a cpu are never cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
	unsigned long asce, *table, end;
	int rc;

	if (base_pgt_cache_init())
		return 0;
	end = addr + num_pages * PAGE_SIZE;
	if (end <= _REGION3_SIZE) {
		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_segment_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION2_SIZE) {
		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region3_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
	} else if (end <= _REGION1_SIZE) {
		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region2_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
	} else {
		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
		if (!table)
			return 0;
		rc = base_region1_walk(table, addr, end, 1);
		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
	}
	if (rc) {
		base_asce_free(asce);
		asce = 0;
	}
	return asce;
}
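
/*
 * Usage sketch: the function below is hypothetical and not used anywhere in
 * this file. It only shows the intended pairing of base_asce_alloc() and
 * base_asce_free() for a buffer whose asce accompanies an I/O or service
 * call request; the asce is never attached to a cpu.
 */
static inline int base_asce_usage_example(void *buf, unsigned long num_pages)
{
	unsigned long asce;

	asce = base_asce_alloc((unsigned long) buf, num_pages);
	if (!asce)
		return -ENOMEM;
	/* ... pass asce along with the I/O or service call request ... */
	base_asce_free(asce);
	return 0;
}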