mmu.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 *
 * Authors:
 *	Anup Patel <[email protected]>
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kvm_host.h>
#include <linux/sched/signal.h>
#include <asm/csr.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#ifdef CONFIG_64BIT
static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
static unsigned long gstage_pgd_levels = 3;
#define gstage_index_bits	9
#else
static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
static unsigned long gstage_pgd_levels = 2;
#define gstage_index_bits	10
#endif

#define gstage_pgd_xbits	2
#define gstage_pgd_size		(1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
#define gstage_gpa_bits		(HGATP_PAGE_SHIFT + \
				 (gstage_pgd_levels * gstage_index_bits) + \
				 gstage_pgd_xbits)
#define gstage_gpa_size		((gpa_t)(1ULL << gstage_gpa_bits))

#define gstage_pte_leaf(__ptep)	\
	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
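
/*
 * Index into the page table at @level for guest physical address @addr.
 * The root-level table is widened by gstage_pgd_xbits, so its index mask
 * covers four times as many entries as the lower levels.
 */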
static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
{
	unsigned long mask;
	unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);

	if (level == (gstage_pgd_levels - 1))
		mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
	else
		mask = PTRS_PER_PTE - 1;

	return (addr >> shift) & mask;
}

static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
{
	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
}
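
/*
 * Convert a leaf mapping size into the G-stage page table level that
 * provides it; returns -EINVAL for sizes with no corresponding level.
 */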
static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
{
	u32 i;
	unsigned long psz = 1UL << 12;

	for (i = 0; i < gstage_pgd_levels; i++) {
		if (page_size == (psz << (i * gstage_index_bits))) {
			*out_level = i;
			return 0;
		}
	}

	return -EINVAL;
}

static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
{
	if (gstage_pgd_levels < level)
		return -EINVAL;

	*out_pgorder = 12 + (level * gstage_index_bits);
	return 0;
}

static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
{
	int rc;
	unsigned long page_order = PAGE_SHIFT;

	rc = gstage_level_to_page_order(level, &page_order);
	if (rc)
		return rc;

	*out_pgsize = BIT(page_order);
	return 0;
}
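
/*
 * Walk the G-stage page table for @addr and return the leaf PTE and its
 * level via @ptepp and @ptep_level if a valid leaf mapping exists;
 * returns false otherwise.
 */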
static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
				  pte_t **ptepp, u32 *ptep_level)
{
	pte_t *ptep;
	u32 current_level = gstage_pgd_levels - 1;

	*ptep_level = current_level;
	ptep = (pte_t *)kvm->arch.pgd;
	ptep = &ptep[gstage_pte_index(addr, current_level)];
	while (ptep && pte_val(*ptep)) {
		if (gstage_pte_leaf(ptep)) {
			*ptep_level = current_level;
			*ptepp = ptep;
			return true;
		}

		if (current_level) {
			current_level--;
			*ptep_level = current_level;
			ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
			ptep = &ptep[gstage_pte_index(addr, current_level)];
		} else {
			ptep = NULL;
		}
	}

	return false;
}

static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
{
	unsigned long order = PAGE_SHIFT;

	if (gstage_level_to_page_order(level, &order))
		return;

	addr &= ~(BIT(order) - 1);
	kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
}
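
/*
 * Install @new_pte at @level for @addr, allocating intermediate page
 * tables from @pcache as needed and flushing remote TLBs when a leaf
 * entry is written.
 */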
static int gstage_set_pte(struct kvm *kvm, u32 level,
			  struct kvm_mmu_memory_cache *pcache,
			  gpa_t addr, const pte_t *new_pte)
{
	u32 current_level = gstage_pgd_levels - 1;
	pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
	pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];

	if (current_level < level)
		return -EINVAL;

	while (current_level != level) {
		if (gstage_pte_leaf(ptep))
			return -EEXIST;

		if (!pte_val(*ptep)) {
			if (!pcache)
				return -ENOMEM;
			next_ptep = kvm_mmu_memory_cache_alloc(pcache);
			if (!next_ptep)
				return -ENOMEM;
			*ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)),
					__pgprot(_PAGE_TABLE));
		} else {
			if (gstage_pte_leaf(ptep))
				return -EEXIST;
			next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
		}

		current_level--;
		ptep = &next_ptep[gstage_pte_index(addr, current_level)];
	}

	*ptep = *new_pte;
	if (gstage_pte_leaf(ptep))
		gstage_remote_tlb_flush(kvm, current_level, addr);

	return 0;
}
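
/*
 * Create a G-stage mapping of @page_size bytes from @gpa to @hpa with the
 * requested read-only/executable permissions. The 'A' and 'D' bits are
 * always set up front (see the comment inside).
 */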
static int gstage_map_page(struct kvm *kvm,
			   struct kvm_mmu_memory_cache *pcache,
			   gpa_t gpa, phys_addr_t hpa,
			   unsigned long page_size,
			   bool page_rdonly, bool page_exec)
{
	int ret;
	u32 level = 0;
	pte_t new_pte;
	pgprot_t prot;

	ret = gstage_page_size_to_level(page_size, &level);
	if (ret)
		return ret;

	/*
	 * A RISC-V implementation can choose to either:
	 * 1) Update 'A' and 'D' PTE bits in hardware
	 * 2) Generate page fault when 'A' and/or 'D' bits are not set in
	 *    the PTE so that software can update these bits.
	 *
	 * We support both options mentioned above. To achieve this, we
	 * always set 'A' and 'D' PTE bits at time of creating G-stage
	 * mapping. To support KVM dirty page logging with both options
	 * mentioned above, we will write-protect G-stage PTEs to track
	 * dirty pages.
	 */

	if (page_exec) {
		if (page_rdonly)
			prot = PAGE_READ_EXEC;
		else
			prot = PAGE_WRITE_EXEC;
	} else {
		if (page_rdonly)
			prot = PAGE_READ;
		else
			prot = PAGE_WRITE;
	}
	new_pte = pfn_pte(PFN_DOWN(hpa), prot);
	new_pte = pte_mkdirty(new_pte);

	return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
}

enum gstage_op {
	GSTAGE_OP_NOP = 0,	/* Nothing */
	GSTAGE_OP_CLEAR,	/* Clear/Unmap */
	GSTAGE_OP_WP,		/* Write-protect */
};
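
/*
 * Apply @op (clear or write-protect) to the PTE at @ptep, recursing into
 * lower-level tables for non-leaf entries and flushing remote TLBs for
 * every leaf that is modified.
 */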
static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
			  pte_t *ptep, u32 ptep_level, enum gstage_op op)
{
	int i, ret;
	pte_t *next_ptep;
	u32 next_ptep_level;
	unsigned long next_page_size, page_size;

	ret = gstage_level_to_page_size(ptep_level, &page_size);
	if (ret)
		return;

	BUG_ON(addr & (page_size - 1));

	if (!pte_val(*ptep))
		return;

	if (ptep_level && !gstage_pte_leaf(ptep)) {
		next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
		next_ptep_level = ptep_level - 1;
		ret = gstage_level_to_page_size(next_ptep_level,
						&next_page_size);
		if (ret)
			return;

		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		for (i = 0; i < PTRS_PER_PTE; i++)
			gstage_op_pte(kvm, addr + i * next_page_size,
				      &next_ptep[i], next_ptep_level, op);
		if (op == GSTAGE_OP_CLEAR)
			put_page(virt_to_page(next_ptep));
	} else {
		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		else if (op == GSTAGE_OP_WP)
			set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE));
		gstage_remote_tlb_flush(kvm, ptep_level, addr);
	}
}
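
/*
 * Unmap all leaf mappings in the GPA range [start, start + size). With
 * @may_block the walk periodically drops kvm->mmu_lock via
 * cond_resched_lock() so that large ranges do not trigger lockup warnings.
 */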
static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
			       gpa_t size, bool may_block)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	unsigned long page_size;
	gpa_t addr = start, end = start + size;

	while (addr < end) {
		found_leaf = gstage_get_leaf_entry(kvm, addr,
						   &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			gstage_op_pte(kvm, addr, ptep,
				      ptep_level, GSTAGE_OP_CLEAR);

next:
		addr += page_size;

		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (may_block && addr < end)
			cond_resched_lock(&kvm->mmu_lock);
	}
}
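
/*
 * Write-protect all leaf mappings in the GPA range [start, end) so that
 * subsequent guest writes fault and can be recorded for dirty logging.
 */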
static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	gpa_t addr = start;
	unsigned long page_size;

	while (addr < end) {
		found_leaf = gstage_get_leaf_entry(kvm, addr,
						   &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			gstage_op_pte(kvm, addr, ptep,
				      ptep_level, GSTAGE_OP_WP);

next:
		addr += page_size;
	}
}

static void gstage_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	gstage_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}
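
/*
 * Map a host MMIO region at @hpa into the guest physical address space at
 * @gpa, one 4K page at a time. When @in_atomic is set, the page table
 * cache is topped up with GFP_ATOMIC allocations.
 */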
int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
			     phys_addr_t hpa, unsigned long size,
			     bool writable, bool in_atomic)
{
	pte_t pte;
	int ret = 0;
	unsigned long pfn;
	phys_addr_t addr, end;
	struct kvm_mmu_memory_cache pcache = {
		.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
		.gfp_zero = __GFP_ZERO,
	};

	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(hpa);

	for (addr = gpa; addr < end; addr += PAGE_SIZE) {
		pte = pfn_pte(pfn, PAGE_KERNEL_IO);

		if (!writable)
			pte = pte_wrprotect(pte);

		ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
		if (ret)
			goto out;

		spin_lock(&kvm->mmu_lock);
		ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	kvm_mmu_free_memory_cache(&pcache);
	return ret;
}

void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
{
	spin_lock(&kvm->mmu_lock);
	gstage_unmap_range(kvm, gpa, size, false);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset,
					     unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	gstage_wp_range(kvm, start, end);
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_riscv_gstage_free_pgd(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	gstage_unmap_range(kvm, gpa, size, false);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while
	 * the memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
		gstage_wp_memory_region(kvm, new->id);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end, size;
	gpa_t base_gpa;
	bool writable;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the
	 * GPA space addressable by the KVM guest.
	 */
	if ((new->base_gfn + new->npages) >=
	    (gstage_gpa_size >> PAGE_SHIFT))
		return -EFAULT;

	hva = new->userspace_addr;
	size = new->npages << PAGE_SHIFT;
	reg_end = hva + size;
	base_gpa = new->base_gfn << PAGE_SHIFT;
	writable = !(new->flags & KVM_MEM_READONLY);

	mmap_read_lock(current->mm);

	/*
	 * A memory region could potentially cover multiple VMAs, and
	 * any holes between them, so iterate over all of them to find
	 * out if we can map any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/* Take the intersection of this VMA with the memory region */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = base_gpa + (vm_start - hva);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa,
						       vm_end - vm_start,
						       writable, false);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	spin_lock(&kvm->mmu_lock);
	if (ret)
		gstage_unmap_range(kvm, base_gpa, size, false);
	spin_unlock(&kvm->mmu_lock);

out:
	mmap_read_unlock(current->mm);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.pgd)
		return false;

	gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
			   (range->end - range->start) << PAGE_SHIFT,
			   range->may_block);
	return false;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	int ret;
	kvm_pfn_t pfn = pte_pfn(range->pte);

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(range->end - range->start != 1);

	ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT,
			      __pfn_to_phys(pfn), PAGE_SIZE, true, true);
	if (ret) {
		kvm_debug("Failed to map G-stage page (error %d)\n", ret);
		return true;
	}

	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	pte_t *ptep;
	u32 ptep_level = 0;
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);

	if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
				   &ptep, &ptep_level))
		return false;

	return ptep_test_and_clear_young(NULL, 0, ptep);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	pte_t *ptep;
	u32 ptep_level = 0;
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);

	if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
				   &ptep, &ptep_level))
		return false;

	return pte_young(*ptep);
}
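
/*
 * Handle a G-stage page fault at @gpa: resolve @hva to a host PFN, pick
 * the mapping size from the backing VMA (forced to 4K while dirty logging
 * is active or for PFNMAP VMAs), and install the mapping, retrying later
 * if an MMU invalidation raced with this fault.
 */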
int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
			 struct kvm_memory_slot *memslot,
			 gpa_t gpa, unsigned long hva, bool is_write)
{
	int ret;
	kvm_pfn_t hfn;
	bool writable;
	short vma_pageshift;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
	bool logging = (memslot->dirty_bitmap &&
			!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
	unsigned long vma_pagesize, mmu_seq;

	/* We need minimum second+third level pages */
	ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
	if (ret) {
		kvm_err("Failed to topup G-stage cache\n");
		return ret;
	}

	mmap_read_lock(current->mm);

	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma))
		vma_pageshift = huge_page_shift(hstate_vma(vma));
	else
		vma_pageshift = PAGE_SHIFT;
	vma_pagesize = 1ULL << vma_pageshift;
	if (logging || (vma->vm_flags & VM_PFNMAP))
		vma_pagesize = PAGE_SIZE;

	if (vma_pagesize == PMD_SIZE || vma_pagesize == PGDIR_SIZE)
		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or gfn_to_pfn_prot() become stale prior to acquiring
	 * kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	if (vma_pagesize != PGDIR_SIZE &&
	    vma_pagesize != PMD_SIZE &&
	    vma_pagesize != PAGE_SIZE) {
		kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
		return -EFAULT;
	}

	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable);
	if (hfn == KVM_PFN_ERR_HWPOISON) {
		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
				vma_pageshift, current);
		return 0;
	}
	if (is_error_noslot_pfn(hfn))
		return -EFAULT;

	/*
	 * If logging is active then we allow writable pages only
	 * for write faults.
	 */
	if (logging && !is_write)
		writable = false;

	spin_lock(&kvm->mmu_lock);

	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	if (writable) {
		kvm_set_pfn_dirty(hfn);
		mark_page_dirty(kvm, gfn);
		ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
				      vma_pagesize, false, true);
	} else {
		ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
				      vma_pagesize, true, true);
	}

	if (ret)
		kvm_err("Failed to map in G-stage\n");

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(hfn);
	kvm_release_pfn_clean(hfn);
	return ret;
}
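
/*
 * Allocate the quadruple-size G-stage root page table (gstage_pgd_size
 * bytes, i.e. 4 x 4K pages for the hgatp *X4 modes) and record its
 * virtual and physical addresses in kvm->arch.
 */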
int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
{
	struct page *pgd_page;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
				get_order(gstage_pgd_size));
	if (!pgd_page)
		return -ENOMEM;
	kvm->arch.pgd = page_to_virt(pgd_page);
	kvm->arch.pgd_phys = page_to_phys(pgd_page);

	return 0;
}

void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
		kvm->arch.pgd_phys = 0;
	}
	spin_unlock(&kvm->mmu_lock);

	if (pgd)
		free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
}
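
/*
 * Program the hgatp CSR with the detected G-stage mode, the VM's VMID and
 * the root page table PPN; flush the local G-stage TLB when no VMID bits
 * are available.
 */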
void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
{
	unsigned long hgatp = gstage_mode;
	struct kvm_arch *k = &vcpu->kvm->arch;

	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) &
		 HGATP_VMID_MASK;
	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;

	csr_write(CSR_HGATP, hgatp);

	if (!kvm_riscv_gstage_vmid_bits())
		kvm_riscv_local_hfence_gvma_all();
}
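
/*
 * Probe the widest G-stage mode supported by the hardware: on 64-bit, try
 * Sv57x4 first and fall back to Sv48x4, otherwise keep the default
 * Sv39x4 (or Sv32x4 on 32-bit).
 */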
void kvm_riscv_gstage_mode_detect(void)
{
#ifdef CONFIG_64BIT
	/* Try Sv57x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
		gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
		gstage_pgd_levels = 5;
		goto skip_sv48x4_test;
	}

	/* Try Sv48x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
		gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
		gstage_pgd_levels = 4;
	}
skip_sv48x4_test:

	csr_write(CSR_HGATP, 0);
	kvm_riscv_local_hfence_gvma_all();
#endif
}

unsigned long kvm_riscv_gstage_mode(void)
{
	return gstage_mode >> HGATP_MODE_SHIFT;
}

int kvm_riscv_gstage_gpa_bits(void)
{
	return gstage_gpa_bits;
}