virt.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Device driver to expose SGX enclave memory to KVM guests.
  4. *
  5. * Copyright(c) 2021 Intel Corporation.
  6. */
  7. #include <linux/miscdevice.h>
  8. #include <linux/mm.h>
  9. #include <linux/mman.h>
  10. #include <linux/sched/mm.h>
  11. #include <linux/sched/signal.h>
  12. #include <linux/slab.h>
  13. #include <linux/xarray.h>
  14. #include <asm/sgx.h>
  15. #include <uapi/asm/sgx.h>
  16. #include "encls.h"
  17. #include "sgx.h"
  18. struct sgx_vepc {
  19. struct xarray page_array;
  20. struct mutex lock;
  21. };
  22. /*
  23. * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
  24. * virtual EPC instances, and the lock to protect it.
  25. */
  26. static struct mutex zombie_secs_pages_lock;
  27. static struct list_head zombie_secs_pages;
  28. static int __sgx_vepc_fault(struct sgx_vepc *vepc,
  29. struct vm_area_struct *vma, unsigned long addr)
  30. {
  31. struct sgx_epc_page *epc_page;
  32. unsigned long index, pfn;
  33. int ret;
  34. WARN_ON(!mutex_is_locked(&vepc->lock));
  35. /* Calculate index of EPC page in virtual EPC's page_array */
  36. index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
  37. epc_page = xa_load(&vepc->page_array, index);
  38. if (epc_page)
  39. return 0;
  40. epc_page = sgx_alloc_epc_page(vepc, false);
  41. if (IS_ERR(epc_page))
  42. return PTR_ERR(epc_page);
  43. ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
  44. if (ret)
  45. goto err_free;
  46. pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
  47. ret = vmf_insert_pfn(vma, addr, pfn);
  48. if (ret != VM_FAULT_NOPAGE) {
  49. ret = -EFAULT;
  50. goto err_delete;
  51. }
  52. return 0;
  53. err_delete:
  54. xa_erase(&vepc->page_array, index);
  55. err_free:
  56. sgx_free_epc_page(epc_page);
  57. return ret;
  58. }
  59. static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
  60. {
  61. struct vm_area_struct *vma = vmf->vma;
  62. struct sgx_vepc *vepc = vma->vm_private_data;
  63. int ret;
  64. mutex_lock(&vepc->lock);
  65. ret = __sgx_vepc_fault(vepc, vma, vmf->address);
  66. mutex_unlock(&vepc->lock);
  67. if (!ret)
  68. return VM_FAULT_NOPAGE;
  69. if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
  70. mmap_read_unlock(vma->vm_mm);
  71. return VM_FAULT_RETRY;
  72. }
  73. return VM_FAULT_SIGBUS;
  74. }
  75. static const struct vm_operations_struct sgx_vepc_vm_ops = {
  76. .fault = sgx_vepc_fault,
  77. };
  78. static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
  79. {
  80. struct sgx_vepc *vepc = file->private_data;
  81. if (!(vma->vm_flags & VM_SHARED))
  82. return -EINVAL;
  83. vma->vm_ops = &sgx_vepc_vm_ops;
  84. /* Don't copy VMA in fork() */
  85. vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
  86. vma->vm_private_data = vepc;
  87. return 0;
  88. }
  /*
   * EREMOVE a page that was previously owned by a guest, as the first step
   * of returning it to the general EPC page pool.
   *
   * The guest cannot be trusted to have left the page in a good state, so
   * EREMOVE is executed unconditionally.  If the guest already EREMOVE'd the
   * page properly, the extra EREMOVE is harmless.
   *
   * Return: the result of __eremove().
   */
  static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
  {
  	void *epc_addr = sgx_get_epc_virt_addr(epc_page);

  	return __eremove(epc_addr);
  }
  102. static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
  103. {
  104. int ret = sgx_vepc_remove_page(epc_page);
  105. if (ret) {
  106. /*
  107. * Only SGX_CHILD_PRESENT is expected, which is because of
  108. * EREMOVE'ing an SECS still with child, in which case it can
  109. * be handled by EREMOVE'ing the SECS again after all pages in
  110. * virtual EPC have been EREMOVE'd. See comments in below in
  111. * sgx_vepc_release().
  112. *
  113. * The user of virtual EPC (KVM) needs to guarantee there's no
  114. * logical processor is still running in the enclave in guest,
  115. * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
  116. * handled here.
  117. */
  118. WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
  119. ret, ret);
  120. return ret;
  121. }
  122. sgx_free_epc_page(epc_page);
  123. return 0;
  124. }
  125. static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
  126. {
  127. struct sgx_epc_page *entry;
  128. unsigned long index;
  129. long failures = 0;
  130. xa_for_each(&vepc->page_array, index, entry) {
  131. int ret = sgx_vepc_remove_page(entry);
  132. if (ret) {
  133. if (ret == SGX_CHILD_PRESENT) {
  134. /* The page is a SECS, userspace will retry. */
  135. failures++;
  136. } else {
  137. /*
  138. * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
  139. * WARN, as userspace can induce said failures by
  140. * calling the ioctl concurrently on multiple vEPCs or
  141. * while one or more CPUs is running the enclave. Only
  142. * a #PF on EREMOVE indicates a kernel/hardware issue.
  143. */
  144. WARN_ON_ONCE(encls_faulted(ret) &&
  145. ENCLS_TRAPNR(ret) != X86_TRAP_GP);
  146. return -EBUSY;
  147. }
  148. }
  149. cond_resched();
  150. }
  151. /*
  152. * Return the number of SECS pages that failed to be removed, so
  153. * userspace knows that it has to retry.
  154. */
  155. return failures;
  156. }
  157. static int sgx_vepc_release(struct inode *inode, struct file *file)
  158. {
  159. struct sgx_vepc *vepc = file->private_data;
  160. struct sgx_epc_page *epc_page, *tmp, *entry;
  161. unsigned long index;
  162. LIST_HEAD(secs_pages);
  163. xa_for_each(&vepc->page_array, index, entry) {
  164. /*
  165. * Remove all normal, child pages. sgx_vepc_free_page()
  166. * will fail if EREMOVE fails, but this is OK and expected on
  167. * SECS pages. Those can only be EREMOVE'd *after* all their
  168. * child pages. Retries below will clean them up.
  169. */
  170. if (sgx_vepc_free_page(entry))
  171. continue;
  172. xa_erase(&vepc->page_array, index);
  173. cond_resched();
  174. }
  175. /*
  176. * Retry EREMOVE'ing pages. This will clean up any SECS pages that
  177. * only had children in this 'epc' area.
  178. */
  179. xa_for_each(&vepc->page_array, index, entry) {
  180. epc_page = entry;
  181. /*
  182. * An EREMOVE failure here means that the SECS page still
  183. * has children. But, since all children in this 'sgx_vepc'
  184. * have been removed, the SECS page must have a child on
  185. * another instance.
  186. */
  187. if (sgx_vepc_free_page(epc_page))
  188. list_add_tail(&epc_page->list, &secs_pages);
  189. xa_erase(&vepc->page_array, index);
  190. cond_resched();
  191. }
  192. /*
  193. * SECS pages are "pinned" by child pages, and "unpinned" once all
  194. * children have been EREMOVE'd. A child page in this instance
  195. * may have pinned an SECS page encountered in an earlier release(),
  196. * creating a zombie. Since some children were EREMOVE'd above,
  197. * try to EREMOVE all zombies in the hopes that one was unpinned.
  198. */
  199. mutex_lock(&zombie_secs_pages_lock);
  200. list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
  201. /*
  202. * Speculatively remove the page from the list of zombies,
  203. * if the page is successfully EREMOVE'd it will be added to
  204. * the list of free pages. If EREMOVE fails, throw the page
  205. * on the local list, which will be spliced on at the end.
  206. */
  207. list_del(&epc_page->list);
  208. if (sgx_vepc_free_page(epc_page))
  209. list_add_tail(&epc_page->list, &secs_pages);
  210. cond_resched();
  211. }
  212. if (!list_empty(&secs_pages))
  213. list_splice_tail(&secs_pages, &zombie_secs_pages);
  214. mutex_unlock(&zombie_secs_pages_lock);
  215. xa_destroy(&vepc->page_array);
  216. kfree(vepc);
  217. return 0;
  218. }
  219. static int sgx_vepc_open(struct inode *inode, struct file *file)
  220. {
  221. struct sgx_vepc *vepc;
  222. vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
  223. if (!vepc)
  224. return -ENOMEM;
  225. mutex_init(&vepc->lock);
  226. xa_init(&vepc->page_array);
  227. file->private_data = vepc;
  228. return 0;
  229. }
  230. static long sgx_vepc_ioctl(struct file *file,
  231. unsigned int cmd, unsigned long arg)
  232. {
  233. struct sgx_vepc *vepc = file->private_data;
  234. switch (cmd) {
  235. case SGX_IOC_VEPC_REMOVE_ALL:
  236. if (arg)
  237. return -EINVAL;
  238. return sgx_vepc_remove_all(vepc);
  239. default:
  240. return -ENOTTY;
  241. }
  242. }
  243. static const struct file_operations sgx_vepc_fops = {
  244. .owner = THIS_MODULE,
  245. .open = sgx_vepc_open,
  246. .unlocked_ioctl = sgx_vepc_ioctl,
  247. .compat_ioctl = sgx_vepc_ioctl,
  248. .release = sgx_vepc_release,
  249. .mmap = sgx_vepc_mmap,
  250. };
  251. static struct miscdevice sgx_vepc_dev = {
  252. .minor = MISC_DYNAMIC_MINOR,
  253. .name = "sgx_vepc",
  254. .nodename = "sgx_vepc",
  255. .fops = &sgx_vepc_fops,
  256. };
  257. int __init sgx_vepc_init(void)
  258. {
  259. /* SGX virtualization requires KVM to work */
  260. if (!cpu_feature_enabled(X86_FEATURE_VMX))
  261. return -ENODEV;
  262. INIT_LIST_HEAD(&zombie_secs_pages);
  263. mutex_init(&zombie_secs_pages_lock);
  264. return misc_register(&sgx_vepc_dev);
  265. }
  266. /**
  267. * sgx_virt_ecreate() - Run ECREATE on behalf of guest
  268. * @pageinfo: Pointer to PAGEINFO structure
  269. * @secs: Userspace pointer to SECS page
  270. * @trapnr: trap number injected to guest in case of ECREATE error
  271. *
  272. * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
  273. * of enforcing policies of guest's enclaves, and return the trap number
  274. * which should be injected to guest in case of any ECREATE error.
  275. *
  276. * Return:
  277. * - 0: ECREATE was successful.
  278. * - <0: on error.
  279. */
  280. int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
  281. int *trapnr)
  282. {
  283. int ret;
  284. /*
  285. * @secs is an untrusted, userspace-provided address. It comes from
  286. * KVM and is assumed to be a valid pointer which points somewhere in
  287. * userspace. This can fault and call SGX or other fault handlers when
  288. * userspace mapping @secs doesn't exist.
  289. *
  290. * Add a WARN() to make sure @secs is already valid userspace pointer
  291. * from caller (KVM), who should already have handled invalid pointer
  292. * case (for instance, made by malicious guest). All other checks,
  293. * such as alignment of @secs, are deferred to ENCLS itself.
  294. */
  295. if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
  296. return -EINVAL;
  297. __uaccess_begin();
  298. ret = __ecreate(pageinfo, (void *)secs);
  299. __uaccess_end();
  300. if (encls_faulted(ret)) {
  301. *trapnr = ENCLS_TRAPNR(ret);
  302. return -EFAULT;
  303. }
  304. /* ECREATE doesn't return an error code, it faults or succeeds. */
  305. WARN_ON_ONCE(ret);
  306. return 0;
  307. }
  308. EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
  309. static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
  310. void __user *secs)
  311. {
  312. int ret;
  313. /*
  314. * Make sure all userspace pointers from caller (KVM) are valid.
  315. * All other checks deferred to ENCLS itself. Also see comment
  316. * for @secs in sgx_virt_ecreate().
  317. */
  318. #define SGX_EINITTOKEN_SIZE 304
  319. if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
  320. !access_ok(token, SGX_EINITTOKEN_SIZE) ||
  321. !access_ok(secs, PAGE_SIZE)))
  322. return -EINVAL;
  323. __uaccess_begin();
  324. ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
  325. __uaccess_end();
  326. return ret;
  327. }
  328. /**
  329. * sgx_virt_einit() - Run EINIT on behalf of guest
  330. * @sigstruct: Userspace pointer to SIGSTRUCT structure
  331. * @token: Userspace pointer to EINITTOKEN structure
  332. * @secs: Userspace pointer to SECS page
  333. * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
  334. * @trapnr: trap number injected to guest in case of EINIT error
  335. *
  336. * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
  337. * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
  338. * needs to update hardware values to guest's virtual MSR values in order to
  339. * ensure EINIT is executed with expected hardware values.
  340. *
  341. * Return:
  342. * - 0: EINIT was successful.
  343. * - <0: on error.
  344. */
  345. int sgx_virt_einit(void __user *sigstruct, void __user *token,
  346. void __user *secs, u64 *lepubkeyhash, int *trapnr)
  347. {
  348. int ret;
  349. if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
  350. ret = __sgx_virt_einit(sigstruct, token, secs);
  351. } else {
  352. preempt_disable();
  353. sgx_update_lepubkeyhash(lepubkeyhash);
  354. ret = __sgx_virt_einit(sigstruct, token, secs);
  355. preempt_enable();
  356. }
  357. /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
  358. if (ret == -EINVAL)
  359. return ret;
  360. if (encls_faulted(ret)) {
  361. *trapnr = ENCLS_TRAPNR(ret);
  362. return -EFAULT;
  363. }
  364. return ret;
  365. }
  366. EXPORT_SYMBOL_GPL(sgx_virt_einit);