// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static DEFINE_XARRAY(sgx_epc_address_space);
/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

static LIST_HEAD(sgx_dirty_page_list);
/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list and made available to the page allocator. SECS pages
 * that come before their child pages in the input list are left intact.
 *
 * Return 0 when sanitization was successful or the kthread was stopped, and
 * the number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	unsigned long left_dirty = 0;
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return 0;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized. Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
			left_dirty++;
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
	return left_dirty;
}
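
/*
 * Check whether the page has been accessed ("young") in any mm that maps the
 * enclave since the last scan. Returns true when the page has not been
 * accessed and is therefore a candidate for reclaim.
 */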
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}
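
/*
 * Zap the PTEs that map the page in every mm and mark the EPC page as blocked
 * with EBLOCK, so that no new TLB mappings to it can be created before it is
 * written out.
 */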
static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	int ret;

	sgx_zap_enclave_ptes(encl, addr);

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}
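
/*
 * Write the EPC page out with EWB, using temporary kernel mappings of the
 * backing page for the contents and of the PCMD page for the metadata.
 */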
static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
	pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
	set_page_dirty(backing->pcmd);
	set_page_dirty(backing->contents);

	kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
					      backing->pcmd_offset));
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

	return ret;
}
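
/*
 * Deliberately empty IPI callback: the interrupt itself is what matters, as it
 * forces logical CPUs executing inside the enclave to exit before EWB is
 * retried on the slow path below.
 */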
void sgx_ipi_cb(void *info)
{
}
/*
 * Swap the page out to regular memory after it has been transformed to the
 * blocked state with EBLOCK, which means that it can no longer be referenced
 * (no new TLB entries).
 *
 * The first attempt just tries to write the page, assuming that some other
 * thread has already reset the tracking of threads inside the enclave with
 * ETRACK and the previous thread count has been zeroed out. The second attempt
 * calls ETRACK before EWB. If that also fails, we kick all the HW threads out
 * and then do EWB, which is guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave. Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);

			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}
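
/*
 * Write the page back to its backing storage, detach it from the enclave page
 * and decrement the SECS child count. When the last child page of an
 * initialized enclave has been reclaimed, also write back and free the SECS
 * page.
 */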
static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					     &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}
/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip the pages that have
 * been accessed since the last scan and move them to the tail of the active
 * page pool so that the pages get scanned in LRU-like fashion.
 *
 * Batch-process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() already
 * reduces this somewhat with its three-stage EWB pipeline (EWB, ETRACK + EWB
 * and IPI + EWB), but not sufficiently. Reclaiming one page at a time would
 * also be problematic, as it would increase lock contention too much and halt
 * forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}
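
/*
 * Reclaim is worthwhile only when the number of free EPC pages is below
 * @watermark and there is something on the active page list to reclaim.
 */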
static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}
/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}
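
/*
 * Kernel thread that sanitizes EPC pages left over from kexec and then
 * reclaims pages whenever the free count drops below the high watermark.
 */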
static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}
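
/* Start ksgxd. Returns false if the kthread could not be created. */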
static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}
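
/*
 * Take the first page off the given node's free list, if any, clear its flags
 * and adjust the global free page count.
 */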
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}
/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
 * from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	An EPC page was available and has been borrowed by the caller.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	if (node_isset(nid_of_current, sgx_numa_mask)) {
		page = __sgx_alloc_epc_page_from_node(nid_of_current);
		if (page)
			return page;
	}

	/* Fall back to the non-local NUMA nodes: */
	while (true) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;

		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;
	}

	return ERR_PTR(-ENOMEM);
}
/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page: EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}
/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page: EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}
/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through the EPC sections and reserve a free EPC page for the caller.
 * When a page is no longer needed it must be released with
 * sgx_free_epc_page(). If @reclaim is set to true, pages are reclaimed
 * directly when none are free. No mm's can be locked when @reclaim is set to
 * true.
 *
 * Finally, wake up ksgxd when the number of free pages goes below the low
 * watermark before returning to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}
/**
 * sgx_free_epc_page() - Free an EPC page
 * @page: an EPC page
 *
 * Put the EPC page back to the list of free pages. It's the caller's
 * responsibility to make sure that the page is in uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
	atomic_long_inc(&sgx_nr_free_pages);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}
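
/*
 * Tell the memory-failure code whether a physical address falls inside an EPC
 * section. EPC pages have no struct page, so the lookup goes through the
 * sgx_epc_address_space xarray instead.
 */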
bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);
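
/*
 * Look up the struct sgx_epc_page backing a physical address, or NULL when the
 * address does not belong to any EPC section.
 */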
static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}
/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If poison was consumed synchronously, send a SIGBUS to
	 * the task. Hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}
/**
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
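/*
 * For example (illustrative values): low = 0x70200000 and high = 0x1 yield
 * 0x70200000 + (0x1 << 32) = 0x170200000.
 */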
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}
#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	/* Make all x86/ attributes invisible when SGX is not initialized: */
	if (nodes_empty(sgx_numa_mask))
		return 0;

	return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
	&dev_attr_sgx_total_bytes.attr,
	NULL,
};

const struct attribute_group arch_node_dev_group = {
	.name = "x86",
	.attrs = arch_node_dev_attrs,
	.is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
	struct node *node = node_devices[nid];
	int ret;

	ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
	if (ret)
		pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif
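
/*
 * Enumerate EPC sections from the SGX_CPUID leaf, map each one, put its pages
 * on the dirty list for sanitization, and bind each section to an online NUMA
 * node.
 */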
static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
			sgx_numa_nodes[nid].size = 0;

			/* Make SGX-specific node sysfs files visible: */
			arch_update_sysfs_visibility(nid);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
		sgx_numa_nodes[nid].size += size;

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	return true;
}
/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver needs to update them to the hash of the enclave's
 * signer before EINIT. KVM needs to update them to the guest's virtual MSR
 * values before doing EINIT from the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}
const struct file_operations sgx_provision_fops = {
	.owner = THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};
/**
 * sgx_set_attribute() - Update allowed attributes given file descriptor
 * @allowed_attributes:	Pointer to allowed enclave attributes
 * @attribute_fd:	File descriptor for specific attribute
 *
 * Append the enclave attribute indicated by the file descriptor to the allowed
 * attributes. Currently only SGX_ATTR_PROVISIONKEY, indicated by
 * /dev/sgx_provision, is supported.
 *
 * Return:
 * - 0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 * - -EINVAL:	Invalid or unsupported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	struct file *file;

	file = fget(attribute_fd);
	if (!file)
		return -EINVAL;

	if (file->f_op != &sgx_provision_fops) {
		fput(file);
		return -EINVAL;
	}

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;

	fput(file);

	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);