// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirt_ops implementation
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <[email protected]>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <[email protected]>
 */

#define pr_fmt(fmt) "kvm-guest: " fmt

#include <linux/context_tracking.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <linux/syscore_ops.h>
#include <linux/cc_platform.h>
#include <linux/efi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
#include <asm/e820/api.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);

static int kvmapf = 1;

static int __init parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}
early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;

static int __init parse_no_stealacc(char *arg)
{
	steal_acc = 0;
	return 0;
}
early_param("no-steal-acc", parse_no_stealacc);

static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
static int has_guest_poll = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
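
/*
 * Tasks waiting for an async #PF "page ready" notification are kept in a
 * small hash table indexed by hash_32(token, KVM_TASK_SLEEP_HASHBITS), so a
 * later wakeup can find the sleep node that was queued for the same token.
 */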
struct kvm_task_sleep_node {
	struct hlist_node link;
	struct swait_queue_head wq;
	u32 token;
	int cpu;
};

static struct kvm_task_sleep_head {
	raw_spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *e;

	raw_spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		raw_spin_unlock(&b->lock);
		kfree(e);
		return false;
	}

	n->token = token;
	n->cpu = smp_processor_id();
	init_swait_queue_head(&n->wq);
	hlist_add_head(&n->link, &b->list);
	raw_spin_unlock(&b->lock);
	return true;
}

/*
 * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
 * @token: Token to identify the sleep node entry
 *
 * Invoked from the async pagefault handling code or from the VM exit page
 * fault handler. In both cases RCU is watching.
 */
void kvm_async_pf_task_wait_schedule(u32 token)
{
	struct kvm_task_sleep_node n;
	DECLARE_SWAITQUEUE(wait);

	lockdep_assert_irqs_disabled();

	if (!kvm_async_pf_queue_task(token, &n))
		return;

	for (;;) {
		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		local_irq_enable();
		schedule();
		local_irq_disable();
	}
	finish_swait(&n.wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (swq_has_sleeper(&n->wq))
		swake_up_one(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		struct kvm_task_sleep_node *n;
		struct hlist_node *p, *next;

		raw_spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			n = hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		raw_spin_unlock(&b->lock);
	}
}

void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n, *dummy = NULL;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	raw_spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * Async #PF not yet handled, add a dummy entry for the token.
		 * Allocating the token must be done outside of the raw lock
		 * as the allocator is preemptible on PREEMPT_RT kernels.
		 */
		if (!dummy) {
			raw_spin_unlock(&b->lock);
			dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);

			/*
			 * Continue looping on allocation failure, eventually
			 * the async #PF will be handled and allocating a new
			 * node will be unnecessary.
			 */
			if (!dummy)
				cpu_relax();

			/*
			 * Recheck for async #PF completion before enqueueing
			 * the dummy token to avoid duplicate list entries.
			 */
			goto again;
		}
		dummy->token = token;
		dummy->cpu = smp_processor_id();
		init_swait_queue_head(&dummy->wq);
		hlist_add_head(&dummy->link, &b->list);
		dummy = NULL;
	} else {
		apf_task_wake_one(n);
	}
	raw_spin_unlock(&b->lock);

	/* A dummy token might be allocated and ultimately not used. */
	kfree(dummy);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
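
/*
 * Read and clear the async #PF flags in the per-CPU apf_reason area shared
 * with the host.  Marked noinstr because it is reached from the early
 * page-fault entry path, before normal instrumentation is allowed.
 */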
noinstr u32 kvm_read_and_reset_apf_flags(void)
{
	u32 flags = 0;

	if (__this_cpu_read(apf_reason.enabled)) {
		flags = __this_cpu_read(apf_reason.flags);
		__this_cpu_write(apf_reason.flags, 0);
	}

	return flags;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);

noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
	u32 flags = kvm_read_and_reset_apf_flags();
	irqentry_state_t state;

	if (!flags)
		return false;

	state = irqentry_enter(regs);
	instrumentation_begin();

	/*
	 * If the host managed to inject an async #PF into an interrupt
	 * disabled region, then die hard as this is not going to end well
	 * and the host side is seriously broken.
	 */
	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
		panic("Host injected async #PF in interrupt disabled region\n");

	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
		if (unlikely(!(user_mode(regs))))
			panic("Host injected async #PF in kernel mode\n");
		/* Page is swapped out by the host. */
		kvm_async_pf_task_wait_schedule(token);
	} else {
		WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
	}

	instrumentation_end();
	irqentry_exit(regs, state);
	return true;
}
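
/*
 * "Page ready" notifications arrive as an interrupt on
 * HYPERVISOR_CALLBACK_VECTOR.  Wake the task sleeping on the token and ack
 * the event via MSR_KVM_ASYNC_PF_ACK so the host can deliver the next one.
 */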
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
{
	struct pt_regs *old_regs = set_irq_regs(regs);
	u32 token;

	ack_APIC_irq();
	inc_irq_stat(irq_hv_callback_count);

	if (__this_cpu_read(apf_reason.enabled)) {
		token = __this_cpu_read(apf_reason.token);
		kvm_async_pf_task_wake(token);
		__this_cpu_write(apf_reason.token, 0);
		wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
	}

	set_irq_regs(old_regs);
}

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_ops.cpu.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
		 (unsigned long long) slow_virt_to_phys(st));
}

static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
	/**
	 * This relies on __test_and_clear_bit to modify the memory
	 * in a way that is atomic with respect to the local CPU.
	 * The hypervisor only accesses this memory from the local CPU so
	 * there's no need for lock or memory barriers.
	 * An optimization barrier is implied in apic write.
	 */
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
		return;
	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}

static void kvm_guest_cpu_init(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
		u64 pa;
		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));

		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
		pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;

		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

		wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
		__this_cpu_write(apf_reason.enabled, 1);
		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;

		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__this_cpu_write(kvm_apic_eoi, 0);
		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
			| KVM_MSR_ENABLED;
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
	if (!__this_cpu_read(apf_reason.enabled))
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__this_cpu_write(apf_reason.enabled, 0);

	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
}

static void kvm_disable_steal_time(void)
{
	if (!has_steal_clock)
		return;

	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
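
/*
 * steal_time.version is even when the record is consistent and odd while the
 * host is updating it.  Keep re-reading until the same even version is seen
 * before and after the steal value is read.
 */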
static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		virt_rmb();
		steal = src->steal;
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
	early_set_memory_decrypted((unsigned long) ptr, size);
}

/*
 * Iterate through all possible CPUs and map the memory region pointed
 * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
 *
 * Note: we iterate through all possible CPUs to ensure that CPUs
 * hotplugged will have their per-cpu variable already mapped as
 * decrypted.
 */
static void __init sev_map_percpu_data(void)
{
	int cpu;

	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
		return;

	for_each_possible_cpu(cpu) {
		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
	}
}
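
/*
 * Tear down every per-CPU PV feature (steal time, PV EOI, migration control,
 * async #PF, kvmclock) before the CPU goes away, so the host stops writing
 * into memory that is about to become invalid.  On anything but a full
 * shutdown, also wake tasks still waiting on async #PF tokens.
 */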
static void kvm_guest_cpu_offline(bool shutdown)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
		wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
	kvm_pv_disable_apf();
	if (!shutdown)
		apf_task_wake_all();
	kvmclock_disable();
}

static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_init();
	local_irq_restore(flags);
	return 0;
}

#ifdef CONFIG_SMP

static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);

static bool pv_tlb_flush_supported(void)
{
	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
		!boot_cpu_has(X86_FEATURE_MWAIT) &&
		(num_possible_cpus() != 1));
}

static bool pv_ipi_supported(void)
{
	return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
		(num_possible_cpus() != 1));
}

static bool pv_sched_yield_supported(void)
{
	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
		!boot_cpu_has(X86_FEATURE_MWAIT) &&
		(num_possible_cpus() != 1));
}
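
/*
 * KVM_HC_SEND_IPI takes a bitmap of APIC IDs expressed relative to 'min', so
 * at most KVM_IPI_CLUSTER_SIZE (128 on 64-bit) destinations fit into a single
 * hypercall.  __send_ipi_mask() batches the target CPUs into such windows and
 * issues one hypercall per window.
 */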
#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)

static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
	unsigned long flags;
	int cpu, apic_id, icr;
	int min = 0, max = 0;
#ifdef CONFIG_X86_64
	__uint128_t ipi_bitmap = 0;
#else
	u64 ipi_bitmap = 0;
#endif
	long ret;

	if (cpumask_empty(mask))
		return;

	local_irq_save(flags);

	switch (vector) {
	default:
		icr = APIC_DM_FIXED | vector;
		break;
	case NMI_VECTOR:
		icr = APIC_DM_NMI;
		break;
	}

	for_each_cpu(cpu, mask) {
		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
		if (!ipi_bitmap) {
			min = max = apic_id;
		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
			ipi_bitmap <<= min - apic_id;
			min = apic_id;
		} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
			max = apic_id < max ? max : apic_id;
		} else {
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
				  ret);
			min = max = apic_id;
			ipi_bitmap = 0;
		}
		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
	}

	if (ipi_bitmap) {
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
			  ret);
	}

	local_irq_restore(flags);
}

static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}

static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
	unsigned int this_cpu = smp_processor_id();
	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
	const struct cpumask *local_mask;

	cpumask_copy(new_mask, mask);
	cpumask_clear_cpu(this_cpu, new_mask);
	local_mask = new_mask;
	__send_ipi_mask(local_mask, vector);
}

static int __init setup_efi_kvm_sev_migration(void)
{
	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
	efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
	efi_status_t status;
	unsigned long size;
	bool enabled;

	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
	    !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
		return 0;

	if (!efi_enabled(EFI_BOOT))
		return 0;

	if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
		pr_info("%s : EFI runtime services are not enabled\n", __func__);
		return 0;
	}

	size = sizeof(enabled);

	/* Get variable contents into buffer */
	status = efi.get_variable(efi_sev_live_migration_enabled,
				  &efi_variable_guid, NULL, &size, &enabled);

	if (status == EFI_NOT_FOUND) {
		pr_info("%s : EFI live migration variable not found\n", __func__);
		return 0;
	}

	if (status != EFI_SUCCESS) {
		pr_info("%s : EFI variable retrieval failed\n", __func__);
		return 0;
	}

	if (enabled == 0) {
		pr_info("%s: live migration disabled in EFI\n", __func__);
		return 0;
	}

	pr_info("%s : live migration enabled in EFI\n", __func__);
	wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);

	return 1;
}
late_initcall(setup_efi_kvm_sev_migration);

/*
 * Set the IPI entry points
 */
static void kvm_setup_pv_ipi(void)
{
	apic->send_IPI_mask = kvm_send_ipi_mask;
	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
	pr_info("setup PV IPIs\n");
}

static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
{
	int cpu;

	native_send_call_func_ipi(mask);

	/* Make sure other vCPUs get a chance to run if they need to. */
	for_each_cpu(cpu, mask) {
		if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
			break;
		}
	}
}
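
/*
 * PV TLB flush: a vCPU that the host reports as preempted does not need an
 * IPI right now; set KVM_VCPU_FLUSH_TLB in its steal_time record so the host
 * flushes its TLB on the next VM entry, and drop it from the IPI mask.
 */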
static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
			const struct flush_tlb_info *info)
{
	u8 state;
	int cpu;
	struct kvm_steal_time *src;
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

	cpumask_copy(flushmask, cpumask);
	/*
	 * We have to call flush only on online vCPUs. And
	 * queue flush_on_enter for pre-empted vCPUs
	 */
	for_each_cpu(cpu, flushmask) {
		/*
		 * The local vCPU is never preempted, so we do not explicitly
		 * skip check for local vCPU - it will never be cleared from
		 * flushmask.
		 */
		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		if ((state & KVM_VCPU_PREEMPTED)) {
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}
	}

	native_flush_tlb_multi(flushmask, info);
}

static __init int kvm_alloc_cpumask(void)
{
	int cpu;

	if (!kvm_para_available() || nopv)
		return 0;

	if (pv_tlb_flush_supported() || pv_ipi_supported())
		for_each_possible_cpu(cpu) {
			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
				GFP_KERNEL, cpu_to_node(cpu));
		}

	return 0;
}
arch_initcall(kvm_alloc_cpumask);

static void __init kvm_smp_prepare_boot_cpu(void)
{
	/*
	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
	 * shares the guest physical address with the hypervisor.
	 */
	sev_map_percpu_data();

	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
	kvm_spinlock_init();
}

static int kvm_cpu_down_prepare(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_offline(false);
	local_irq_restore(flags);
	return 0;
}

#endif
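
/*
 * Syscore suspend/resume: take the PV features of this CPU down for suspend
 * and remember whether host halt polling had been disabled (bit 0 of
 * MSR_KVM_POLL_CONTROL clear), so kvm_resume() can restore that state after
 * re-enabling the PV features.
 */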
static int kvm_suspend(void)
{
	u64 val = 0;

	kvm_guest_cpu_offline(false);

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		rdmsrl(MSR_KVM_POLL_CONTROL, val);
	has_guest_poll = !(val & 1);
#endif
	return 0;
}

static void kvm_resume(void)
{
	kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
#endif
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static void kvm_pv_guest_cpu_reboot(void *unused)
{
	kvm_guest_cpu_offline(true);
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

/*
 * After a PV feature is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shutdown, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel,
 * this means a random memory location will keep being written to.
 */
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);
	native_machine_crash_shutdown(regs);
}
#endif
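
/*
 * vcpu_is_preempted() backend: a vCPU is considered preempted when the host
 * has set KVM_VCPU_PREEMPTED in the 'preempted' byte of that CPU's steal_time
 * record.  A plain C version is used on 32-bit and !SMP builds, a hand-written
 * assembly thunk on 64-bit SMP (see below).
 */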
#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)

bool __kvm_vcpu_is_preempted(long cpu);

__visible bool __kvm_vcpu_is_preempted(long cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

	return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

#else

#include <asm/asm-offsets.h>

extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

/*
 * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
 * restoring to/from the stack.
 */
asm(
".pushsection .text;"
".global __raw_callee_save___kvm_vcpu_is_preempted;"
".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
"__raw_callee_save___kvm_vcpu_is_preempted:"
ASM_ENDBR
"movq __per_cpu_offset(,%rdi,8), %rax;"
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
"setne %al;"
ASM_RET
".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
".popsection");

#endif

static void __init kvm_guest_init(void)
{
	int i;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		raw_spin_lock_init(&async_pf_sleepers[i].lock);

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		static_call_update(pv_steal_clock, kvm_steal_clock);

		pv_ops.lock.vcpu_is_preempted =
			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_set_eoi_write(kvm_guest_apic_eoi_write);

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
		static_branch_enable(&kvm_async_pf_enabled);
		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
	}

#ifdef CONFIG_SMP
	if (pv_tlb_flush_supported()) {
		pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
		pr_info("KVM setup pv remote TLB flush\n");
	}

	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	if (pv_sched_yield_supported()) {
		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
		pr_info("setup PV sched yield\n");
	}
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("failed to install cpu hotplug callbacks\n");
#else
	sev_map_percpu_data();
	kvm_guest_cpu_init();
#endif

#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif

	register_syscore_ops(&kvm_syscore_ops);

	/*
	 * Hard lockup detection is enabled by default. Disable it, as guests
	 * can get false positives too easily, for example if the host is
	 * overcommitted.
	 */
	hardlockup_detector_disable();
}

static noinline uint32_t __kvm_cpuid_base(void)
{
	if (boot_cpu_data.cpuid_level < 0)
		return 0;	/* So we don't blow up on old processors */

	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return hypervisor_cpuid_base(KVM_SIGNATURE, 0);

	return 0;
}

static inline uint32_t kvm_cpuid_base(void)
{
	static int kvm_cpuid_base = -1;

	if (kvm_cpuid_base == -1)
		kvm_cpuid_base = __kvm_cpuid_base();

	return kvm_cpuid_base;
}

bool kvm_para_available(void)
{
	return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

unsigned int kvm_arch_para_features(void)
{
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}

unsigned int kvm_arch_para_hints(void)
{
	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
EXPORT_SYMBOL_GPL(kvm_arch_para_hints);

static uint32_t __init kvm_detect(void)
{
	return kvm_cpuid_base();
}

static void __init kvm_apic_init(void)
{
#ifdef CONFIG_SMP
	if (pv_ipi_supported())
		kvm_setup_pv_ipi();
#endif
}

static bool __init kvm_msi_ext_dest_id(void)
{
	return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
}

static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
{
	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
			   KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
}

static void __init kvm_init_platform(void)
{
	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
	    kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
		unsigned long nr_pages;
		int i;

		pv_ops.mmu.notify_page_enc_status_changed =
			kvm_sev_hc_page_enc_status;

		/*
		 * Reset the host's shared pages list related to kernel
		 * specific page encryption status settings before we load a
		 * new kernel by kexec. Reset the page encryption status
		 * during early boot instead of just before kexec to avoid SMP
		 * races during kvm_pv_guest_cpu_reboot().
		 * NOTE: We cannot reset the complete shared pages list
		 * here as we need to retain the UEFI/OVMF firmware
		 * specific settings.
		 */
		for (i = 0; i < e820_table->nr_entries; i++) {
			struct e820_entry *entry = &e820_table->entries[i];

			if (entry->type != E820_TYPE_RAM)
				continue;

			nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);

			kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
					   nr_pages,
					   KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
		}

		/*
		 * Ensure that _bss_decrypted section is marked as decrypted in the
		 * shared pages list.
		 */
		early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
						__end_bss_decrypted - __start_bss_decrypted, 0);

		/*
		 * If not booted using EFI, enable Live migration support.
		 */
		if (!efi_enabled(EFI_BOOT))
			wrmsrl(MSR_KVM_MIGRATION_CONTROL,
			       KVM_MIGRATION_READY);
	}
	kvmclock_init();
	x86_platform.apic_post_init = kvm_apic_init;
}

#if defined(CONFIG_AMD_MEM_ENCRYPT)
static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* RAX and CPL are already in the GHCB */
	ghcb_set_rbx(ghcb, regs->bx);
	ghcb_set_rcx(ghcb, regs->cx);
	ghcb_set_rdx(ghcb, regs->dx);
	ghcb_set_rsi(ghcb, regs->si);
}

static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* No checking of the return state needed */
	return true;
}
#endif

const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	.name = "KVM",
	.detect = kvm_detect,
	.type = X86_HYPER_KVM,
	.init.guest_late_init = kvm_guest_init,
	.init.x2apic_available = kvm_para_available,
	.init.msi_ext_dest_id = kvm_msi_ext_dest_id,
	.init.init_platform = kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
	.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
	.runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
#endif
};

static __init int activate_jump_labels(void)
{
	if (has_steal_clock) {
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
	}

	return 0;
}
arch_initcall(activate_jump_labels);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
	int apicid;
	unsigned long flags = 0;

	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}

#include <asm/qspinlock.h>
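
/*
 * Paravirt qspinlock wait callback: halt until the lock holder kicks this
 * vCPU via KVM_HC_KICK_CPU.  The lock byte is (re)checked with interrupts
 * disabled and safe_halt() re-enables them atomically with the halt, so a
 * wakeup cannot slip in between the check and the halt.
 */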
static void kvm_wait(u8 *ptr, u8 val)
{
	if (in_nmi())
		return;

	/*
	 * halt until it's our turn and kicked. Note that we do safe halt
	 * for irq enabled case to avoid hang when lock info is overwritten
	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
	 */
	if (irqs_disabled()) {
		if (READ_ONCE(*ptr) == val)
			halt();
	} else {
		local_irq_disable();

		/* safe_halt() will enable IRQ */
		if (READ_ONCE(*ptr) == val)
			safe_halt();
		else
			local_irq_enable();
	}
}

/*
 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 */
void __init kvm_spinlock_init(void)
{
	/*
	 * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
	 * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
	 * preferred over native qspinlock when vCPU is preempted.
	 */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
		pr_info("PV spinlocks disabled, no host support\n");
		return;
	}

	/*
	 * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
	 * are available.
	 */
	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
		pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
		goto out;
	}

	if (num_possible_cpus() == 1) {
		pr_info("PV spinlocks disabled, single CPU\n");
		goto out;
	}

	if (nopvspin) {
		pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
		goto out;
	}

	pr_info("PV spinlocks enabled\n");

	__pv_init_lock_hash();
	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
	pv_ops.lock.queued_spin_unlock =
		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
	pv_ops.lock.wait = kvm_wait;
	pv_ops.lock.kick = kvm_kick_cpu;

	/*
	 * When PV spinlock is enabled which is preferred over
	 * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
	 * Just disable it anyway.
	 */
out:
	static_branch_disable(&virt_spin_lock_key);
}

#endif	/* CONFIG_PARAVIRT_SPINLOCKS */
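
/*
 * cpuidle-haltpoll glue: when the guest starts polling in its own idle loop,
 * host-side halt polling is redundant, so it is turned off via
 * MSR_KVM_POLL_CONTROL (and turned back on once guest polling stops).
 */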
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL

static void kvm_disable_host_haltpoll(void *i)
{
	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}

static void kvm_enable_host_haltpoll(void *i)
{
	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
}

void arch_haltpoll_enable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
		pr_err_once("host does not support poll control\n");
		pr_err_once("host upgrade recommended\n");
		return;
	}

	/* Enabling guest halt poll disables host halt poll */
	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_enable);

void arch_haltpoll_disable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		return;

	/* Disabling guest halt poll re-enables host halt poll */
	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
#endif