eventfd.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell. All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *	Gregory Haskins <[email protected]>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQFD

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
	return true;
}
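
/*
 * Workqueue handler: inject the interrupt for a signaled irqfd. A plain
 * irqfd gets an assert/de-assert pulse; a resampling irqfd is only
 * asserted here and stays asserted until irqfd_resampler_ack() runs.
 */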
static void
irqfd_inject(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	if (!irqfd->resampler) {
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
			    false);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
			    false);
	} else
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    irqfd->gsi, 1, false);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI. We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	struct kvm *kvm;
	struct kvm_kernel_irqfd *irqfd;
	int idx;

	resampler = container_of(kian,
				 struct kvm_kernel_irqfd_resampler, notifier);
	kvm = resampler->kvm;

	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		    resampler->notifier.gsi, 0, false);

	idx = srcu_read_lock(&kvm->irq_srcu);

	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
				 srcu_read_lock_held(&kvm->irq_srcu))
		eventfd_signal(irqfd->resamplefd, 1);

	srcu_read_unlock(&kvm->irq_srcu, idx);
}
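
/*
 * Detach an irqfd from its resampler. If this was the last irqfd using the
 * resampler, unregister the ack notifier, de-assert the line one last time
 * and free the resampler itself.
 */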
static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
	struct kvm *kvm = resampler->kvm;

	mutex_lock(&kvm->irqfds.resampler_lock);

	list_del_rcu(&irqfd->resampler_link);
	synchronize_srcu(&kvm->irq_srcu);

	if (list_empty(&resampler->list)) {
		list_del(&resampler->link);
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    resampler->notifier.gsi, 0, false);
		kfree(resampler);
	}

	mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, shutdown);
	struct kvm *kvm = irqfd->kvm;
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path. */
	synchronize_srcu(&kvm->irq_srcu);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed
	 */
	flush_work(&irqfd->inject);

	if (irqfd->resampler) {
		irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}

/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
	return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
				struct kvm_kernel_irq_routing_entry *irq,
				struct kvm *kvm, int irq_source_id,
				int level,
				bool line_status)
{
	return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(wait, struct kvm_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);
	struct kvm_kernel_irq_routing_entry irq;
	struct kvm *kvm = irqfd->kvm;
	unsigned seq;
	int idx;
	int ret = 0;

	if (flags & EPOLLIN) {
		u64 cnt;
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);

		idx = srcu_read_lock(&kvm->irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
			irq = irqfd->irq_entry;
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
		/* An event has been signaled, inject an interrupt */
		if (kvm_arch_set_irq_inatomic(&irq, kvm,
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
					      false) == -EWOULDBLOCK)
			schedule_work(&irqfd->inject);
		srcu_read_unlock(&kvm->irq_srcu, idx);
		ret = 1;
	}

	if (flags & EPOLLHUP) {
		/* The eventfd is closing, detach from KVM */
		unsigned long iflags;

		spin_lock_irqsave(&kvm->irqfds.lock, iflags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue. If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
	}

	return ret;
}
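
/*
 * poll callback: hook the irqfd's wait-queue entry onto the eventfd's
 * wait-queue so irqfd_wakeup() runs whenever the eventfd is signaled.
 */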
static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(pt, struct kvm_kernel_irqfd, pt);
	add_wait_queue_priority(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int n_entries;

	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

	write_seqcount_begin(&irqfd->irq_entry_sc);

	e = entries;
	if (n_entries == 1)
		irqfd->irq_entry = *e;
	else
		irqfd->irq_entry.type = 0;

	write_seqcount_end(&irqfd->irq_entry_sc);
}

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
				struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
				struct irq_bypass_consumer *cons)
{
}

int __attribute__((weak)) kvm_arch_update_irqfd_routing(
				struct kvm *kvm, unsigned int host_irq,
				uint32_t guest_irq, bool set)
{
	return 0;
}

bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
				struct kvm_kernel_irq_routing_entry *old,
				struct kvm_kernel_irq_routing_entry *new)
{
	return true;
}
#endif
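
/*
 * Bind an eventfd to a GSI: allocate the irqfd, optionally attach it to a
 * resampler for level-triggered emulation, hook into the eventfd's
 * wait-queue and, where supported, register an IRQ bypass consumer.
 */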
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct fd f;
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	int ret;
	__poll_t events;
	int idx;

	if (!kvm_arch_intc_initialized(kvm))
		return -EAGAIN;

	if (!kvm_arch_irqfd_allowed(kvm, args))
		return -EINVAL;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = args->gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

	f = fdget(args->fd);
	if (!f.file) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(f.file);
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->eventfd = eventfd;

	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
		struct kvm_kernel_irqfd_resampler *resampler;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->resamplefd = resamplefd;
		INIT_LIST_HEAD(&irqfd->resampler_link);

		mutex_lock(&kvm->irqfds.resampler_lock);

		list_for_each_entry(resampler,
				    &kvm->irqfds.resampler_list, link) {
			if (resampler->notifier.gsi == irqfd->gsi) {
				irqfd->resampler = resampler;
				break;
			}
		}

		if (!irqfd->resampler) {
			resampler = kzalloc(sizeof(*resampler),
					    GFP_KERNEL_ACCOUNT);
			if (!resampler) {
				ret = -ENOMEM;
				mutex_unlock(&kvm->irqfds.resampler_lock);
				goto fail;
			}

			resampler->kvm = kvm;
			INIT_LIST_HEAD(&resampler->list);
			resampler->notifier.gsi = irqfd->gsi;
			resampler->notifier.irq_acked = irqfd_resampler_ack;
			INIT_LIST_HEAD(&resampler->link);

			list_add(&resampler->link, &kvm->irqfds.resampler_list);
			kvm_register_irq_ack_notifier(kvm,
						      &resampler->notifier);
			irqfd->resampler = resampler;
		}

		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
		synchronize_srcu(&kvm->irq_srcu);

		mutex_unlock(&kvm->irqfds.resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

	spin_lock_irq(&kvm->irqfds.lock);

	ret = 0;
	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd != tmp->eventfd)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&kvm->irqfds.lock);
		goto fail;
	}

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_update(kvm, irqfd);

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(f.file, &irqfd->pt);

	if (events & EPOLLIN)
		schedule_work(&irqfd->inject);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	if (kvm_arch_has_irq_bypass()) {
		irqfd->consumer.token = (void *)irqfd->eventfd;
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
		ret = irq_bypass_register_consumer(&irqfd->consumer);
		if (ret)
			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
				irqfd->consumer.token, ret);
	}
#endif

	srcu_read_unlock(&kvm->irq_srcu, idx);

	/*
	 * do not drop the file until the irqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP
	 */
	fdput(f);
	return 0;

fail:
	if (irqfd->resampler)
		irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	fdput(f);

out:
	kfree(irqfd);
	return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	struct kvm_irq_ack_notifier *kian;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
					  link, srcu_read_lock_held(&kvm->irq_srcu))
			if (kian->gsi == gsi) {
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}

	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
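
/*
 * Run the registered ack notifiers for a GSI. Callers must hold the
 * kvm->irq_srcu read lock (see kvm_notify_acked_irq() below).
 */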
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
	struct kvm_irq_ack_notifier *kian;

	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
				  link, srcu_read_lock_held(&kvm->irq_srcu))
		if (kian->gsi == gsi)
			kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi, idx;

	trace_kvm_ack_irq(irqchip, pin);

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		kvm_notify_acked_gsi(kvm, gsi);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}
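
/*
 * Ack notifier registration: writers serialize on kvm->irq_lock, readers
 * walk the list under kvm->irq_srcu, so unregistration waits for an SRCU
 * grace period before the notifier can be freed by its owner.
 */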
void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
	synchronize_srcu(&kvm->irq_srcu);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
#endif
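
/*
 * Initialize per-VM eventfd state: the irqfd lists and locks (when irqfds
 * are supported) and the ioeventfd list.
 */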
void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQFD
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
	mutex_init(&kvm->irqfds.resampler_lock);
#endif
	INIT_LIST_HEAD(&kvm->ioeventfds);
}

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * Shut down any irqfds that match fd+gsi.
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
			/*
			 * This clearing of irq_entry.type is needed for when
			 * another thread calls kvm_irq_routing_update before
			 * we flush workqueue below (we synchronize with
			 * kvm_irq_routing_update using irqfds.lock).
			 */
			write_seqcount_begin(&irqfd->irq_entry_sc);
			irqfd->irq_entry.type = 0;
			write_seqcount_end(&irqfd->irq_entry_sc);
			irqfd_deactivate(irqfd);
		}
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}
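
/*
 * kvm_irqfd() backs the KVM_IRQFD ioctl. As a rough illustration only (not
 * part of this file; struct fields as defined in <linux/kvm.h>), userspace
 * typically does something like:
 *
 *	struct kvm_irqfd irqfd = {
 *		.fd  = eventfd(0, EFD_CLOEXEC),
 *		.gsi = gsi,
 *	};
 *	ioctl(vm_fd, KVM_IRQFD, &irqfd);
 *
 * after which writes to the eventfd inject an interrupt on that GSI, and
 * KVM_IRQFD_FLAG_DEASSIGN with the same fd/gsi tears the binding down.
 */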
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
		return -EINVAL;

	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, args);

	return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released. Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
		/* Under irqfds.lock, so can read irq_entry safely */
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

		irqfd_update(kvm, irqfd);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
		if (irqfd->producer &&
		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
			int ret = kvm_arch_update_irqfd_routing(
					irqfd->kvm, irqfd->producer->irq,
					irqfd->gsi, 1);
			WARN_ON(ret);
		}
#endif
	}

	spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void kvm_irqfd_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */
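
/*
 * One registered ioeventfd: the PIO/MMIO address being watched, the access
 * length (0 means "any length"), the eventfd to signal, and either a
 * datamatch value or the wildcard flag when any written value should fire.
 */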
struct _ioeventfd {
	struct list_head     list;
	u64                  addr;
	int                  length;
	struct eventfd_ctx  *eventfd;
	u64                  datamatch;
	struct kvm_io_device dev;
	u8                   bus_idx;
	bool                 wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (addr != p->addr)
		/* address must be precise for a hit */
		return false;

	if (!p->length)
		/* length = 0 means only look at the address, so always a hit */
		return true;

	if (len != p->length)
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
		int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd, 1);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down. We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write      = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->bus_idx == p->bus_idx &&
		    _p->addr == p->addr &&
		    (!_p->length || !p->length ||
		     (_p->length == p->length &&
		      (_p->wildcard || p->wildcard ||
		       _p->datamatch == p->datamatch))))
			return true;

	return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
	if (flags & KVM_IOEVENTFD_FLAG_PIO)
		return KVM_PIO_BUS;
	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
		return KVM_VIRTIO_CCW_NOTIFY_BUS;
	return KVM_MMIO_BUS;
}
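
/*
 * Register one ioeventfd on the given bus: take a reference on the
 * eventfd, reject collisions with an existing addr/len/datamatch
 * registration, and add the device to the bus under kvm->slots_lock.
 */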
static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
				    enum kvm_bus bus_idx,
				    struct kvm_ioeventfd *args)
{
	struct eventfd_ctx *eventfd;
	struct _ioeventfd *p;
	int ret;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr    = args->addr;
	p->bus_idx = bus_idx;
	p->length  = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	mutex_lock(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
				      &p->dev);
	if (ret < 0)
		goto unlock_fail;

	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
	list_add_tail(&p->list, &kvm->ioeventfds);

	mutex_unlock(&kvm->slots_lock);

	return 0;

unlock_fail:
	mutex_unlock(&kvm->slots_lock);

fail:
	kfree(p);
	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
			   struct kvm_ioeventfd *args)
{
	struct _ioeventfd *p, *tmp;
	struct eventfd_ctx *eventfd;
	struct kvm_io_bus *bus;
	int ret = -ENOENT;
	bool wildcard;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

	mutex_lock(&kvm->slots_lock);

	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
		if (p->bus_idx != bus_idx ||
		    p->eventfd != eventfd ||
		    p->addr != args->addr ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
		bus = kvm_get_bus(kvm, bus_idx);
		if (bus)
			bus->ioeventfd_count--;
		ioeventfd_release(p);
		ret = 0;
		break;
	}

	mutex_unlock(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

	if (!args->len && bus_idx == KVM_MMIO_BUS)
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

	return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx;
	int ret;

	bus_idx = ioeventfd_bus_from_flags(args->flags);
	/* must be natural-word sized, or 0 to ignore length */
	switch (args->len) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	/* ioeventfd with no length can't be combined with DATAMATCH */
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
		return -EINVAL;

	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
	if (ret)
		goto fail;

	/* When length is ignored, MMIO is also put on a separate bus, for
	 * faster lookups.
	 */
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
		if (ret < 0)
			goto fast_fail;
	}

	return 0;

fast_fail:
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
	return ret;
}
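
/*
 * kvm_ioeventfd() backs the KVM_IOEVENTFD ioctl. As a rough illustration
 * only (not part of this file; notify_gpa is a placeholder guest address,
 * struct fields as defined in <linux/kvm.h>), userspace might register a
 * doorbell like:
 *
 *	struct kvm_ioeventfd ioevent = {
 *		.addr = notify_gpa,
 *		.len  = 4,
 *		.fd   = eventfd(0, EFD_CLOEXEC),
 *	};
 *	ioctl(vm_fd, KVM_IOEVENTFD, &ioevent);
 *
 * A 4-byte guest write to notify_gpa then signals the eventfd instead of
 * exiting to userspace; KVM_IOEVENTFD_FLAG_DEASSIGN removes the binding.
 */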
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}