vfio_main.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * VFIO core
  4. *
  5. * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  6. * Author: Alex Williamson <[email protected]>
  7. *
  8. * Derived from original vfio:
  9. * Copyright 2010 Cisco Systems, Inc. All rights reserved.
  10. * Author: Tom Lyon, [email protected]
  11. */
  12. #include <linux/cdev.h>
  13. #include <linux/compat.h>
  14. #include <linux/device.h>
  15. #include <linux/file.h>
  16. #include <linux/anon_inodes.h>
  17. #include <linux/fs.h>
  18. #include <linux/idr.h>
  19. #include <linux/iommu.h>
  20. #include <linux/list.h>
  21. #include <linux/miscdevice.h>
  22. #include <linux/module.h>
  23. #include <linux/mutex.h>
  24. #include <linux/pci.h>
  25. #include <linux/rwsem.h>
  26. #include <linux/sched.h>
  27. #include <linux/slab.h>
  28. #include <linux/stat.h>
  29. #include <linux/string.h>
  30. #include <linux/uaccess.h>
  31. #include <linux/vfio.h>
  32. #include <linux/wait.h>
  33. #include <linux/sched/signal.h>
  34. #include <linux/pm_runtime.h>
  35. #include <linux/interval_tree.h>
  36. #include <linux/iova_bitmap.h>
  37. #include "vfio.h"
  38. #define DRIVER_VERSION "0.3"
  39. #define DRIVER_AUTHOR "Alex Williamson <[email protected]>"
  40. #define DRIVER_DESC "VFIO - User Level meta-driver"
  41. static struct vfio {
  42. struct class *class;
  43. struct list_head group_list;
  44. struct mutex group_lock; /* locks group_list */
  45. struct ida group_ida;
  46. dev_t group_devt;
  47. struct class *device_class;
  48. struct ida device_ida;
  49. } vfio;
  50. static DEFINE_XARRAY(vfio_device_set_xa);
  51. static const struct file_operations vfio_group_fops;
  52. int vfio_assign_device_set(struct vfio_device *device, void *set_id)
  53. {
  54. unsigned long idx = (unsigned long)set_id;
  55. struct vfio_device_set *new_dev_set;
  56. struct vfio_device_set *dev_set;
  57. if (WARN_ON(!set_id))
  58. return -EINVAL;
  59. /*
  60. * Atomically acquire a singleton object in the xarray for this set_id
  61. */
  62. xa_lock(&vfio_device_set_xa);
  63. dev_set = xa_load(&vfio_device_set_xa, idx);
  64. if (dev_set)
  65. goto found_get_ref;
  66. xa_unlock(&vfio_device_set_xa);
  67. new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
  68. if (!new_dev_set)
  69. return -ENOMEM;
  70. mutex_init(&new_dev_set->lock);
  71. INIT_LIST_HEAD(&new_dev_set->device_list);
  72. new_dev_set->set_id = set_id;
  73. xa_lock(&vfio_device_set_xa);
  74. dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
  75. GFP_KERNEL);
  76. if (!dev_set) {
  77. dev_set = new_dev_set;
  78. goto found_get_ref;
  79. }
  80. kfree(new_dev_set);
  81. if (xa_is_err(dev_set)) {
  82. xa_unlock(&vfio_device_set_xa);
  83. return xa_err(dev_set);
  84. }
  85. found_get_ref:
  86. dev_set->device_count++;
  87. xa_unlock(&vfio_device_set_xa);
  88. mutex_lock(&dev_set->lock);
  89. device->dev_set = dev_set;
  90. list_add_tail(&device->dev_set_list, &dev_set->device_list);
  91. mutex_unlock(&dev_set->lock);
  92. return 0;
  93. }
  94. EXPORT_SYMBOL_GPL(vfio_assign_device_set);
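/*
 * Illustrative sketch, not part of this file: a PCI variant driver that
 * wants every function affected by the same reset to share one dev_set can
 * pass a stable pointer describing that scope as the set_id from its
 * ->init callback.  The "example_" names below are hypothetical; drivers
 * that never call this get a singleton set from __vfio_register_dev().
 */
static int example_pci_init(struct vfio_device *vdev)
{
	struct pci_dev *pdev = to_pci_dev(vdev->dev);

	/* Devices passing the same set_id pointer land in the same dev_set */
	if (pdev->slot)
		return vfio_assign_device_set(vdev, pdev->slot);
	return vfio_assign_device_set(vdev, pdev->bus);
}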
  95. static void vfio_release_device_set(struct vfio_device *device)
  96. {
  97. struct vfio_device_set *dev_set = device->dev_set;
  98. if (!dev_set)
  99. return;
  100. mutex_lock(&dev_set->lock);
  101. list_del(&device->dev_set_list);
  102. mutex_unlock(&dev_set->lock);
  103. xa_lock(&vfio_device_set_xa);
  104. if (!--dev_set->device_count) {
  105. __xa_erase(&vfio_device_set_xa,
  106. (unsigned long)dev_set->set_id);
  107. mutex_destroy(&dev_set->lock);
  108. kfree(dev_set);
  109. }
  110. xa_unlock(&vfio_device_set_xa);
  111. }
  112. unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
  113. {
  114. struct vfio_device *cur;
  115. unsigned int open_count = 0;
  116. lockdep_assert_held(&dev_set->lock);
  117. list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
  118. open_count += cur->open_count;
  119. return open_count;
  120. }
  121. EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
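/*
 * Illustrative sketch, not part of this file: a driver deciding whether a
 * dev_set-wide reset is safe would check, under dev_set->lock, that nothing
 * else in the set is currently open.  "example_" is a hypothetical name.
 */
static bool example_set_idle_except_self(struct vfio_device *vdev)
{
	lockdep_assert_held(&vdev->dev_set->lock);

	/* Only the caller's own open_device() keeps the count at one */
	return vfio_device_set_open_count(vdev->dev_set) <= 1;
}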
  122. /*
  123. * Group objects - create, release, get, put, search
  124. */
  125. static struct vfio_group *
  126. __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
  127. {
  128. struct vfio_group *group;
  129. /*
  130. * group->iommu_group from the vfio.group_list cannot be NULL
  131. * under the vfio.group_lock.
  132. */
  133. list_for_each_entry(group, &vfio.group_list, vfio_next) {
  134. if (group->iommu_group == iommu_group) {
  135. refcount_inc(&group->drivers);
  136. return group;
  137. }
  138. }
  139. return NULL;
  140. }
  141. static struct vfio_group *
  142. vfio_group_get_from_iommu(struct iommu_group *iommu_group)
  143. {
  144. struct vfio_group *group;
  145. mutex_lock(&vfio.group_lock);
  146. group = __vfio_group_get_from_iommu(iommu_group);
  147. mutex_unlock(&vfio.group_lock);
  148. return group;
  149. }
  150. static void vfio_group_release(struct device *dev)
  151. {
  152. struct vfio_group *group = container_of(dev, struct vfio_group, dev);
  153. mutex_destroy(&group->device_lock);
  154. mutex_destroy(&group->group_lock);
  155. WARN_ON(group->iommu_group);
  156. ida_free(&vfio.group_ida, MINOR(group->dev.devt));
  157. kfree(group);
  158. }
  159. static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
  160. enum vfio_group_type type)
  161. {
  162. struct vfio_group *group;
  163. int minor;
  164. group = kzalloc(sizeof(*group), GFP_KERNEL);
  165. if (!group)
  166. return ERR_PTR(-ENOMEM);
  167. minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
  168. if (minor < 0) {
  169. kfree(group);
  170. return ERR_PTR(minor);
  171. }
  172. device_initialize(&group->dev);
  173. group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
  174. group->dev.class = vfio.class;
  175. group->dev.release = vfio_group_release;
  176. cdev_init(&group->cdev, &vfio_group_fops);
  177. group->cdev.owner = THIS_MODULE;
  178. refcount_set(&group->drivers, 1);
  179. mutex_init(&group->group_lock);
  180. INIT_LIST_HEAD(&group->device_list);
  181. mutex_init(&group->device_lock);
  182. group->iommu_group = iommu_group;
  183. /* put in vfio_group_release() */
  184. iommu_group_ref_get(iommu_group);
  185. group->type = type;
  186. BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
  187. return group;
  188. }
  189. static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
  190. enum vfio_group_type type)
  191. {
  192. struct vfio_group *group;
  193. struct vfio_group *ret;
  194. int err;
  195. group = vfio_group_alloc(iommu_group, type);
  196. if (IS_ERR(group))
  197. return group;
  198. err = dev_set_name(&group->dev, "%s%d",
  199. group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
  200. iommu_group_id(iommu_group));
  201. if (err) {
  202. ret = ERR_PTR(err);
  203. goto err_put;
  204. }
  205. mutex_lock(&vfio.group_lock);
  206. /* Did we race creating this group? */
  207. ret = __vfio_group_get_from_iommu(iommu_group);
  208. if (ret)
  209. goto err_unlock;
  210. err = cdev_device_add(&group->cdev, &group->dev);
  211. if (err) {
  212. ret = ERR_PTR(err);
  213. goto err_unlock;
  214. }
  215. list_add(&group->vfio_next, &vfio.group_list);
  216. mutex_unlock(&vfio.group_lock);
  217. return group;
  218. err_unlock:
  219. mutex_unlock(&vfio.group_lock);
  220. err_put:
  221. put_device(&group->dev);
  222. return ret;
  223. }
  224. static void vfio_device_remove_group(struct vfio_device *device)
  225. {
  226. struct vfio_group *group = device->group;
  227. struct iommu_group *iommu_group;
  228. if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
  229. iommu_group_remove_device(device->dev);
  230. /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */
  231. if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock))
  232. return;
  233. list_del(&group->vfio_next);
  234. /*
  235. * We could concurrently probe another driver in the group that might
  236. * race vfio_device_remove_group() with vfio_get_group(), so we have to
  237. * ensure that the sysfs is all cleaned up under the lock, otherwise
  238. * cdev_device_add() will fail due to the name already existing.
  239. */
  240. cdev_device_del(&group->cdev, &group->dev);
  241. mutex_lock(&group->group_lock);
  242. /*
  243. * These data structures all have paired operations that can only be
  244. * undone when the caller holds a live reference on the device. Since
  245. * all pairs must be undone these WARN_ON's indicate some caller did not
  246. * properly hold the group reference.
  247. */
  248. WARN_ON(!list_empty(&group->device_list));
  249. WARN_ON(group->notifier.head);
  250. /*
  251. * Revoke all users of group->iommu_group. At this point we know there
  252. * are no devices active because we are unplugging the last one. Setting
  253. * iommu_group to NULL blocks all new users.
  254. */
  255. if (group->container)
  256. vfio_group_detach_container(group);
  257. iommu_group = group->iommu_group;
  258. group->iommu_group = NULL;
  259. mutex_unlock(&group->group_lock);
  260. mutex_unlock(&vfio.group_lock);
  261. iommu_group_put(iommu_group);
  262. put_device(&group->dev);
  263. }
  264. /*
  265. * Device objects - create, release, get, put, search
  266. */
  267. /* Device reference always implies a group reference */
  268. static void vfio_device_put_registration(struct vfio_device *device)
  269. {
  270. if (refcount_dec_and_test(&device->refcount))
  271. complete(&device->comp);
  272. }
  273. static bool vfio_device_try_get_registration(struct vfio_device *device)
  274. {
  275. return refcount_inc_not_zero(&device->refcount);
  276. }
  277. static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
  278. struct device *dev)
  279. {
  280. struct vfio_device *device;
  281. mutex_lock(&group->device_lock);
  282. list_for_each_entry(device, &group->device_list, group_next) {
  283. if (device->dev == dev &&
  284. vfio_device_try_get_registration(device)) {
  285. mutex_unlock(&group->device_lock);
  286. return device;
  287. }
  288. }
  289. mutex_unlock(&group->device_lock);
  290. return NULL;
  291. }
  292. /*
  293. * VFIO driver API
  294. */
  295. /* Release helper called by vfio_put_device() */
  296. static void vfio_device_release(struct device *dev)
  297. {
  298. struct vfio_device *device =
  299. container_of(dev, struct vfio_device, device);
  300. vfio_release_device_set(device);
  301. ida_free(&vfio.device_ida, device->index);
  302. /*
  303. * kvfree() cannot be done here due to a life cycle mess in
  304. * vfio-ccw. Before the ccw part is fixed all drivers are
  305. * required to support @release and call vfio_free_device()
  306. * from there.
  307. */
  308. device->ops->release(device);
  309. }
  310. /*
  311. * Allocate and initialize vfio_device so it can be registered to vfio
  312. * core.
  313. *
  314. * Drivers should use the wrapper vfio_alloc_device() for allocation.
  315. * @size is the size of the structure to be allocated, including any
  316. * private data used by the driver.
  317. *
  318. * Driver may provide an @init callback to cover device private data.
  319. *
  320. * Use vfio_put_device() to release the structure after success return.
  321. */
  322. struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
  323. const struct vfio_device_ops *ops)
  324. {
  325. struct vfio_device *device;
  326. int ret;
  327. if (WARN_ON(size < sizeof(struct vfio_device)))
  328. return ERR_PTR(-EINVAL);
  329. device = kvzalloc(size, GFP_KERNEL);
  330. if (!device)
  331. return ERR_PTR(-ENOMEM);
  332. ret = vfio_init_device(device, dev, ops);
  333. if (ret)
  334. goto out_free;
  335. return device;
  336. out_free:
  337. kvfree(device);
  338. return ERR_PTR(ret);
  339. }
  340. EXPORT_SYMBOL_GPL(_vfio_alloc_device);
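/*
 * Illustrative sketch, not part of this file: a driver embeds struct
 * vfio_device in its own state and allocates it through the
 * vfio_alloc_device() wrapper declared in <linux/vfio.h>.  The "example_"
 * names are hypothetical.
 */
struct example_state {
	struct vfio_device vdev;	/* located via container_of() in the ops */
	void __iomem *regs;
};

static const struct vfio_device_ops example_ops = {
	.name = "example",
	.release = vfio_free_device,	/* no private state needs cleanup */
};

static int example_probe(struct device *dev)
{
	struct example_state *st;

	st = vfio_alloc_device(example_state, vdev, dev, &example_ops);
	if (IS_ERR(st))
		return PTR_ERR(st);
	return vfio_register_group_dev(&st->vdev);
}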
  341. /*
  342. * Initialize a vfio_device so it can be registered to vfio core.
  343. *
  344. * Only vfio-ccw driver should call this interface.
  345. */
  346. int vfio_init_device(struct vfio_device *device, struct device *dev,
  347. const struct vfio_device_ops *ops)
  348. {
  349. int ret;
  350. ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
  351. if (ret < 0) {
  352. dev_dbg(dev, "Failed to allocate index\n");
  353. return ret;
  354. }
  355. device->index = ret;
  356. init_completion(&device->comp);
  357. device->dev = dev;
  358. device->ops = ops;
  359. if (ops->init) {
  360. ret = ops->init(device);
  361. if (ret)
  362. goto out_uninit;
  363. }
  364. device_initialize(&device->device);
  365. device->device.release = vfio_device_release;
  366. device->device.class = vfio.device_class;
  367. device->device.parent = device->dev;
  368. return 0;
  369. out_uninit:
  370. vfio_release_device_set(device);
  371. ida_free(&vfio.device_ida, device->index);
  372. return ret;
  373. }
  374. EXPORT_SYMBOL_GPL(vfio_init_device);
  375. /*
  376. * The helper called by driver @release callback to free the device
  377. * structure. Drivers which don't have private data to clean can
  378. * simply use this helper as its @release.
  379. */
  380. void vfio_free_device(struct vfio_device *device)
  381. {
  382. kvfree(device);
  383. }
  384. EXPORT_SYMBOL_GPL(vfio_free_device);
  385. static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
  386. enum vfio_group_type type)
  387. {
  388. struct iommu_group *iommu_group;
  389. struct vfio_group *group;
  390. int ret;
  391. iommu_group = iommu_group_alloc();
  392. if (IS_ERR(iommu_group))
  393. return ERR_CAST(iommu_group);
  394. ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
  395. if (ret)
  396. goto out_put_group;
  397. ret = iommu_group_add_device(iommu_group, dev);
  398. if (ret)
  399. goto out_put_group;
  400. group = vfio_create_group(iommu_group, type);
  401. if (IS_ERR(group)) {
  402. ret = PTR_ERR(group);
  403. goto out_remove_device;
  404. }
  405. iommu_group_put(iommu_group);
  406. return group;
  407. out_remove_device:
  408. iommu_group_remove_device(dev);
  409. out_put_group:
  410. iommu_group_put(iommu_group);
  411. return ERR_PTR(ret);
  412. }
  413. static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
  414. {
  415. struct iommu_group *iommu_group;
  416. struct vfio_group *group;
  417. iommu_group = iommu_group_get(dev);
  418. if (!iommu_group && vfio_noiommu) {
  419. /*
  420. * With noiommu enabled, create an IOMMU group for devices that
  421. * don't already have one, implying no IOMMU hardware/driver
  422. * exists. Taint the kernel because we're about to give a DMA
  423. * capable device to a user without IOMMU protection.
  424. */
  425. group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
  426. if (!IS_ERR(group)) {
  427. add_taint(TAINT_USER, LOCKDEP_STILL_OK);
  428. dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
  429. }
  430. return group;
  431. }
  432. if (!iommu_group)
  433. return ERR_PTR(-EINVAL);
  434. /*
  435. * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
  436. * restore cache coherency. It has to be checked here because it is only
  437. * valid for cases where we are using iommu groups.
  438. */
  439. if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
  440. iommu_group_put(iommu_group);
  441. return ERR_PTR(-EINVAL);
  442. }
  443. group = vfio_group_get_from_iommu(iommu_group);
  444. if (!group)
  445. group = vfio_create_group(iommu_group, VFIO_IOMMU);
  446. /* The vfio_group holds a reference to the iommu_group */
  447. iommu_group_put(iommu_group);
  448. return group;
  449. }
  450. static int __vfio_register_dev(struct vfio_device *device,
  451. struct vfio_group *group)
  452. {
  453. struct vfio_device *existing_device;
  454. int ret;
  455. /*
  456. * In all cases group is the output of one of the group allocation
  457. * functions and we have group->drivers incremented for us.
  458. */
  459. if (IS_ERR(group))
  460. return PTR_ERR(group);
  461. /*
  462. * If the driver doesn't specify a set then the device is added to a
  463. * singleton set just for itself.
  464. */
  465. if (!device->dev_set)
  466. vfio_assign_device_set(device, device);
  467. existing_device = vfio_group_get_device(group, device->dev);
  468. if (existing_device) {
  469. /*
  470. * group->iommu_group is non-NULL because we hold the drivers
  471. * refcount.
  472. */
  473. dev_WARN(device->dev, "Device already exists on group %d\n",
  474. iommu_group_id(group->iommu_group));
  475. vfio_device_put_registration(existing_device);
  476. ret = -EBUSY;
  477. goto err_out;
  478. }
  479. /* Our reference on group is moved to the device */
  480. device->group = group;
  481. ret = dev_set_name(&device->device, "vfio%d", device->index);
  482. if (ret)
  483. goto err_out;
  484. ret = device_add(&device->device);
  485. if (ret)
  486. goto err_out;
  487. /* Refcounting can't start until the driver calls register */
  488. refcount_set(&device->refcount, 1);
  489. mutex_lock(&group->device_lock);
  490. list_add(&device->group_next, &group->device_list);
  491. mutex_unlock(&group->device_lock);
  492. return 0;
  493. err_out:
  494. vfio_device_remove_group(device);
  495. return ret;
  496. }
  497. int vfio_register_group_dev(struct vfio_device *device)
  498. {
  499. return __vfio_register_dev(device,
  500. vfio_group_find_or_alloc(device->dev));
  501. }
  502. EXPORT_SYMBOL_GPL(vfio_register_group_dev);
  503. /*
  504. * Register a virtual device without IOMMU backing. The user of this
  505. * device must not be able to directly trigger unmediated DMA.
  506. */
  507. int vfio_register_emulated_iommu_dev(struct vfio_device *device)
  508. {
  509. return __vfio_register_dev(device,
  510. vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
  511. }
  512. EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
  513. static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
  514. char *buf)
  515. {
  516. struct vfio_device *it, *device = ERR_PTR(-ENODEV);
  517. mutex_lock(&group->device_lock);
  518. list_for_each_entry(it, &group->device_list, group_next) {
  519. int ret;
  520. if (it->ops->match) {
  521. ret = it->ops->match(it, buf);
  522. if (ret < 0) {
  523. device = ERR_PTR(ret);
  524. break;
  525. }
  526. } else {
  527. ret = !strcmp(dev_name(it->dev), buf);
  528. }
  529. if (ret && vfio_device_try_get_registration(it)) {
  530. device = it;
  531. break;
  532. }
  533. }
  534. mutex_unlock(&group->device_lock);
  535. return device;
  536. }
  537. /*
  538. * Decrement the device reference count and wait for the device to be
  539. * removed. Open file descriptors for the device... */
  540. void vfio_unregister_group_dev(struct vfio_device *device)
  541. {
  542. struct vfio_group *group = device->group;
  543. unsigned int i = 0;
  544. bool interrupted = false;
  545. long rc;
  546. vfio_device_put_registration(device);
  547. rc = try_wait_for_completion(&device->comp);
  548. while (rc <= 0) {
  549. if (device->ops->request)
  550. device->ops->request(device, i++);
  551. if (interrupted) {
  552. rc = wait_for_completion_timeout(&device->comp,
  553. HZ * 10);
  554. } else {
  555. rc = wait_for_completion_interruptible_timeout(
  556. &device->comp, HZ * 10);
  557. if (rc < 0) {
  558. interrupted = true;
  559. dev_warn(device->dev,
  560. "Device is currently in use, task"
  561. " \"%s\" (%d) "
  562. "blocked until device is released",
  563. current->comm, task_pid_nr(current));
  564. }
  565. }
  566. }
  567. mutex_lock(&group->device_lock);
  568. list_del(&device->group_next);
  569. mutex_unlock(&group->device_lock);
  570. /* Balances device_add in register path */
  571. device_del(&device->device);
  572. vfio_device_remove_group(device);
  573. }
  574. EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
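/*
 * Illustrative sketch, not part of this file: the usual pairing of the
 * registration API in a driver's remove path.  vfio_put_device() is the
 * <linux/vfio.h> wrapper that drops the allocation reference taken by
 * vfio_alloc_device().
 */
static void example_remove(struct vfio_device *vdev)
{
	vfio_unregister_group_dev(vdev);	/* waits for open FDs to be released */
	vfio_put_device(vdev);			/* final put, ->release() frees it */
}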
  575. /*
  576. * VFIO Group fd, /dev/vfio/$GROUP
  577. */
  578. /*
  579. * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
  580. * if there was no container to unset. Since the ioctl is called on
  581. * the group, we know that still exists, therefore the only valid
  582. * transition here is 1->0.
  583. */
  584. static int vfio_group_ioctl_unset_container(struct vfio_group *group)
  585. {
  586. int ret = 0;
  587. mutex_lock(&group->group_lock);
  588. if (!group->container) {
  589. ret = -EINVAL;
  590. goto out_unlock;
  591. }
  592. if (group->container_users != 1) {
  593. ret = -EBUSY;
  594. goto out_unlock;
  595. }
  596. vfio_group_detach_container(group);
  597. out_unlock:
  598. mutex_unlock(&group->group_lock);
  599. return ret;
  600. }
  601. static int vfio_group_ioctl_set_container(struct vfio_group *group,
  602. int __user *arg)
  603. {
  604. struct vfio_container *container;
  605. struct fd f;
  606. int ret;
  607. int fd;
  608. if (get_user(fd, arg))
  609. return -EFAULT;
  610. f = fdget(fd);
  611. if (!f.file)
  612. return -EBADF;
  613. mutex_lock(&group->group_lock);
  614. if (group->container || WARN_ON(group->container_users)) {
  615. ret = -EINVAL;
  616. goto out_unlock;
  617. }
  618. if (!group->iommu_group) {
  619. ret = -ENODEV;
  620. goto out_unlock;
  621. }
  622. container = vfio_container_from_file(f.file);
  623. ret = -EINVAL;
  624. if (container) {
  625. ret = vfio_container_attach_group(container, group);
  626. goto out_unlock;
  627. }
  628. out_unlock:
  629. mutex_unlock(&group->group_lock);
  630. fdput(f);
  631. return ret;
  632. }
  633. static const struct file_operations vfio_device_fops;
  634. /* true if the vfio_device has open_device() called but not close_device() */
  635. bool vfio_assert_device_open(struct vfio_device *device)
  636. {
  637. return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
  638. }
  639. static struct file *vfio_device_open(struct vfio_device *device)
  640. {
  641. struct file *filep;
  642. int ret;
  643. mutex_lock(&device->group->group_lock);
  644. ret = vfio_device_assign_container(device);
  645. mutex_unlock(&device->group->group_lock);
  646. if (ret)
  647. return ERR_PTR(ret);
  648. if (!try_module_get(device->dev->driver->owner)) {
  649. ret = -ENODEV;
  650. goto err_unassign_container;
  651. }
  652. mutex_lock(&device->dev_set->lock);
  653. device->open_count++;
  654. if (device->open_count == 1) {
  655. /*
  656. * Here we pass the KVM pointer with the group under the read
  657. * lock. If the device driver will use it, it must obtain a
  658. * reference and release it during close_device.
  659. */
  660. mutex_lock(&device->group->group_lock);
  661. device->kvm = device->group->kvm;
  662. if (device->ops->open_device) {
  663. ret = device->ops->open_device(device);
  664. if (ret)
  665. goto err_undo_count;
  666. }
  667. vfio_device_container_register(device);
  668. mutex_unlock(&device->group->group_lock);
  669. }
  670. mutex_unlock(&device->dev_set->lock);
  671. /*
  672. * We can't use anon_inode_getfd() because we need to modify
  673. * the f_mode flags directly to allow more than just ioctls
  674. */
  675. filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
  676. device, O_RDWR);
  677. if (IS_ERR(filep)) {
  678. ret = PTR_ERR(filep);
  679. goto err_close_device;
  680. }
  681. /*
  682. * TODO: add an anon_inode interface to do this.
  683. * Appears to be missing by lack of need rather than
  684. * explicitly prevented. Now there's need.
  685. */
  686. filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
  687. if (device->group->type == VFIO_NO_IOMMU)
  688. dev_warn(device->dev, "vfio-noiommu device opened by user "
  689. "(%s:%d)\n", current->comm, task_pid_nr(current));
  690. /*
  691. * On success the ref of device is moved to the file and
  692. * put in vfio_device_fops_release()
  693. */
  694. return filep;
  695. err_close_device:
  696. mutex_lock(&device->dev_set->lock);
  697. mutex_lock(&device->group->group_lock);
  698. if (device->open_count == 1) {
  699. if (device->ops->close_device)
  700. device->ops->close_device(device);
  701. vfio_device_container_unregister(device);
  702. }
  703. err_undo_count:
  704. mutex_unlock(&device->group->group_lock);
  705. device->open_count--;
  706. if (device->open_count == 0 && device->kvm)
  707. device->kvm = NULL;
  708. mutex_unlock(&device->dev_set->lock);
  709. module_put(device->dev->driver->owner);
  710. err_unassign_container:
  711. vfio_device_unassign_container(device);
  712. return ERR_PTR(ret);
  713. }
  714. static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
  715. char __user *arg)
  716. {
  717. struct vfio_device *device;
  718. struct file *filep;
  719. char *buf;
  720. int fdno;
  721. int ret;
  722. buf = strndup_user(arg, PAGE_SIZE);
  723. if (IS_ERR(buf))
  724. return PTR_ERR(buf);
  725. device = vfio_device_get_from_name(group, buf);
  726. kfree(buf);
  727. if (IS_ERR(device))
  728. return PTR_ERR(device);
  729. fdno = get_unused_fd_flags(O_CLOEXEC);
  730. if (fdno < 0) {
  731. ret = fdno;
  732. goto err_put_device;
  733. }
  734. filep = vfio_device_open(device);
  735. if (IS_ERR(filep)) {
  736. ret = PTR_ERR(filep);
  737. goto err_put_fdno;
  738. }
  739. fd_install(fdno, filep);
  740. return fdno;
  741. err_put_fdno:
  742. put_unused_fd(fdno);
  743. err_put_device:
  744. vfio_device_put_registration(device);
  745. return ret;
  746. }
  747. static int vfio_group_ioctl_get_status(struct vfio_group *group,
  748. struct vfio_group_status __user *arg)
  749. {
  750. unsigned long minsz = offsetofend(struct vfio_group_status, flags);
  751. struct vfio_group_status status;
  752. if (copy_from_user(&status, arg, minsz))
  753. return -EFAULT;
  754. if (status.argsz < minsz)
  755. return -EINVAL;
  756. status.flags = 0;
  757. mutex_lock(&group->group_lock);
  758. if (!group->iommu_group) {
  759. mutex_unlock(&group->group_lock);
  760. return -ENODEV;
  761. }
  762. if (group->container)
  763. status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
  764. VFIO_GROUP_FLAGS_VIABLE;
  765. else if (!iommu_group_dma_owner_claimed(group->iommu_group))
  766. status.flags |= VFIO_GROUP_FLAGS_VIABLE;
  767. mutex_unlock(&group->group_lock);
  768. if (copy_to_user(arg, &status, minsz))
  769. return -EFAULT;
  770. return 0;
  771. }
  772. static long vfio_group_fops_unl_ioctl(struct file *filep,
  773. unsigned int cmd, unsigned long arg)
  774. {
  775. struct vfio_group *group = filep->private_data;
  776. void __user *uarg = (void __user *)arg;
  777. switch (cmd) {
  778. case VFIO_GROUP_GET_DEVICE_FD:
  779. return vfio_group_ioctl_get_device_fd(group, uarg);
  780. case VFIO_GROUP_GET_STATUS:
  781. return vfio_group_ioctl_get_status(group, uarg);
  782. case VFIO_GROUP_SET_CONTAINER:
  783. return vfio_group_ioctl_set_container(group, uarg);
  784. case VFIO_GROUP_UNSET_CONTAINER:
  785. return vfio_group_ioctl_unset_container(group);
  786. default:
  787. return -ENOTTY;
  788. }
  789. }
  790. static int vfio_group_fops_open(struct inode *inode, struct file *filep)
  791. {
  792. struct vfio_group *group =
  793. container_of(inode->i_cdev, struct vfio_group, cdev);
  794. int ret;
  795. mutex_lock(&group->group_lock);
  796. /*
  797. * drivers can be zero if this races with vfio_device_remove_group(); it
  798. * will be stable at 0 while group_lock is held
  799. */
  800. if (refcount_read(&group->drivers) == 0) {
  801. ret = -ENODEV;
  802. goto out_unlock;
  803. }
  804. if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
  805. ret = -EPERM;
  806. goto out_unlock;
  807. }
  808. /*
  809. * Do we need multiple instances of the group open? Seems not.
  810. */
  811. if (group->opened_file) {
  812. ret = -EBUSY;
  813. goto out_unlock;
  814. }
  815. group->opened_file = filep;
  816. filep->private_data = group;
  817. ret = 0;
  818. out_unlock:
  819. mutex_unlock(&group->group_lock);
  820. return ret;
  821. }
  822. static int vfio_group_fops_release(struct inode *inode, struct file *filep)
  823. {
  824. struct vfio_group *group = filep->private_data;
  825. filep->private_data = NULL;
  826. mutex_lock(&group->group_lock);
  827. /*
  828. * Device FDs hold a group file reference, therefore the group release
  829. * is only called when there are no open devices.
  830. */
  831. WARN_ON(group->notifier.head);
  832. if (group->container)
  833. vfio_group_detach_container(group);
  834. group->opened_file = NULL;
  835. mutex_unlock(&group->group_lock);
  836. return 0;
  837. }
  838. static const struct file_operations vfio_group_fops = {
  839. .owner = THIS_MODULE,
  840. .unlocked_ioctl = vfio_group_fops_unl_ioctl,
  841. .compat_ioctl = compat_ptr_ioctl,
  842. .open = vfio_group_fops_open,
  843. .release = vfio_group_fops_release,
  844. };
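/*
 * Illustrative userspace sketch, not part of this file, of the sequence the
 * handlers above implement (see Documentation/driver-api/vfio.rst).  A group
 * is VIABLE only when every device in it is bound to a vfio driver; error
 * handling is omitted and the group number / BDF are examples:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	assert(status.flags & VFIO_GROUP_FLAGS_VIABLE);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */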
  845. /*
  846. * Wrapper around pm_runtime_resume_and_get().
  847. * Return error code on failure or 0 on success.
  848. */
  849. static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
  850. {
  851. struct device *dev = device->dev;
  852. if (dev->driver && dev->driver->pm) {
  853. int ret;
  854. ret = pm_runtime_resume_and_get(dev);
  855. if (ret) {
  856. dev_info_ratelimited(dev,
  857. "vfio: runtime resume failed %d\n", ret);
  858. return -EIO;
  859. }
  860. }
  861. return 0;
  862. }
  863. /*
  864. * Wrapper around pm_runtime_put().
  865. */
  866. static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
  867. {
  868. struct device *dev = device->dev;
  869. if (dev->driver && dev->driver->pm)
  870. pm_runtime_put(dev);
  871. }
  872. /*
  873. * VFIO Device fd
  874. */
  875. static int vfio_device_fops_release(struct inode *inode, struct file *filep)
  876. {
  877. struct vfio_device *device = filep->private_data;
  878. mutex_lock(&device->dev_set->lock);
  879. vfio_assert_device_open(device);
  880. mutex_lock(&device->group->group_lock);
  881. if (device->open_count == 1) {
  882. if (device->ops->close_device)
  883. device->ops->close_device(device);
  884. vfio_device_container_unregister(device);
  885. }
  886. mutex_unlock(&device->group->group_lock);
  887. device->open_count--;
  888. if (device->open_count == 0)
  889. device->kvm = NULL;
  890. mutex_unlock(&device->dev_set->lock);
  891. module_put(device->dev->driver->owner);
  892. vfio_device_unassign_container(device);
  893. vfio_device_put_registration(device);
  894. return 0;
  895. }
  896. /*
  897. * vfio_mig_get_next_state - Compute the next step in the FSM
  898. * @cur_fsm - The current state the device is in
  899. * @new_fsm - The target state to reach
  900. * @next_fsm - Pointer to the next step to get to new_fsm
  901. *
  902. * Return 0 upon success, otherwise -errno
  903. * Upon success the next step in the state progression between cur_fsm and
  904. * new_fsm will be set in next_fsm.
  905. *
  906. * This breaks down requests for combination transitions into smaller steps and
  907. * returns the next step to get to new_fsm. The function may need to be called
  908. * multiple times before reaching new_fsm.
  909. *
  910. */
  911. int vfio_mig_get_next_state(struct vfio_device *device,
  912. enum vfio_device_mig_state cur_fsm,
  913. enum vfio_device_mig_state new_fsm,
  914. enum vfio_device_mig_state *next_fsm)
  915. {
  916. enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
  917. /*
  918. * The coding in this table requires the driver to implement the
  919. * following FSM arcs:
  920. * RESUMING -> STOP
  921. * STOP -> RESUMING
  922. * STOP -> STOP_COPY
  923. * STOP_COPY -> STOP
  924. *
  925. * If P2P is supported then the driver must also implement these FSM
  926. * arcs:
  927. * RUNNING -> RUNNING_P2P
  928. * RUNNING_P2P -> RUNNING
  929. * RUNNING_P2P -> STOP
  930. * STOP -> RUNNING_P2P
  931. * Without P2P the driver must implement:
  932. * RUNNING -> STOP
  933. * STOP -> RUNNING
  934. *
  935. * The coding will step through multiple states for some combination
  936. * transitions; if all optional features are supported, this means the
  937. * following ones:
  938. * RESUMING -> STOP -> RUNNING_P2P
  939. * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
  940. * RESUMING -> STOP -> STOP_COPY
  941. * RUNNING -> RUNNING_P2P -> STOP
  942. * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
  943. * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
  944. * RUNNING_P2P -> STOP -> RESUMING
  945. * RUNNING_P2P -> STOP -> STOP_COPY
  946. * STOP -> RUNNING_P2P -> RUNNING
  947. * STOP_COPY -> STOP -> RESUMING
  948. * STOP_COPY -> STOP -> RUNNING_P2P
  949. * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
  950. */
  951. static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
  952. [VFIO_DEVICE_STATE_STOP] = {
  953. [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
  954. [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
  955. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
  956. [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
  957. [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
  958. [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
  959. },
  960. [VFIO_DEVICE_STATE_RUNNING] = {
  961. [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
  962. [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
  963. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
  964. [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
  965. [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
  966. [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
  967. },
  968. [VFIO_DEVICE_STATE_STOP_COPY] = {
  969. [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
  970. [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
  971. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
  972. [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
  973. [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
  974. [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
  975. },
  976. [VFIO_DEVICE_STATE_RESUMING] = {
  977. [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
  978. [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
  979. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
  980. [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
  981. [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
  982. [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
  983. },
  984. [VFIO_DEVICE_STATE_RUNNING_P2P] = {
  985. [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
  986. [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
  987. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
  988. [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
  989. [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
  990. [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
  991. },
  992. [VFIO_DEVICE_STATE_ERROR] = {
  993. [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
  994. [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
  995. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
  996. [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
  997. [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
  998. [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
  999. },
  1000. };
  1001. static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
  1002. [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
  1003. [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
  1004. [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
  1005. [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
  1006. [VFIO_DEVICE_STATE_RUNNING_P2P] =
  1007. VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
  1008. [VFIO_DEVICE_STATE_ERROR] = ~0U,
  1009. };
  1010. if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
  1011. (state_flags_table[cur_fsm] & device->migration_flags) !=
  1012. state_flags_table[cur_fsm]))
  1013. return -EINVAL;
  1014. if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
  1015. (state_flags_table[new_fsm] & device->migration_flags) !=
  1016. state_flags_table[new_fsm])
  1017. return -EINVAL;
  1018. /*
  1019. * Arcs touching optional and unsupported states are skipped over. The
  1020. * driver will instead see an arc from the original state to the next
  1021. * logical state, as per the above comment.
  1022. */
  1023. *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
  1024. while ((state_flags_table[*next_fsm] & device->migration_flags) !=
  1025. state_flags_table[*next_fsm])
  1026. *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
  1027. return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
  1028. }
  1029. EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
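/*
 * Illustrative sketch, not part of this file: a driver's
 * ->migration_set_state() typically loops over vfio_mig_get_next_state()
 * so that each iteration only has to implement one of the FSM arcs listed
 * above.  "struct example_mig" and its members are hypothetical.
 */
struct example_mig {
	struct vfio_device vdev;
	enum vfio_device_mig_state cur_state;
};

static struct file *
example_set_state(struct vfio_device *vdev, enum vfio_device_mig_state new_state)
{
	struct example_mig *mig = container_of(vdev, struct example_mig, vdev);
	enum vfio_device_mig_state next_state;
	struct file *filp = NULL;
	int ret;

	while (mig->cur_state != new_state) {
		ret = vfio_mig_get_next_state(vdev, mig->cur_state, new_state,
					      &next_state);
		if (ret)
			return ERR_PTR(ret);
		/*
		 * Device-specific handling of the single arc
		 * cur_state -> next_state goes here; the STOP_COPY and
		 * RESUMING arcs would set filp to the data transfer file.
		 */
		mig->cur_state = next_state;
	}
	return filp;	/* NULL, or the data transfer file for the last arc */
}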
  1030. /*
  1031. * Convert the driver's struct file into an FD number and return it to userspace
  1032. */
  1033. static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
  1034. struct vfio_device_feature_mig_state *mig)
  1035. {
  1036. int ret;
  1037. int fd;
  1038. fd = get_unused_fd_flags(O_CLOEXEC);
  1039. if (fd < 0) {
  1040. ret = fd;
  1041. goto out_fput;
  1042. }
  1043. mig->data_fd = fd;
  1044. if (copy_to_user(arg, mig, sizeof(*mig))) {
  1045. ret = -EFAULT;
  1046. goto out_put_unused;
  1047. }
  1048. fd_install(fd, filp);
  1049. return 0;
  1050. out_put_unused:
  1051. put_unused_fd(fd);
  1052. out_fput:
  1053. fput(filp);
  1054. return ret;
  1055. }
  1056. static int
  1057. vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
  1058. u32 flags, void __user *arg,
  1059. size_t argsz)
  1060. {
  1061. size_t minsz =
  1062. offsetofend(struct vfio_device_feature_mig_state, data_fd);
  1063. struct vfio_device_feature_mig_state mig;
  1064. struct file *filp = NULL;
  1065. int ret;
  1066. if (!device->mig_ops)
  1067. return -ENOTTY;
  1068. ret = vfio_check_feature(flags, argsz,
  1069. VFIO_DEVICE_FEATURE_SET |
  1070. VFIO_DEVICE_FEATURE_GET,
  1071. sizeof(mig));
  1072. if (ret != 1)
  1073. return ret;
  1074. if (copy_from_user(&mig, arg, minsz))
  1075. return -EFAULT;
  1076. if (flags & VFIO_DEVICE_FEATURE_GET) {
  1077. enum vfio_device_mig_state curr_state;
  1078. ret = device->mig_ops->migration_get_state(device,
  1079. &curr_state);
  1080. if (ret)
  1081. return ret;
  1082. mig.device_state = curr_state;
  1083. goto out_copy;
  1084. }
  1085. /* Handle the VFIO_DEVICE_FEATURE_SET */
  1086. filp = device->mig_ops->migration_set_state(device, mig.device_state);
  1087. if (IS_ERR(filp) || !filp)
  1088. goto out_copy;
  1089. return vfio_ioct_mig_return_fd(filp, arg, &mig);
  1090. out_copy:
  1091. mig.data_fd = -1;
  1092. if (copy_to_user(arg, &mig, sizeof(mig)))
  1093. return -EFAULT;
  1094. if (IS_ERR(filp))
  1095. return PTR_ERR(filp);
  1096. return 0;
  1097. }
  1098. static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
  1099. u32 flags, void __user *arg,
  1100. size_t argsz)
  1101. {
  1102. struct vfio_device_feature_migration mig = {
  1103. .flags = device->migration_flags,
  1104. };
  1105. int ret;
  1106. if (!device->mig_ops)
  1107. return -ENOTTY;
  1108. ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
  1109. sizeof(mig));
  1110. if (ret != 1)
  1111. return ret;
  1112. if (copy_to_user(arg, &mig, sizeof(mig)))
  1113. return -EFAULT;
  1114. return 0;
  1115. }
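/*
 * Illustrative userspace sketch, not part of this file: probing which
 * migration flags a device advertises through the handler above.
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_migration mig;
 *	} q = {
 *		.hdr.argsz = sizeof(q),
 *		.hdr.flags = VFIO_DEVICE_FEATURE_GET |
 *			     VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, &q))
 *		printf("migration flags: 0x%llx\n", q.mig.flags);
 */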
  1116. /* Ranges should fit into a single kernel page */
  1117. #define LOG_MAX_RANGES \
  1118. (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
  1119. static int
  1120. vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
  1121. u32 flags, void __user *arg,
  1122. size_t argsz)
  1123. {
  1124. size_t minsz =
  1125. offsetofend(struct vfio_device_feature_dma_logging_control,
  1126. ranges);
  1127. struct vfio_device_feature_dma_logging_range __user *ranges;
  1128. struct vfio_device_feature_dma_logging_control control;
  1129. struct vfio_device_feature_dma_logging_range range;
  1130. struct rb_root_cached root = RB_ROOT_CACHED;
  1131. struct interval_tree_node *nodes;
  1132. u64 iova_end;
  1133. u32 nnodes;
  1134. int i, ret;
  1135. if (!device->log_ops)
  1136. return -ENOTTY;
  1137. ret = vfio_check_feature(flags, argsz,
  1138. VFIO_DEVICE_FEATURE_SET,
  1139. sizeof(control));
  1140. if (ret != 1)
  1141. return ret;
  1142. if (copy_from_user(&control, arg, minsz))
  1143. return -EFAULT;
  1144. nnodes = control.num_ranges;
  1145. if (!nnodes)
  1146. return -EINVAL;
  1147. if (nnodes > LOG_MAX_RANGES)
  1148. return -E2BIG;
  1149. ranges = u64_to_user_ptr(control.ranges);
  1150. nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
  1151. GFP_KERNEL);
  1152. if (!nodes)
  1153. return -ENOMEM;
  1154. for (i = 0; i < nnodes; i++) {
  1155. if (copy_from_user(&range, &ranges[i], sizeof(range))) {
  1156. ret = -EFAULT;
  1157. goto end;
  1158. }
  1159. if (!IS_ALIGNED(range.iova, control.page_size) ||
  1160. !IS_ALIGNED(range.length, control.page_size)) {
  1161. ret = -EINVAL;
  1162. goto end;
  1163. }
  1164. if (check_add_overflow(range.iova, range.length, &iova_end) ||
  1165. iova_end > ULONG_MAX) {
  1166. ret = -EOVERFLOW;
  1167. goto end;
  1168. }
  1169. nodes[i].start = range.iova;
  1170. nodes[i].last = range.iova + range.length - 1;
  1171. if (interval_tree_iter_first(&root, nodes[i].start,
  1172. nodes[i].last)) {
  1173. /* Range overlapping */
  1174. ret = -EINVAL;
  1175. goto end;
  1176. }
  1177. interval_tree_insert(nodes + i, &root);
  1178. }
  1179. ret = device->log_ops->log_start(device, &root, nnodes,
  1180. &control.page_size);
  1181. if (ret)
  1182. goto end;
  1183. if (copy_to_user(arg, &control, sizeof(control))) {
  1184. ret = -EFAULT;
  1185. device->log_ops->log_stop(device);
  1186. }
  1187. end:
  1188. kfree(nodes);
  1189. return ret;
  1190. }
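/*
 * Illustrative userspace sketch, not part of this file: starting dirty
 * tracking over a single IOVA range with the handler above.  Error handling
 * is omitted and "ram_iova"/"ram_size"/"device_fd" are placeholders.
 *
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = ram_iova,
 *		.length = ram_size,
 *	};
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_dma_logging_control ctrl;
 *	} start = {
 *		.hdr.argsz = sizeof(start),
 *		.hdr.flags = VFIO_DEVICE_FEATURE_SET |
 *			     VFIO_DEVICE_FEATURE_DMA_LOGGING_START,
 *		.ctrl.page_size = 4096,
 *		.ctrl.num_ranges = 1,
 *		.ctrl.ranges = (uintptr_t)&range,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &start);
 */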
  1191. static int
  1192. vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
  1193. u32 flags, void __user *arg,
  1194. size_t argsz)
  1195. {
  1196. int ret;
  1197. if (!device->log_ops)
  1198. return -ENOTTY;
  1199. ret = vfio_check_feature(flags, argsz,
  1200. VFIO_DEVICE_FEATURE_SET, 0);
  1201. if (ret != 1)
  1202. return ret;
  1203. return device->log_ops->log_stop(device);
  1204. }
  1205. static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
  1206. unsigned long iova, size_t length,
  1207. void *opaque)
  1208. {
  1209. struct vfio_device *device = opaque;
  1210. return device->log_ops->log_read_and_clear(device, iova, length, iter);
  1211. }
  1212. static int
  1213. vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
  1214. u32 flags, void __user *arg,
  1215. size_t argsz)
  1216. {
  1217. size_t minsz =
  1218. offsetofend(struct vfio_device_feature_dma_logging_report,
  1219. bitmap);
  1220. struct vfio_device_feature_dma_logging_report report;
  1221. struct iova_bitmap *iter;
  1222. u64 iova_end;
  1223. int ret;
  1224. if (!device->log_ops)
  1225. return -ENOTTY;
  1226. ret = vfio_check_feature(flags, argsz,
  1227. VFIO_DEVICE_FEATURE_GET,
  1228. sizeof(report));
  1229. if (ret != 1)
  1230. return ret;
  1231. if (copy_from_user(&report, arg, minsz))
  1232. return -EFAULT;
  1233. if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
  1234. return -EINVAL;
  1235. if (check_add_overflow(report.iova, report.length, &iova_end) ||
  1236. iova_end > ULONG_MAX)
  1237. return -EOVERFLOW;
  1238. iter = iova_bitmap_alloc(report.iova, report.length,
  1239. report.page_size,
  1240. u64_to_user_ptr(report.bitmap));
  1241. if (IS_ERR(iter))
  1242. return PTR_ERR(iter);
  1243. ret = iova_bitmap_for_each(iter, device,
  1244. vfio_device_log_read_and_clear);
  1245. iova_bitmap_free(iter);
  1246. return ret;
  1247. }
  1248. static int vfio_ioctl_device_feature(struct vfio_device *device,
  1249. struct vfio_device_feature __user *arg)
  1250. {
  1251. size_t minsz = offsetofend(struct vfio_device_feature, flags);
  1252. struct vfio_device_feature feature;
  1253. if (copy_from_user(&feature, arg, minsz))
  1254. return -EFAULT;
  1255. if (feature.argsz < minsz)
  1256. return -EINVAL;
  1257. /* Check unknown flags */
  1258. if (feature.flags &
  1259. ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
  1260. VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
  1261. return -EINVAL;
  1262. /* GET & SET are mutually exclusive except with PROBE */
  1263. if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
  1264. (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
  1265. (feature.flags & VFIO_DEVICE_FEATURE_GET))
  1266. return -EINVAL;
  1267. switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
  1268. case VFIO_DEVICE_FEATURE_MIGRATION:
  1269. return vfio_ioctl_device_feature_migration(
  1270. device, feature.flags, arg->data,
  1271. feature.argsz - minsz);
  1272. case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
  1273. return vfio_ioctl_device_feature_mig_device_state(
  1274. device, feature.flags, arg->data,
  1275. feature.argsz - minsz);
  1276. case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
  1277. return vfio_ioctl_device_feature_logging_start(
  1278. device, feature.flags, arg->data,
  1279. feature.argsz - minsz);
  1280. case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
  1281. return vfio_ioctl_device_feature_logging_stop(
  1282. device, feature.flags, arg->data,
  1283. feature.argsz - minsz);
  1284. case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
  1285. return vfio_ioctl_device_feature_logging_report(
  1286. device, feature.flags, arg->data,
  1287. feature.argsz - minsz);
  1288. default:
  1289. if (unlikely(!device->ops->device_feature))
  1290. return -EINVAL;
  1291. return device->ops->device_feature(device, feature.flags,
  1292. arg->data,
  1293. feature.argsz - minsz);
  1294. }
  1295. }
  1296. static long vfio_device_fops_unl_ioctl(struct file *filep,
  1297. unsigned int cmd, unsigned long arg)
  1298. {
  1299. struct vfio_device *device = filep->private_data;
  1300. int ret;
  1301. ret = vfio_device_pm_runtime_get(device);
  1302. if (ret)
  1303. return ret;
  1304. switch (cmd) {
  1305. case VFIO_DEVICE_FEATURE:
  1306. ret = vfio_ioctl_device_feature(device, (void __user *)arg);
  1307. break;
  1308. default:
  1309. if (unlikely(!device->ops->ioctl))
  1310. ret = -EINVAL;
  1311. else
  1312. ret = device->ops->ioctl(device, cmd, arg);
  1313. break;
  1314. }
  1315. vfio_device_pm_runtime_put(device);
  1316. return ret;
  1317. }
  1318. static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
  1319. size_t count, loff_t *ppos)
  1320. {
  1321. struct vfio_device *device = filep->private_data;
  1322. if (unlikely(!device->ops->read))
  1323. return -EINVAL;
  1324. return device->ops->read(device, buf, count, ppos);
  1325. }
  1326. static ssize_t vfio_device_fops_write(struct file *filep,
  1327. const char __user *buf,
  1328. size_t count, loff_t *ppos)
  1329. {
  1330. struct vfio_device *device = filep->private_data;
  1331. if (unlikely(!device->ops->write))
  1332. return -EINVAL;
  1333. return device->ops->write(device, buf, count, ppos);
  1334. }
  1335. static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
  1336. {
  1337. struct vfio_device *device = filep->private_data;
  1338. if (unlikely(!device->ops->mmap))
  1339. return -EINVAL;
  1340. return device->ops->mmap(device, vma);
  1341. }
  1342. static const struct file_operations vfio_device_fops = {
  1343. .owner = THIS_MODULE,
  1344. .release = vfio_device_fops_release,
  1345. .read = vfio_device_fops_read,
  1346. .write = vfio_device_fops_write,
  1347. .unlocked_ioctl = vfio_device_fops_unl_ioctl,
  1348. .compat_ioctl = compat_ptr_ioctl,
  1349. .mmap = vfio_device_fops_mmap,
  1350. };
  1351. /**
  1352. * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
  1353. * @file: VFIO group file
  1354. *
  1355. * The returned iommu_group is valid as long as a ref is held on the file. This
  1356. * returns a reference on the group. This function is deprecated; only the SPAPR
  1357. * path in kvm should call it.
  1358. */
  1359. struct iommu_group *vfio_file_iommu_group(struct file *file)
  1360. {
  1361. struct vfio_group *group = file->private_data;
  1362. struct iommu_group *iommu_group = NULL;
  1363. if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU))
  1364. return NULL;
  1365. if (!vfio_file_is_group(file))
  1366. return NULL;
  1367. mutex_lock(&group->group_lock);
  1368. if (group->iommu_group) {
  1369. iommu_group = group->iommu_group;
  1370. iommu_group_ref_get(iommu_group);
  1371. }
  1372. mutex_unlock(&group->group_lock);
  1373. return iommu_group;
  1374. }
  1375. EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
  1376. /**
  1377. * vfio_file_is_group - True if the file is usable with VFIO APIs
  1378. * @file: VFIO group file
  1379. */
  1380. bool vfio_file_is_group(struct file *file)
  1381. {
  1382. return file->f_op == &vfio_group_fops;
  1383. }
  1384. EXPORT_SYMBOL_GPL(vfio_file_is_group);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_group *group = file->private_data;
	bool ret;

	if (!vfio_file_is_group(file))
		return true;

	mutex_lock(&group->group_lock);
	if (group->container) {
		ret = vfio_container_ioctl_check_extension(group->container,
							   VFIO_DMA_CC_IOMMU);
	} else {
		/*
		 * Since the coherency state is determined only once a container
		 * is attached the user must do so before they can prove they
		 * have permission.
		 */
		ret = true;
	}
	mutex_unlock(&group->group_lock);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
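
/*
 * Illustrative sketch (not part of the original file): a consumer such as a
 * hypervisor module could use vfio_file_enforced_coherent() to decide whether
 * its guest needs special handling for non-coherent DMA. The helper named
 * below is hypothetical and only shows the intended calling pattern (file and
 * kvm come from the caller's context):
 *
 *	if (!vfio_file_enforced_coherent(file))
 *		example_handle_noncoherent_dma(kvm);	// hypothetical helper
 */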

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the group.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = file->private_data;

	if (!vfio_file_is_group(file))
		return;

	mutex_lock(&group->group_lock);
	group->kvm = kvm;
	mutex_unlock(&group->group_lock);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if the given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
	struct vfio_group *group = file->private_data;

	if (!vfio_file_is_group(file))
		return false;

	return group == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);

/*
 * Sub-module support
 */

/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate the buffer with an additional @size bytes, filling in @id and
 * @version of the new capability.  A pointer to the new capability is
 * returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
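
/*
 * Illustrative sketch (not part of the original file): a driver typically
 * builds the capability chain while servicing an info ioctl, then fixes up
 * the chained "next" offsets with vfio_info_cap_shift() before copying the
 * buffer out behind the user's argument structure. Identifiers prefixed with
 * "example_" are hypothetical; info, arg and ret come from the surrounding
 * ioctl handler:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *hdr;
 *
 *	hdr = vfio_info_cap_add(&caps, sizeof(struct example_cap),
 *				EXAMPLE_CAP_ID, 1);
 *	if (IS_ERR(hdr))
 *		return PTR_ERR(hdr);
 *	// ...fill in the capability payload that follows *hdr...
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *		kfree(caps.buf);
 *	}
 */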
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
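
/*
 * Append an already-initialized capability to the chain: reserve @size bytes
 * via vfio_info_cap_add() using @cap's id and version, then copy the payload
 * that follows @cap's header in after the newly added header.
 */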
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
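
/*
 * Validate a VFIO_DEVICE_SET_IRQS header from userspace: check argsz, index,
 * the start/count range and the flag combination, and report through
 * @data_size how many payload bytes (u8 bools or s32 eventfds, one per IRQ)
 * the caller should expect to follow the header. A returned *data_size of
 * zero means the request carries no payload.
 */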
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
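
/*
 * Illustrative sketch (not part of the original file) of how a driver's
 * VFIO_DEVICE_SET_IRQS handler might use the helper above; identifiers
 * prefixed with "example_" are hypothetical, and arg/ret come from the
 * surrounding ioctl handler:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, EXAMPLE_NUM_IRQS,
 *						 EXAMPLE_NUM_IRQ_TYPES,
 *						 &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */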

/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}
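
/*
 * Module initialization: set up the group and device IDA allocators and the
 * group list, initialize the container backend, create the "vfio" class
 * (group chardevs appear as /dev/vfio/$GROUP via vfio_devnode()) and the
 * "vfio-dev" class, and reserve a chrdev region for group devices.
 */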
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	ida_init(&vfio.device_ida);
	mutex_init(&vfio.group_lock);
	INIT_LIST_HEAD(&vfio.group_list);

	ret = vfio_container_init();
	if (ret)
		return ret;

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_group_class;
	}

	vfio.class->devnode = vfio_devnode;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_group_class:
	vfio_container_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

	ida_destroy(&vfio.device_ida);
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	class_destroy(vfio.class);
	vfio_container_cleanup();
	vfio.class = NULL;
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");