vsock.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * vhost transport for vsock
  4. *
  5. * Copyright (C) 2013-2015 Red Hat, Inc.
  6. * Author: Asias He <asias@redhat.com>
  7. * Stefan Hajnoczi <stefanha@redhat.com>
  8. */
  9. #include <linux/miscdevice.h>
  10. #include <linux/atomic.h>
  11. #include <linux/module.h>
  12. #include <linux/mutex.h>
  13. #include <linux/vmalloc.h>
  14. #include <net/sock.h>
  15. #include <linux/virtio_vsock.h>
  16. #include <linux/vhost.h>
  17. #include <linux/hashtable.h>
  18. #include <net/af_vsock.h>
  19. #include "vhost.h"
  20. #define VHOST_VSOCK_DEFAULT_HOST_CID 2
  21. /* Max number of bytes transferred before requeueing the job.
  22. * Using this limit prevents one virtqueue from starving others. */
  23. #define VHOST_VSOCK_WEIGHT 0x80000
  24. /* Max number of packets transferred before requeueing the job.
  25. * Using this limit prevents one virtqueue from starving others with
  26. * small pkts.
  27. */
  28. #define VHOST_VSOCK_PKT_WEIGHT 256
  29. enum {
  30. VHOST_VSOCK_FEATURES = VHOST_FEATURES |
  31. (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
  32. (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
  33. };
  34. enum {
  35. VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
  36. };
  37. /* Used to track all the vhost_vsock instances on the system. */
  38. static DEFINE_MUTEX(vhost_vsock_mutex);
  39. static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
  40. struct vhost_vsock {
  41. struct vhost_dev dev;
  42. struct vhost_virtqueue vqs[2];
  43. /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
  44. struct hlist_node hash;
  45. struct vhost_work send_pkt_work;
  46. spinlock_t send_pkt_list_lock;
  47. struct list_head send_pkt_list; /* host->guest pending packets */
  48. atomic_t queued_replies;
  49. u32 guest_cid;
  50. bool seqpacket_allow;
  51. };
  52. static u32 vhost_transport_get_local_cid(void)
  53. {
  54. return VHOST_VSOCK_DEFAULT_HOST_CID;
  55. }
  56. /* Callers that dereference the return value must hold vhost_vsock_mutex or the
  57. * RCU read lock.
  58. */
  59. static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
  60. {
  61. struct vhost_vsock *vsock;
  62. hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
  63. u32 other_cid = vsock->guest_cid;
  64. /* Skip instances that have no CID yet */
  65. if (other_cid == 0)
  66. continue;
  67. if (other_cid == guest_cid)
  68. return vsock;
  69. }
  70. return NULL;
  71. }
  72. static void
  73. vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
  74. struct vhost_virtqueue *vq)
  75. {
  76. struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
  77. int pkts = 0, total_len = 0;
  78. bool added = false;
  79. bool restart_tx = false;
  80. mutex_lock(&vq->mutex);
  81. if (!vhost_vq_get_backend(vq))
  82. goto out;
  83. if (!vq_meta_prefetch(vq))
  84. goto out;
  85. /* Avoid further vmexits, we're already processing the virtqueue */
  86. vhost_disable_notify(&vsock->dev, vq);
  87. do {
  88. struct virtio_vsock_pkt *pkt;
  89. struct iov_iter iov_iter;
  90. unsigned out, in;
  91. size_t nbytes;
  92. size_t iov_len, payload_len;
  93. int head;
  94. u32 flags_to_restore = 0;
  95. spin_lock_bh(&vsock->send_pkt_list_lock);
  96. if (list_empty(&vsock->send_pkt_list)) {
  97. spin_unlock_bh(&vsock->send_pkt_list_lock);
  98. vhost_enable_notify(&vsock->dev, vq);
  99. break;
  100. }
  101. pkt = list_first_entry(&vsock->send_pkt_list,
  102. struct virtio_vsock_pkt, list);
  103. list_del_init(&pkt->list);
  104. spin_unlock_bh(&vsock->send_pkt_list_lock);
  105. head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  106. &out, &in, NULL, NULL);
  107. if (head < 0) {
  108. spin_lock_bh(&vsock->send_pkt_list_lock);
  109. list_add(&pkt->list, &vsock->send_pkt_list);
  110. spin_unlock_bh(&vsock->send_pkt_list_lock);
  111. break;
  112. }
  113. if (head == vq->num) {
  114. spin_lock_bh(&vsock->send_pkt_list_lock);
  115. list_add(&pkt->list, &vsock->send_pkt_list);
  116. spin_unlock_bh(&vsock->send_pkt_list_lock);
  117. /* We cannot finish yet if more buffers snuck in while
  118. * re-enabling notify.
  119. */
  120. if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
  121. vhost_disable_notify(&vsock->dev, vq);
  122. continue;
  123. }
  124. break;
  125. }
  126. if (out) {
  127. virtio_transport_free_pkt(pkt);
  128. vq_err(vq, "Expected 0 output buffers, got %u\n", out);
  129. break;
  130. }
  131. iov_len = iov_length(&vq->iov[out], in);
  132. if (iov_len < sizeof(pkt->hdr)) {
  133. virtio_transport_free_pkt(pkt);
  134. vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
  135. break;
  136. }
  137. iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
  138. payload_len = pkt->len - pkt->off;
  139. /* If the packet is greater than the space available in the
  140. * buffer, we split it using multiple buffers.
  141. */
  142. if (payload_len > iov_len - sizeof(pkt->hdr)) {
  143. payload_len = iov_len - sizeof(pkt->hdr);
  144. /* As we are copying pieces of large packet's buffer to
  145. * small rx buffers, headers of packets in rx queue are
  146. * created dynamically and are initialized with header
  147. * of current packet(except length). But in case of
  148. * SOCK_SEQPACKET, we also must clear message delimeter
  149. * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit
  150. * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise,
  151. * there will be sequence of packets with these
  152. * bits set. After initialized header will be copied to
  153. * rx buffer, these required bits will be restored.
  154. */
  155. if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
  156. pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
  157. flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;
  158. if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
  159. pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
  160. flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
  161. }
  162. }
  163. }
  164. /* Set the correct length in the header */
  165. pkt->hdr.len = cpu_to_le32(payload_len);
  166. nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
  167. if (nbytes != sizeof(pkt->hdr)) {
  168. virtio_transport_free_pkt(pkt);
  169. vq_err(vq, "Faulted on copying pkt hdr\n");
  170. break;
  171. }
  172. nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
  173. &iov_iter);
  174. if (nbytes != payload_len) {
  175. virtio_transport_free_pkt(pkt);
  176. vq_err(vq, "Faulted on copying pkt buf\n");
  177. break;
  178. }
  179. /* Deliver to monitoring devices all packets that we
  180. * will transmit.
  181. */
  182. virtio_transport_deliver_tap_pkt(pkt);
  183. vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
  184. added = true;
  185. pkt->off += payload_len;
  186. total_len += payload_len;
  187. /* If we didn't send all the payload we can requeue the packet
  188. * to send it with the next available buffer.
  189. */
  190. if (pkt->off < pkt->len) {
  191. pkt->hdr.flags |= cpu_to_le32(flags_to_restore);
  192. /* We are queueing the same virtio_vsock_pkt to handle
  193. * the remaining bytes, and we want to deliver it
  194. * to monitoring devices in the next iteration.
  195. */
  196. pkt->tap_delivered = false;
  197. spin_lock_bh(&vsock->send_pkt_list_lock);
  198. list_add(&pkt->list, &vsock->send_pkt_list);
  199. spin_unlock_bh(&vsock->send_pkt_list_lock);
  200. } else {
  201. if (pkt->reply) {
  202. int val;
  203. val = atomic_dec_return(&vsock->queued_replies);
  204. /* Do we have resources to resume tx
  205. * processing?
  206. */
  207. if (val + 1 == tx_vq->num)
  208. restart_tx = true;
  209. }
  210. virtio_transport_free_pkt(pkt);
  211. }
  212. } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
  213. if (added)
  214. vhost_signal(&vsock->dev, vq);
  215. out:
  216. mutex_unlock(&vq->mutex);
  217. if (restart_tx)
  218. vhost_poll_queue(&tx_vq->poll);
  219. }
  220. static void vhost_transport_send_pkt_work(struct vhost_work *work)
  221. {
  222. struct vhost_virtqueue *vq;
  223. struct vhost_vsock *vsock;
  224. vsock = container_of(work, struct vhost_vsock, send_pkt_work);
  225. vq = &vsock->vqs[VSOCK_VQ_RX];
  226. vhost_transport_do_send_pkt(vsock, vq);
  227. }
  228. static int
  229. vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt)
  230. {
  231. struct vhost_vsock *vsock;
  232. int len = pkt->len;
  233. rcu_read_lock();
  234. /* Find the vhost_vsock according to guest context id */
  235. vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid));
  236. if (!vsock) {
  237. rcu_read_unlock();
  238. virtio_transport_free_pkt(pkt);
  239. return -ENODEV;
  240. }
  241. if (pkt->reply)
  242. atomic_inc(&vsock->queued_replies);
  243. spin_lock_bh(&vsock->send_pkt_list_lock);
  244. list_add_tail(&pkt->list, &vsock->send_pkt_list);
  245. spin_unlock_bh(&vsock->send_pkt_list_lock);
  246. vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
  247. rcu_read_unlock();
  248. return len;
  249. }
  250. static int
  251. vhost_transport_cancel_pkt(struct vsock_sock *vsk)
  252. {
  253. struct vhost_vsock *vsock;
  254. struct virtio_vsock_pkt *pkt, *n;
  255. int cnt = 0;
  256. int ret = -ENODEV;
  257. LIST_HEAD(freeme);
  258. rcu_read_lock();
  259. /* Find the vhost_vsock according to guest context id */
  260. vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
  261. if (!vsock)
  262. goto out;
  263. spin_lock_bh(&vsock->send_pkt_list_lock);
  264. list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
  265. if (pkt->vsk != vsk)
  266. continue;
  267. list_move(&pkt->list, &freeme);
  268. }
  269. spin_unlock_bh(&vsock->send_pkt_list_lock);
  270. list_for_each_entry_safe(pkt, n, &freeme, list) {
  271. if (pkt->reply)
  272. cnt++;
  273. list_del(&pkt->list);
  274. virtio_transport_free_pkt(pkt);
  275. }
  276. if (cnt) {
  277. struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
  278. int new_cnt;
  279. new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
  280. if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
  281. vhost_poll_queue(&tx_vq->poll);
  282. }
  283. ret = 0;
  284. out:
  285. rcu_read_unlock();
  286. return ret;
  287. }
  288. static struct virtio_vsock_pkt *
  289. vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
  290. unsigned int out, unsigned int in)
  291. {
  292. struct virtio_vsock_pkt *pkt;
  293. struct iov_iter iov_iter;
  294. size_t nbytes;
  295. size_t len;
  296. if (in != 0) {
  297. vq_err(vq, "Expected 0 input buffers, got %u\n", in);
  298. return NULL;
  299. }
  300. pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
  301. if (!pkt)
  302. return NULL;
  303. len = iov_length(vq->iov, out);
  304. iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);
  305. nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
  306. if (nbytes != sizeof(pkt->hdr)) {
  307. vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
  308. sizeof(pkt->hdr), nbytes);
  309. kfree(pkt);
  310. return NULL;
  311. }
  312. pkt->len = le32_to_cpu(pkt->hdr.len);
  313. /* No payload */
  314. if (!pkt->len)
  315. return pkt;
  316. /* The pkt is too big */
  317. if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
  318. kfree(pkt);
  319. return NULL;
  320. }
  321. pkt->buf = kvmalloc(pkt->len, GFP_KERNEL);
  322. if (!pkt->buf) {
  323. kfree(pkt);
  324. return NULL;
  325. }
  326. pkt->buf_len = pkt->len;
  327. nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
  328. if (nbytes != pkt->len) {
  329. vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
  330. pkt->len, nbytes);
  331. virtio_transport_free_pkt(pkt);
  332. return NULL;
  333. }
  334. return pkt;
  335. }
  336. /* Is there space left for replies to rx packets? */
  337. static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
  338. {
  339. struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX];
  340. int val;
  341. smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
  342. val = atomic_read(&vsock->queued_replies);
  343. return val < vq->num;
  344. }
  345. static bool vhost_transport_seqpacket_allow(u32 remote_cid);
  346. static struct virtio_transport vhost_transport = {
  347. .transport = {
  348. .module = THIS_MODULE,
  349. .get_local_cid = vhost_transport_get_local_cid,
  350. .init = virtio_transport_do_socket_init,
  351. .destruct = virtio_transport_destruct,
  352. .release = virtio_transport_release,
  353. .connect = virtio_transport_connect,
  354. .shutdown = virtio_transport_shutdown,
  355. .cancel_pkt = vhost_transport_cancel_pkt,
  356. .dgram_enqueue = virtio_transport_dgram_enqueue,
  357. .dgram_dequeue = virtio_transport_dgram_dequeue,
  358. .dgram_bind = virtio_transport_dgram_bind,
  359. .dgram_allow = virtio_transport_dgram_allow,
  360. .stream_enqueue = virtio_transport_stream_enqueue,
  361. .stream_dequeue = virtio_transport_stream_dequeue,
  362. .stream_has_data = virtio_transport_stream_has_data,
  363. .stream_has_space = virtio_transport_stream_has_space,
  364. .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
  365. .stream_is_active = virtio_transport_stream_is_active,
  366. .stream_allow = virtio_transport_stream_allow,
  367. .seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
  368. .seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
  369. .seqpacket_allow = vhost_transport_seqpacket_allow,
  370. .seqpacket_has_data = virtio_transport_seqpacket_has_data,
  371. .notify_poll_in = virtio_transport_notify_poll_in,
  372. .notify_poll_out = virtio_transport_notify_poll_out,
  373. .notify_recv_init = virtio_transport_notify_recv_init,
  374. .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
  375. .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
  376. .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
  377. .notify_send_init = virtio_transport_notify_send_init,
  378. .notify_send_pre_block = virtio_transport_notify_send_pre_block,
  379. .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
  380. .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
  381. .notify_buffer_size = virtio_transport_notify_buffer_size,
  382. },
  383. .send_pkt = vhost_transport_send_pkt,
  384. };
  385. static bool vhost_transport_seqpacket_allow(u32 remote_cid)
  386. {
  387. struct vhost_vsock *vsock;
  388. bool seqpacket_allow = false;
  389. rcu_read_lock();
  390. vsock = vhost_vsock_get(remote_cid);
  391. if (vsock)
  392. seqpacket_allow = vsock->seqpacket_allow;
  393. rcu_read_unlock();
  394. return seqpacket_allow;
  395. }
  396. static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
  397. {
  398. struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
  399. poll.work);
  400. struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
  401. dev);
  402. struct virtio_vsock_pkt *pkt;
  403. int head, pkts = 0, total_len = 0;
  404. unsigned int out, in;
  405. bool added = false;
  406. mutex_lock(&vq->mutex);
  407. if (!vhost_vq_get_backend(vq))
  408. goto out;
  409. if (!vq_meta_prefetch(vq))
  410. goto out;
  411. vhost_disable_notify(&vsock->dev, vq);
  412. do {
  413. if (!vhost_vsock_more_replies(vsock)) {
  414. /* Stop tx until the device processes already
  415. * pending replies. Leave tx virtqueue
  416. * callbacks disabled.
  417. */
  418. goto no_more_replies;
  419. }
  420. head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  421. &out, &in, NULL, NULL);
  422. if (head < 0)
  423. break;
  424. if (head == vq->num) {
  425. if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
  426. vhost_disable_notify(&vsock->dev, vq);
  427. continue;
  428. }
  429. break;
  430. }
  431. pkt = vhost_vsock_alloc_pkt(vq, out, in);
  432. if (!pkt) {
  433. vq_err(vq, "Faulted on pkt\n");
  434. continue;
  435. }
  436. total_len += sizeof(pkt->hdr) + pkt->len;
  437. /* Deliver to monitoring devices all received packets */
  438. virtio_transport_deliver_tap_pkt(pkt);
  439. /* Only accept correctly addressed packets */
  440. if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid &&
  441. le64_to_cpu(pkt->hdr.dst_cid) ==
  442. vhost_transport_get_local_cid())
  443. virtio_transport_recv_pkt(&vhost_transport, pkt);
  444. else
  445. virtio_transport_free_pkt(pkt);
  446. vhost_add_used(vq, head, 0);
  447. added = true;
  448. } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
  449. no_more_replies:
  450. if (added)
  451. vhost_signal(&vsock->dev, vq);
  452. out:
  453. mutex_unlock(&vq->mutex);
  454. }
  455. static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
  456. {
  457. struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
  458. poll.work);
  459. struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
  460. dev);
  461. vhost_transport_do_send_pkt(vsock, vq);
  462. }
  463. static int vhost_vsock_start(struct vhost_vsock *vsock)
  464. {
  465. struct vhost_virtqueue *vq;
  466. size_t i;
  467. int ret;
  468. mutex_lock(&vsock->dev.mutex);
  469. ret = vhost_dev_check_owner(&vsock->dev);
  470. if (ret)
  471. goto err;
  472. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  473. vq = &vsock->vqs[i];
  474. mutex_lock(&vq->mutex);
  475. if (!vhost_vq_access_ok(vq)) {
  476. ret = -EFAULT;
  477. goto err_vq;
  478. }
  479. if (!vhost_vq_get_backend(vq)) {
  480. vhost_vq_set_backend(vq, vsock);
  481. ret = vhost_vq_init_access(vq);
  482. if (ret)
  483. goto err_vq;
  484. }
  485. mutex_unlock(&vq->mutex);
  486. }
  487. /* Some packets may have been queued before the device was started,
  488. * let's kick the send worker to send them.
  489. */
  490. vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
  491. mutex_unlock(&vsock->dev.mutex);
  492. return 0;
  493. err_vq:
  494. vhost_vq_set_backend(vq, NULL);
  495. mutex_unlock(&vq->mutex);
  496. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  497. vq = &vsock->vqs[i];
  498. mutex_lock(&vq->mutex);
  499. vhost_vq_set_backend(vq, NULL);
  500. mutex_unlock(&vq->mutex);
  501. }
  502. err:
  503. mutex_unlock(&vsock->dev.mutex);
  504. return ret;
  505. }
  506. static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
  507. {
  508. size_t i;
  509. int ret = 0;
  510. mutex_lock(&vsock->dev.mutex);
  511. if (check_owner) {
  512. ret = vhost_dev_check_owner(&vsock->dev);
  513. if (ret)
  514. goto err;
  515. }
  516. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  517. struct vhost_virtqueue *vq = &vsock->vqs[i];
  518. mutex_lock(&vq->mutex);
  519. vhost_vq_set_backend(vq, NULL);
  520. mutex_unlock(&vq->mutex);
  521. }
  522. err:
  523. mutex_unlock(&vsock->dev.mutex);
  524. return ret;
  525. }
  526. static void vhost_vsock_free(struct vhost_vsock *vsock)
  527. {
  528. kvfree(vsock);
  529. }
  530. static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
  531. {
  532. struct vhost_virtqueue **vqs;
  533. struct vhost_vsock *vsock;
  534. int ret;
  535. /* This struct is large and allocation could fail, fall back to vmalloc
  536. * if there is no other way.
  537. */
  538. vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
  539. if (!vsock)
  540. return -ENOMEM;
  541. vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL);
  542. if (!vqs) {
  543. ret = -ENOMEM;
  544. goto out;
  545. }
  546. vsock->guest_cid = 0; /* no CID assigned yet */
  547. atomic_set(&vsock->queued_replies, 0);
  548. vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
  549. vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
  550. vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
  551. vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;
  552. vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
  553. UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
  554. VHOST_VSOCK_WEIGHT, true, NULL);
  555. file->private_data = vsock;
  556. spin_lock_init(&vsock->send_pkt_list_lock);
  557. INIT_LIST_HEAD(&vsock->send_pkt_list);
  558. vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
  559. return 0;
  560. out:
  561. vhost_vsock_free(vsock);
  562. return ret;
  563. }
  564. static void vhost_vsock_flush(struct vhost_vsock *vsock)
  565. {
  566. vhost_dev_flush(&vsock->dev);
  567. }
  568. static void vhost_vsock_reset_orphans(struct sock *sk)
  569. {
  570. struct vsock_sock *vsk = vsock_sk(sk);
  571. /* vmci_transport.c doesn't take sk_lock here either. At least we're
  572. * under vsock_table_lock so the sock cannot disappear while we're
  573. * executing.
  574. */
  575. /* If the peer is still valid, no need to reset connection */
  576. if (vhost_vsock_get(vsk->remote_addr.svm_cid))
  577. return;
  578. /* If the close timeout is pending, let it expire. This avoids races
  579. * with the timeout callback.
  580. */
  581. if (vsk->close_work_scheduled)
  582. return;
  583. sock_set_flag(sk, SOCK_DONE);
  584. vsk->peer_shutdown = SHUTDOWN_MASK;
  585. sk->sk_state = SS_UNCONNECTED;
  586. sk->sk_err = ECONNRESET;
  587. sk_error_report(sk);
  588. }
  589. static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
  590. {
  591. struct vhost_vsock *vsock = file->private_data;
  592. mutex_lock(&vhost_vsock_mutex);
  593. if (vsock->guest_cid)
  594. hash_del_rcu(&vsock->hash);
  595. mutex_unlock(&vhost_vsock_mutex);
  596. /* Wait for other CPUs to finish using vsock */
  597. synchronize_rcu();
  598. /* Iterating over all connections for all CIDs to find orphans is
  599. * inefficient. Room for improvement here. */
  600. vsock_for_each_connected_socket(&vhost_transport.transport,
  601. vhost_vsock_reset_orphans);
  602. /* Don't check the owner, because we are in the release path, so we
  603. * need to stop the vsock device in any case.
  604. * vhost_vsock_stop() can not fail in this case, so we don't need to
  605. * check the return code.
  606. */
  607. vhost_vsock_stop(vsock, false);
  608. vhost_vsock_flush(vsock);
  609. vhost_dev_stop(&vsock->dev);
  610. spin_lock_bh(&vsock->send_pkt_list_lock);
  611. while (!list_empty(&vsock->send_pkt_list)) {
  612. struct virtio_vsock_pkt *pkt;
  613. pkt = list_first_entry(&vsock->send_pkt_list,
  614. struct virtio_vsock_pkt, list);
  615. list_del_init(&pkt->list);
  616. virtio_transport_free_pkt(pkt);
  617. }
  618. spin_unlock_bh(&vsock->send_pkt_list_lock);
  619. vhost_dev_cleanup(&vsock->dev);
  620. kfree(vsock->dev.vqs);
  621. vhost_vsock_free(vsock);
  622. return 0;
  623. }
  624. static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
  625. {
  626. struct vhost_vsock *other;
  627. /* Refuse reserved CIDs */
  628. if (guest_cid <= VMADDR_CID_HOST ||
  629. guest_cid == U32_MAX)
  630. return -EINVAL;
  631. /* 64-bit CIDs are not yet supported */
  632. if (guest_cid > U32_MAX)
  633. return -EINVAL;
  634. /* Refuse if CID is assigned to the guest->host transport (i.e. nested
  635. * VM), to make the loopback work.
  636. */
  637. if (vsock_find_cid(guest_cid))
  638. return -EADDRINUSE;
  639. /* Refuse if CID is already in use */
  640. mutex_lock(&vhost_vsock_mutex);
  641. other = vhost_vsock_get(guest_cid);
  642. if (other && other != vsock) {
  643. mutex_unlock(&vhost_vsock_mutex);
  644. return -EADDRINUSE;
  645. }
  646. if (vsock->guest_cid)
  647. hash_del_rcu(&vsock->hash);
  648. vsock->guest_cid = guest_cid;
  649. hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid);
  650. mutex_unlock(&vhost_vsock_mutex);
  651. return 0;
  652. }
  653. static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
  654. {
  655. struct vhost_virtqueue *vq;
  656. int i;
  657. if (features & ~VHOST_VSOCK_FEATURES)
  658. return -EOPNOTSUPP;
  659. mutex_lock(&vsock->dev.mutex);
  660. if ((features & (1 << VHOST_F_LOG_ALL)) &&
  661. !vhost_log_access_ok(&vsock->dev)) {
  662. goto err;
  663. }
  664. if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
  665. if (vhost_init_device_iotlb(&vsock->dev, true))
  666. goto err;
  667. }
  668. if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
  669. vsock->seqpacket_allow = true;
  670. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  671. vq = &vsock->vqs[i];
  672. mutex_lock(&vq->mutex);
  673. vq->acked_features = features;
  674. mutex_unlock(&vq->mutex);
  675. }
  676. mutex_unlock(&vsock->dev.mutex);
  677. return 0;
  678. err:
  679. mutex_unlock(&vsock->dev.mutex);
  680. return -EFAULT;
  681. }
  682. static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
  683. unsigned long arg)
  684. {
  685. struct vhost_vsock *vsock = f->private_data;
  686. void __user *argp = (void __user *)arg;
  687. u64 guest_cid;
  688. u64 features;
  689. int start;
  690. int r;
  691. switch (ioctl) {
  692. case VHOST_VSOCK_SET_GUEST_CID:
  693. if (copy_from_user(&guest_cid, argp, sizeof(guest_cid)))
  694. return -EFAULT;
  695. return vhost_vsock_set_cid(vsock, guest_cid);
  696. case VHOST_VSOCK_SET_RUNNING:
  697. if (copy_from_user(&start, argp, sizeof(start)))
  698. return -EFAULT;
  699. if (start)
  700. return vhost_vsock_start(vsock);
  701. else
  702. return vhost_vsock_stop(vsock, true);
  703. case VHOST_GET_FEATURES:
  704. features = VHOST_VSOCK_FEATURES;
  705. if (copy_to_user(argp, &features, sizeof(features)))
  706. return -EFAULT;
  707. return 0;
  708. case VHOST_SET_FEATURES:
  709. if (copy_from_user(&features, argp, sizeof(features)))
  710. return -EFAULT;
  711. return vhost_vsock_set_features(vsock, features);
  712. case VHOST_GET_BACKEND_FEATURES:
  713. features = VHOST_VSOCK_BACKEND_FEATURES;
  714. if (copy_to_user(argp, &features, sizeof(features)))
  715. return -EFAULT;
  716. return 0;
  717. case VHOST_SET_BACKEND_FEATURES:
  718. if (copy_from_user(&features, argp, sizeof(features)))
  719. return -EFAULT;
  720. if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
  721. return -EOPNOTSUPP;
  722. vhost_set_backend_features(&vsock->dev, features);
  723. return 0;
  724. default:
  725. mutex_lock(&vsock->dev.mutex);
  726. r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
  727. if (r == -ENOIOCTLCMD)
  728. r = vhost_vring_ioctl(&vsock->dev, ioctl, argp);
  729. else
  730. vhost_vsock_flush(vsock);
  731. mutex_unlock(&vsock->dev.mutex);
  732. return r;
  733. }
  734. }
  735. static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
  736. {
  737. struct file *file = iocb->ki_filp;
  738. struct vhost_vsock *vsock = file->private_data;
  739. struct vhost_dev *dev = &vsock->dev;
  740. int noblock = file->f_flags & O_NONBLOCK;
  741. return vhost_chr_read_iter(dev, to, noblock);
  742. }
  743. static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
  744. struct iov_iter *from)
  745. {
  746. struct file *file = iocb->ki_filp;
  747. struct vhost_vsock *vsock = file->private_data;
  748. struct vhost_dev *dev = &vsock->dev;
  749. return vhost_chr_write_iter(dev, from);
  750. }
  751. static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait)
  752. {
  753. struct vhost_vsock *vsock = file->private_data;
  754. struct vhost_dev *dev = &vsock->dev;
  755. return vhost_chr_poll(file, dev, wait);
  756. }
  757. static const struct file_operations vhost_vsock_fops = {
  758. .owner = THIS_MODULE,
  759. .open = vhost_vsock_dev_open,
  760. .release = vhost_vsock_dev_release,
  761. .llseek = noop_llseek,
  762. .unlocked_ioctl = vhost_vsock_dev_ioctl,
  763. .compat_ioctl = compat_ptr_ioctl,
  764. .read_iter = vhost_vsock_chr_read_iter,
  765. .write_iter = vhost_vsock_chr_write_iter,
  766. .poll = vhost_vsock_chr_poll,
  767. };
  768. static struct miscdevice vhost_vsock_misc = {
  769. .minor = VHOST_VSOCK_MINOR,
  770. .name = "vhost-vsock",
  771. .fops = &vhost_vsock_fops,
  772. };
  773. static int __init vhost_vsock_init(void)
  774. {
  775. int ret;
  776. ret = vsock_core_register(&vhost_transport.transport,
  777. VSOCK_TRANSPORT_F_H2G);
  778. if (ret < 0)
  779. return ret;
  780. ret = misc_register(&vhost_vsock_misc);
  781. if (ret) {
  782. vsock_core_unregister(&vhost_transport.transport);
  783. return ret;
  784. }
  785. return 0;
  786. };
  787. static void __exit vhost_vsock_exit(void)
  788. {
  789. misc_deregister(&vhost_vsock_misc);
  790. vsock_core_unregister(&vhost_transport.transport);
  791. };
  792. module_init(vhost_vsock_init);
  793. module_exit(vhost_vsock_exit);
  794. MODULE_LICENSE("GPL v2");
  795. MODULE_AUTHOR("Asias He");
  796. MODULE_DESCRIPTION("vhost transport for vsock ");
  797. MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR);
  798. MODULE_ALIAS("devname:vhost-vsock");