// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 *   K. Y. Srinivasan <kys@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/hyperv.h>
#include <linux/uio.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/io.h>
#include <asm/mshyperv.h>

#include "hyperv_vmbus.h"
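
/*
 * Each packet written to the ring buffer is followed by an 8-byte trailer
 * holding the "previous indices" value; see hv_get_ring_bufferindices()
 * and the end of hv_ringbuffer_write().
 */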
#define VMBUS_PKT_TRAILER	8

/*
 * When we write to the ring buffer, check if the host needs to
 * be signaled. Here are the details of this protocol:
 *
 *	1. The host guarantees that while it is draining the
 *	   ring buffer, it will set the interrupt_mask to
 *	   indicate it does not need to be interrupted when
 *	   new data is placed.
 *
 *	2. The host guarantees that it will completely drain
 *	   the ring buffer before exiting the read loop. Further,
 *	   once the ring buffer is empty, it will clear the
 *	   interrupt_mask and re-check to see if new data has
 *	   arrived.
 *
 * KYS: Oct. 30, 2016:
 * It looks like Windows hosts have logic to deal with DOS attacks that
 * can be triggered if it receives interrupts when it is not expecting
 * the interrupt. The host expects interrupts only when the ring
 * transitions from empty to non-empty (or full to non-full on the
 * guest-to-host ring).
 * So, base the signaling decision solely on the ring state until the
 * host logic is fixed.
 */
static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->outbound;
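
	/*
	 * Make sure the prior writes of the payload and of write_index are
	 * visible before interrupt_mask is read.
	 */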
	virt_mb();
	if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
		return;

	/* check interrupt_mask before read_index */
	virt_rmb();

	/*
	 * This is the only case we need to signal when the
	 * ring transitions from being empty to non-empty.
	 */
	if (old_write == READ_ONCE(rbi->ring_buffer->read_index)) {
		++channel->intr_out_empty;
		vmbus_setevent(channel);
	}
}

/* Get the next write location for the specified ring buffer. */
static inline u32
hv_get_next_write_location(struct hv_ring_buffer_info *ring_info)
{
	u32 next = ring_info->ring_buffer->write_index;

	return next;
}

/* Set the next write location for the specified ring buffer. */
static inline void
hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
			   u32 next_write_location)
{
	ring_info->ring_buffer->write_index = next_write_location;
}

/* Get the size of the ring buffer. */
static inline u32
hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
{
	return ring_info->ring_datasize;
}

/* Get the read and write indices as u64 of the specified ring buffer. */
static inline u64
hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info)
{
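	/*
	 * Only the write index is recorded, in the upper 32 bits; the lower
	 * 32 bits, where the read index would go, are left as zero. This
	 * value becomes the trailer appended after each packet written by
	 * hv_ringbuffer_write().
	 */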
	return (u64)ring_info->ring_buffer->write_index << 32;
}

/*
 * Helper routine to copy from source to ring buffer.
 * Assume there is enough room. Handles wrap-around in dest case only!!
 */
static u32 hv_copyto_ringbuffer(
	struct hv_ring_buffer_info	*ring_info,
	u32				start_write_offset,
	const void			*src,
	u32				srclen)
{
	void *ring_buffer = hv_get_ring_buffer(ring_info);
	u32 ring_buffer_size = hv_get_ring_buffersize(ring_info);

	memcpy(ring_buffer + start_write_offset, src, srclen);

	start_write_offset += srclen;
	if (start_write_offset >= ring_buffer_size)
		start_write_offset -= ring_buffer_size;

	return start_write_offset;
}

/*
 *
 * hv_get_ringbuffer_availbytes()
 *
 * Get number of bytes available to read and to write to
 * for the specified ring buffer
 */
static void
hv_get_ringbuffer_availbytes(const struct hv_ring_buffer_info *rbi,
			     u32 *read, u32 *write)
{
	u32 read_loc, write_loc, dsize;

	/* Capture the read/write indices before they change */
	read_loc = READ_ONCE(rbi->ring_buffer->read_index);
	write_loc = READ_ONCE(rbi->ring_buffer->write_index);
	dsize = rbi->ring_datasize;

	*write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
		read_loc - write_loc;
	*read = dsize - *write;
}

/* Get various debug metrics for the specified ring buffer. */
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
				struct hv_ring_buffer_debug_info *debug_info)
{
	u32 bytes_avail_towrite;
	u32 bytes_avail_toread;

	mutex_lock(&ring_info->ring_buffer_mutex);

	if (!ring_info->ring_buffer) {
		mutex_unlock(&ring_info->ring_buffer_mutex);
		return -EINVAL;
	}

	hv_get_ringbuffer_availbytes(ring_info,
				     &bytes_avail_toread,
				     &bytes_avail_towrite);
	debug_info->bytes_avail_toread = bytes_avail_toread;
	debug_info->bytes_avail_towrite = bytes_avail_towrite;
	debug_info->current_read_index = ring_info->ring_buffer->read_index;
	debug_info->current_write_index = ring_info->ring_buffer->write_index;
	debug_info->current_interrupt_mask
		= ring_info->ring_buffer->interrupt_mask;
	mutex_unlock(&ring_info->ring_buffer_mutex);

	return 0;
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_get_debuginfo);

/* Initialize a channel's ring buffer info mutex locks */
void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
{
	mutex_init(&channel->inbound.ring_buffer_mutex);
	mutex_init(&channel->outbound.ring_buffer_mutex);
}

/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
		       struct page *pages, u32 page_cnt, u32 max_pkt_size)
{
	struct page **pages_wraparound;
	unsigned long *pfns_wraparound;
	u64 pfn;
	int i;

	BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE));

	/*
	 * First page holds struct hv_ring_buffer, do wraparound mapping for
	 * the rest.
	 */
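	/*
	 * The data pages are mapped a second time immediately after the
	 * first, so a packet that wraps past the end of the ring can be
	 * copied with a single memcpy (see hv_copyto_ringbuffer() and the
	 * "double mapped" note in hv_ringbuffer_read()).
	 */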
	if (hv_isolation_type_snp()) {
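		/*
		 * SNP isolation VM: the ring buffer has already been made
		 * visible to the host, so map it through its alias above
		 * the shared GPA boundary.
		 */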
		pfn = page_to_pfn(pages) +
			PFN_DOWN(ms_hyperv.shared_gpa_boundary);

		pfns_wraparound = kcalloc(page_cnt * 2 - 1,
					  sizeof(unsigned long), GFP_KERNEL);
		if (!pfns_wraparound)
			return -ENOMEM;

		pfns_wraparound[0] = pfn;
		for (i = 0; i < 2 * (page_cnt - 1); i++)
			pfns_wraparound[i + 1] = pfn + i % (page_cnt - 1) + 1;

		ring_info->ring_buffer = (struct hv_ring_buffer *)
			vmap_pfn(pfns_wraparound, page_cnt * 2 - 1,
				 PAGE_KERNEL);
		kfree(pfns_wraparound);

		if (!ring_info->ring_buffer)
			return -ENOMEM;

		/* Zero ring buffer after setting memory host visibility. */
		memset(ring_info->ring_buffer, 0x00, PAGE_SIZE * page_cnt);
	} else {
		pages_wraparound = kcalloc(page_cnt * 2 - 1,
					   sizeof(struct page *),
					   GFP_KERNEL);
		if (!pages_wraparound)
			return -ENOMEM;

		pages_wraparound[0] = pages;
		for (i = 0; i < 2 * (page_cnt - 1); i++)
			pages_wraparound[i + 1] =
				&pages[i % (page_cnt - 1) + 1];

		ring_info->ring_buffer = (struct hv_ring_buffer *)
			vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
			     PAGE_KERNEL);

		kfree(pages_wraparound);
		if (!ring_info->ring_buffer)
			return -ENOMEM;
	}

	ring_info->ring_buffer->read_index =
		ring_info->ring_buffer->write_index = 0;

	/* Set the feature bit for enabling flow control. */
	ring_info->ring_buffer->feature_bits.value = 1;

	ring_info->ring_size = page_cnt << PAGE_SHIFT;
	ring_info->ring_size_div10_reciprocal =
		reciprocal_value(ring_info->ring_size / 10);
	ring_info->ring_datasize = ring_info->ring_size -
		sizeof(struct hv_ring_buffer);
	ring_info->priv_read_index = 0;

	/* Initialize buffer that holds copies of incoming packets */
	if (max_pkt_size) {
		ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL);
		if (!ring_info->pkt_buffer)
			return -ENOMEM;
		ring_info->pkt_buffer_size = max_pkt_size;
	}

	spin_lock_init(&ring_info->ring_lock);

	return 0;
}

/* Cleanup the ring buffer. */
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
{
	mutex_lock(&ring_info->ring_buffer_mutex);
	vunmap(ring_info->ring_buffer);
	ring_info->ring_buffer = NULL;
	mutex_unlock(&ring_info->ring_buffer_mutex);

	kfree(ring_info->pkt_buffer);
	ring_info->pkt_buffer = NULL;
	ring_info->pkt_buffer_size = 0;
}

/*
 * Check if the ring buffer spinlock is available to take or not; used on
 * atomic contexts, like panic path (see the Hyper-V framebuffer driver).
 */
bool hv_ringbuffer_spinlock_busy(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rinfo = &channel->outbound;

	return spin_is_locked(&rinfo->ring_lock);
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_spinlock_busy);

/* Write to the ring buffer. */
int hv_ringbuffer_write(struct vmbus_channel *channel,
			const struct kvec *kv_list, u32 kv_count,
			u64 requestid, u64 *trans_id)
{
	int i;
	u32 bytes_avail_towrite;
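	/* sizeof(u64) reserves room for the previous-indices trailer */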
	u32 totalbytes_towrite = sizeof(u64);
	u32 next_write_location;
	u32 old_write;
	u64 prev_indices;
	unsigned long flags;
	struct hv_ring_buffer_info *outring_info = &channel->outbound;
	struct vmpacket_descriptor *desc = kv_list[0].iov_base;
	u64 __trans_id, rqst_id = VMBUS_NO_RQSTOR;

	if (channel->rescind)
		return -ENODEV;

	for (i = 0; i < kv_count; i++)
		totalbytes_towrite += kv_list[i].iov_len;

	spin_lock_irqsave(&outring_info->ring_lock, flags);

	bytes_avail_towrite = hv_get_bytes_to_write(outring_info);

	/*
	 * If there is only room for the packet, assume it is full.
	 * Otherwise, the next time around, we think the ring buffer
	 * is empty since the read index == write index.
	 */
	if (bytes_avail_towrite <= totalbytes_towrite) {
		++channel->out_full_total;

		if (!channel->out_full_flag) {
			++channel->out_full_first;
			channel->out_full_flag = true;
		}

		spin_unlock_irqrestore(&outring_info->ring_lock, flags);
		return -EAGAIN;
	}

	channel->out_full_flag = false;

	/* Write to the ring buffer */
	next_write_location = hv_get_next_write_location(outring_info);

	old_write = next_write_location;

	for (i = 0; i < kv_count; i++) {
		next_write_location = hv_copyto_ringbuffer(outring_info,
						     next_write_location,
						     kv_list[i].iov_base,
						     kv_list[i].iov_len);
	}

	/*
	 * Allocate the request ID after the data has been copied into the
	 * ring buffer. Once this request ID is allocated, the completion
	 * path could find the data and free it.
	 */
	if (desc->flags == VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED) {
		if (channel->next_request_id_callback != NULL) {
			rqst_id = channel->next_request_id_callback(channel, requestid);
			if (rqst_id == VMBUS_RQST_ERROR) {
				spin_unlock_irqrestore(&outring_info->ring_lock, flags);
				return -EAGAIN;
			}
		}
	}
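
	/* Patch the transaction ID into the descriptor already copied into the ring */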
	desc = hv_get_ring_buffer(outring_info) + old_write;
	__trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? requestid : rqst_id;
	/*
	 * Ensure the compiler doesn't generate code that reads the value of
	 * the transaction ID from the ring buffer, which is shared with the
	 * Hyper-V host and subject to being changed at any time.
	 */
	WRITE_ONCE(desc->trans_id, __trans_id);
	if (trans_id)
		*trans_id = __trans_id;

	/* Set previous packet start */
	prev_indices = hv_get_ring_bufferindices(outring_info);

	next_write_location = hv_copyto_ringbuffer(outring_info,
					     next_write_location,
					     &prev_indices,
					     sizeof(u64));

	/* Issue a full memory barrier before updating the write index */
	virt_mb();

	/* Now, update the write location */
	hv_set_next_write_location(outring_info, next_write_location);

	spin_unlock_irqrestore(&outring_info->ring_lock, flags);

	hv_signal_on_write(old_write, channel);

	if (channel->rescind) {
		if (rqst_id != VMBUS_NO_RQSTOR) {
			/* Reclaim request ID to avoid leak of IDs */
			if (channel->request_addr_callback != NULL)
				channel->request_addr_callback(channel, rqst_id);
		}
		return -ENODEV;
	}

	return 0;
}

int hv_ringbuffer_read(struct vmbus_channel *channel,
		       void *buffer, u32 buflen, u32 *buffer_actual_len,
		       u64 *requestid, bool raw)
{
	struct vmpacket_descriptor *desc;
	u32 packetlen, offset;

	if (unlikely(buflen == 0))
		return -EINVAL;

	*buffer_actual_len = 0;
	*requestid = 0;

	/* Make sure there is something to read */
	desc = hv_pkt_iter_first(channel);
	if (desc == NULL) {
		/*
		 * No error is set even when there is no header; drivers are
		 * supposed to analyze buffer_actual_len.
		 */
		return 0;
	}

	offset = raw ? 0 : (desc->offset8 << 3);
	packetlen = (desc->len8 << 3) - offset;
	*buffer_actual_len = packetlen;
	*requestid = desc->trans_id;

	if (unlikely(packetlen > buflen))
		return -ENOBUFS;

	/* since ring is double mapped, only one copy is necessary */
	memcpy(buffer, (const char *)desc + offset, packetlen);

	/* Advance ring index to next packet descriptor */
	__hv_pkt_iter_next(channel, desc);

	/* Notify host of update */
	hv_pkt_iter_close(channel);

	return 0;
}

/*
 * Determine number of bytes available in ring buffer after
 * the current iterator (priv_read_index) location.
 *
 * This is similar to hv_get_bytes_to_read but with private
 * read index instead.
 */
static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
{
	u32 priv_read_loc = rbi->priv_read_index;
	u32 write_loc;

	/*
	 * The Hyper-V host writes the packet data, then uses
	 * store_release() to update the write_index. Use load_acquire()
	 * here to prevent loads of the packet data from being re-ordered
	 * before the read of the write_index and potentially getting
	 * stale data.
	 */
	write_loc = virt_load_acquire(&rbi->ring_buffer->write_index);

	if (write_loc >= priv_read_loc)
		return write_loc - priv_read_loc;
	else
		return (rbi->ring_datasize - priv_read_loc) + write_loc;
}

/*
 * Get first vmbus packet from ring buffer after read_index
 *
 * If ring buffer is empty, returns NULL and no other action needed.
 */
struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	struct vmpacket_descriptor *desc, *desc_copy;
	u32 bytes_avail, pkt_len, pkt_offset;

	hv_debug_delay_test(channel, MESSAGE_DELAY);

	bytes_avail = hv_pkt_iter_avail(rbi);
	if (bytes_avail < sizeof(struct vmpacket_descriptor))
		return NULL;
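
	/* Clamp to the staging buffer size so the copy into pkt_buffer cannot overflow */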
	bytes_avail = min(rbi->pkt_buffer_size, bytes_avail);

	desc = (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index);

	/*
	 * Ensure the compiler does not use references to incoming Hyper-V values (which
	 * could change at any moment) when reading local variables later in the code
	 */
	pkt_len = READ_ONCE(desc->len8) << 3;
	pkt_offset = READ_ONCE(desc->offset8) << 3;

	/*
	 * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail() and
	 * rbi->pkt_buffer_size
	 */
	if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail)
		pkt_len = bytes_avail;

	/*
	 * If pkt_offset is invalid, arbitrarily set it to
	 * the size of vmpacket_descriptor
	 */
	if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len)
		pkt_offset = sizeof(struct vmpacket_descriptor);

	/* Copy the Hyper-V packet out of the ring buffer */
	desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer;
	memcpy(desc_copy, desc, pkt_len);

	/*
	 * Hyper-V could still change len8 and offset8 after the earlier read.
	 * Ensure that desc_copy has legal values for len8 and offset8 that
	 * are consistent with the copy we just made
	 */
	desc_copy->len8 = pkt_len >> 3;
	desc_copy->offset8 = pkt_offset >> 3;

	return desc_copy;
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_first);

/*
 * Get next vmbus packet from ring buffer.
 *
 * Advances the current location (priv_read_index) and checks for more
 * data. If the end of the ring buffer is reached, then return NULL.
 */
struct vmpacket_descriptor *
__hv_pkt_iter_next(struct vmbus_channel *channel,
		   const struct vmpacket_descriptor *desc)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	u32 packetlen = desc->len8 << 3;
	u32 dsize = rbi->ring_datasize;

	hv_debug_delay_test(channel, MESSAGE_DELAY);

	/* bump offset to next potential packet */
	rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
	if (rbi->priv_read_index >= dsize)
		rbi->priv_read_index -= dsize;

	/* more data? */
	return hv_pkt_iter_first(channel);
}
EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);

/* How many bytes were read in this iterator cycle */
static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi,
				  u32 start_read_index)
{
	if (rbi->priv_read_index >= start_read_index)
		return rbi->priv_read_index - start_read_index;
	else
		return rbi->ring_datasize - start_read_index +
			rbi->priv_read_index;
}

/*
 * Update host ring buffer after iterating over packets. If the host has
 * stopped queuing new entries because it found the ring buffer full, and
 * sufficient space is being freed up, signal the host. But be careful to
 * only signal the host when necessary, both for performance reasons and
 * because Hyper-V protects itself by throttling guests that signal
 * inappropriately.
 *
 * Determining when to signal is tricky. There are three key data inputs
 * that must be handled in this order to avoid race conditions:
 *
 * 1. Update the read_index
 * 2. Read the pending_send_sz
 * 3. Read the current write_index
 *
 * The interrupt_mask is not used to determine when to signal. The
 * interrupt_mask is used only on the guest->host ring buffer when
 * sending requests to the host. The host does not use it on the host->
 * guest ring buffer to indicate whether it should be signaled.
 */
void hv_pkt_iter_close(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	u32 curr_write_sz, pending_sz, bytes_read, start_read_index;

	/*
	 * Make sure all reads are done before we update the read index since
	 * the writer may start writing to the read area once the read index
	 * is updated.
	 */
	virt_rmb();
	start_read_index = rbi->ring_buffer->read_index;
	rbi->ring_buffer->read_index = rbi->priv_read_index;
	/*
	 * Older versions of Hyper-V (before WS2012 and Win8) do not
	 * implement pending_send_sz and simply poll if the host->guest
	 * ring buffer is full. No signaling is needed or expected.
	 */
	if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
		return;

	/*
	 * Issue a full memory barrier before making the signaling decision.
	 * If reading pending_send_sz were to be reordered and happen
	 * before we commit the new read_index, a race could occur. If the
	 * host were to set the pending_send_sz after we have sampled
	 * pending_send_sz, and the ring buffer blocks before we commit the
	 * read index, we could miss sending the interrupt. Issue a full
	 * memory barrier to address this.
	 */
	virt_mb();
	/*
	 * If the pending_send_sz is zero, then the ring buffer is not
	 * blocked and there is no need to signal. This is by far the
	 * most common case, so exit quickly for best performance.
	 */
	pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
	if (!pending_sz)
		return;

	/*
	 * Ensure the read of write_index in hv_get_bytes_to_write()
	 * happens after the read of pending_send_sz.
	 */
	virt_rmb();

	curr_write_sz = hv_get_bytes_to_write(rbi);
	bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);
	/*
	 * We want to signal the host only if we're transitioning
	 * from a "not enough free space" state to an "enough free
	 * space" state. For example, it's possible that this function
	 * could run and free up enough space to signal the host, and then
	 * run again and free up additional space before the host has a
	 * chance to clear the pending_send_sz. The 2nd invocation would
	 * be a null transition from "enough free space" to "enough free
	 * space", which doesn't warrant a signal.
	 *
	 * Exactly filling the ring buffer is treated as "not enough
	 * space". The ring buffer always must have at least one byte
	 * empty so the empty and full conditions are distinguishable.
	 * hv_get_bytes_to_write() doesn't fully tell the truth in
	 * this regard.
	 *
	 * So first check if we were in the "enough free space" state
	 * before we began the iteration. If so, the host was not
	 * blocked, and there's no need to signal.
	 */
	if (curr_write_sz - bytes_read > pending_sz)
		return;

	/*
	 * Similarly, if the new state is "not enough space", then
	 * there's no need to signal.
	 */
	if (curr_write_sz <= pending_sz)
		return;

	++channel->intr_in_full;
	vmbus_setevent(channel);
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_close);