smc_tx.c 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Shared Memory Communications over RDMA (SMC-R) and RoCE
  4. *
  5. * Manage send buffer.
  6. * Producer:
  7. * Copy user space data into send buffer, if send buffer space available.
  8. * Consumer:
  9. * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
  10. *
  11. * Copyright IBM Corp. 2016
  12. *
  13. * Author(s): Ursula Braun <[email protected]>
  14. */
  15. #include <linux/net.h>
  16. #include <linux/rcupdate.h>
  17. #include <linux/workqueue.h>
  18. #include <linux/sched/signal.h>
  19. #include <net/sock.h>
  20. #include <net/tcp.h>
  21. #include "smc.h"
  22. #include "smc_wr.h"
  23. #include "smc_cdc.h"
  24. #include "smc_close.h"
  25. #include "smc_ism.h"
  26. #include "smc_tx.h"
  27. #include "smc_stats.h"
  28. #include "smc_tracepoint.h"
  /* Delay argument handed to mod_delayed_work()/queue_delayed_work() whenever
   * this file (re)schedules a connection's tx_work; 0 asks for no extra delay.
   */
  #define SMC_TX_WORK_DELAY 0
  30. /***************************** sndbuf producer *******************************/
  31. /* callback implementation for sk.sk_write_space()
  32. * to wakeup sndbuf producers that blocked with smc_tx_wait().
  33. * called under sk_socket lock.
  34. */
  35. static void smc_tx_write_space(struct sock *sk)
  36. {
  37. struct socket *sock = sk->sk_socket;
  38. struct smc_sock *smc = smc_sk(sk);
  39. struct socket_wq *wq;
  40. /* similar to sk_stream_write_space */
  41. if (atomic_read(&smc->conn.sndbuf_space) && sock) {
  42. if (test_bit(SOCK_NOSPACE, &sock->flags))
  43. SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk);
  44. clear_bit(SOCK_NOSPACE, &sock->flags);
  45. rcu_read_lock();
  46. wq = rcu_dereference(sk->sk_wq);
  47. if (skwq_has_sleeper(wq))
  48. wake_up_interruptible_poll(&wq->wait,
  49. EPOLLOUT | EPOLLWRNORM |
  50. EPOLLWRBAND);
  51. if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
  52. sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
  53. rcu_read_unlock();
  54. }
  55. }
  56. /* Wakeup sndbuf producers that blocked with smc_tx_wait().
  57. * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
  58. */
  59. void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
  60. {
  61. if (smc->sk.sk_socket &&
  62. test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
  63. smc->sk.sk_write_space(&smc->sk);
  64. }
  65. /* blocks sndbuf producer until at least one byte of free space available
  66. * or urgent Byte was consumed
  67. */
  68. static int smc_tx_wait(struct smc_sock *smc, int flags)
  69. {
  70. DEFINE_WAIT_FUNC(wait, woken_wake_function);
  71. struct smc_connection *conn = &smc->conn;
  72. struct sock *sk = &smc->sk;
  73. long timeo;
  74. int rc = 0;
  75. /* similar to sk_stream_wait_memory */
  76. timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
  77. add_wait_queue(sk_sleep(sk), &wait);
  78. while (1) {
  79. sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
  80. if (sk->sk_err ||
  81. (sk->sk_shutdown & SEND_SHUTDOWN) ||
  82. conn->killed ||
  83. conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
  84. rc = -EPIPE;
  85. break;
  86. }
  87. if (smc_cdc_rxed_any_close(conn)) {
  88. rc = -ECONNRESET;
  89. break;
  90. }
  91. if (!timeo) {
  92. /* ensure EPOLLOUT is subsequently generated */
  93. set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  94. rc = -EAGAIN;
  95. break;
  96. }
  97. if (signal_pending(current)) {
  98. rc = sock_intr_errno(timeo);
  99. break;
  100. }
  101. sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
  102. if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
  103. break; /* at least 1 byte of free & no urgent data */
  104. set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  105. sk_wait_event(sk, &timeo,
  106. READ_ONCE(sk->sk_err) ||
  107. (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
  108. smc_cdc_rxed_any_close(conn) ||
  109. (atomic_read(&conn->sndbuf_space) &&
  110. !conn->urg_tx_pend),
  111. &wait);
  112. }
  113. remove_wait_queue(sk_sleep(sk), &wait);
  114. return rc;
  115. }
  116. static bool smc_tx_is_corked(struct smc_sock *smc)
  117. {
  118. struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
  119. return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
  120. }
  121. /* If we have pending CDC messages, do not send:
  122. * Because CQE of this CDC message will happen shortly, it gives
  123. * a chance to coalesce future sendmsg() payload in to one RDMA Write,
  124. * without need for a timer, and with no latency trade off.
  125. * Algorithm here:
  126. * 1. First message should never cork
  127. * 2. If we have pending Tx CDC messages, wait for the first CDC
  128. * message's completion
  129. * 3. Don't cork to much data in a single RDMA Write to prevent burst
  130. * traffic, total corked message should not exceed sendbuf/2
  131. */
  132. static bool smc_should_autocork(struct smc_sock *smc)
  133. {
  134. struct smc_connection *conn = &smc->conn;
  135. int corking_size;
  136. corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1,
  137. sock_net(&smc->sk)->smc.sysctl_autocorking_size);
  138. if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
  139. smc_tx_prepared_sends(conn) > corking_size)
  140. return false;
  141. return true;
  142. }
  143. static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
  144. {
  145. struct smc_connection *conn = &smc->conn;
  146. if (smc_should_autocork(smc))
  147. return true;
  148. /* for a corked socket defer the RDMA writes if
  149. * sndbuf_space is still available. The applications
  150. * should known how/when to uncork it.
  151. */
  152. if ((msg->msg_flags & MSG_MORE ||
  153. smc_tx_is_corked(smc) ||
  154. msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
  155. atomic_read(&conn->sndbuf_space))
  156. return true;
  157. return false;
  158. }
  /* sndbuf producer: main API called by socket layer.
   * called under sock lock.
   *
   * Copies data from @msg into the connection's send ring buffer (in at most
   * two chunks per round when the ring wraps), advances tx_curs_prep, reduces
   * sndbuf_space and - unless corking applies - triggers the sndbuf consumer
   * through smc_tx_sndbuf_nonempty().
   *
   * @smc: socket to send on
   * @msg: message holding the data and the send flags
   * @len: number of bytes the caller wants to send
   *
   * Returns the number of bytes taken from @msg, or a negative errno:
   * -ENOTCONN in state SMC_INIT, -EPIPE, -ECONNRESET, or whatever
   * sk_stream_error() yields for failures routed through out_err.
   * A partial byte count is returned instead of an error if data was already
   * copied when the send buffer runs full (or urgent data is pending), when a
   * close from the peer is detected, or when copying from @msg fails.
   */
  int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
  {
  	size_t copylen, send_done = 0, send_remaining = len;
  	size_t chunk_len, chunk_off, chunk_len_sum;
  	struct smc_connection *conn = &smc->conn;
  	union smc_host_cursor prep;
  	struct sock *sk = &smc->sk;
  	char *sndbuf_base;
  	int tx_cnt_prep;
  	int writespace;
  	int rc, chunk;

  	/* This should be in poll */
  	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

  	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
  		rc = -EPIPE;
  		goto out_err;
  	}

  	if (sk->sk_state == SMC_INIT)
  		return -ENOTCONN;

  	/* statistics only: request larger than local sndbuf / peer RMBE */
  	if (len > conn->sndbuf_desc->len)
  		SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk);

  	if (len > conn->peer_rmbe_size)
  		SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk);

  	if (msg->msg_flags & MSG_OOB)
  		SMC_STAT_INC(smc, urg_data_cnt);

  	/* one round per loop pass: copy as much as currently fits */
  	while (msg_data_left(msg)) {
  		/* note: returns -EPIPE even if earlier rounds copied data */
  		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
  		    (smc->sk.sk_err == ECONNABORTED) ||
  		    conn->killed)
  			return -EPIPE;
  		if (smc_cdc_rxed_any_close(conn))
  			return send_done ?: -ECONNRESET;

  		if (msg->msg_flags & MSG_OOB)
  			conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;

  		/* no space (or urgent data still pending): report what was
  		 * copied so far, otherwise wait and re-run all checks
  		 */
  		if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
  			if (send_done)
  				return send_done;
  			rc = smc_tx_wait(smc, msg->msg_flags);
  			if (rc)
  				goto out_err;
  			continue;
  		}

  		/* initialize variables for 1st iteration of subsequent loop */
  		/* could be just 1 byte, even after smc_tx_wait above */
  		writespace = atomic_read(&conn->sndbuf_space);
  		/* not more than what user space asked for */
  		copylen = min_t(size_t, send_remaining, writespace);
  		/* determine start of sndbuf */
  		sndbuf_base = conn->sndbuf_desc->cpu_addr;
  		smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
  		tx_cnt_prep = prep.count;
  		/* determine chunks where to write into sndbuf */
  		/* either unwrapped case, or 1st chunk of wrapped case */
  		chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
  				  tx_cnt_prep);
  		chunk_len_sum = chunk_len;
  		chunk_off = tx_cnt_prep;
  		for (chunk = 0; chunk < 2; chunk++) {
  			rc = memcpy_from_msg(sndbuf_base + chunk_off,
  					     msg, chunk_len);
  			if (rc) {
  				/* copy failed: the prepared cursor is not
  				 * advanced for this round
  				 */
  				smc_sndbuf_sync_sg_for_device(conn);
  				if (send_done)
  					return send_done;
  				goto out_err;
  			}
  			send_done += chunk_len;
  			send_remaining -= chunk_len;

  			if (chunk_len_sum == copylen)
  				break; /* either on 1st or 2nd iteration */
  			/* prepare next (== 2nd) iteration */
  			chunk_len = copylen - chunk_len; /* remainder */
  			chunk_len_sum += chunk_len;
  			chunk_off = 0; /* modulo offset in send ring buffer */
  		}
  		smc_sndbuf_sync_sg_for_device(conn);
  		/* update cursors */
  		smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
  		smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
  		/* increased in send tasklet smc_cdc_tx_handler() */
  		smp_mb__before_atomic();
  		atomic_sub(copylen, &conn->sndbuf_space);
  		/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
  		smp_mb__after_atomic();
  		/* since we just produced more new data into sndbuf,
  		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
  		 */
  		/* urgent byte is the last one of the message: mark it pending
  		 * only once everything was copied
  		 */
  		if ((msg->msg_flags & MSG_OOB) && !send_remaining)
  			conn->urg_tx_pend = true;
  		/* If we need to cork, do nothing and wait for the next
  		 * sendmsg() call or push on tx completion
  		 */
  		if (!smc_tx_should_cork(smc, msg))
  			smc_tx_sndbuf_nonempty(conn);

  		trace_smc_tx_sendmsg(smc, copylen);
  	} /* while (msg_data_left(msg)) */

  	return send_done;

  out_err:
  	rc = sk_stream_error(sk, msg->msg_flags, rc);
  	/* make sure we wake any epoll edge trigger waiter */
  	if (unlikely(rc == -EAGAIN))
  		sk->sk_write_space(sk);
  	return rc;
  }
  266. int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
  267. size_t size, int flags)
  268. {
  269. struct msghdr msg = {.msg_flags = flags};
  270. char *kaddr = kmap(page);
  271. struct kvec iov;
  272. int rc;
  273. iov.iov_base = kaddr + offset;
  274. iov.iov_len = size;
  275. iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, size);
  276. rc = smc_tx_sendmsg(smc, &msg, size);
  277. kunmap(page);
  278. return rc;
  279. }
  280. /***************************** sndbuf consumer *******************************/
  281. /* sndbuf consumer: actual data transfer of one target chunk with ISM write */
  282. int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
  283. u32 offset, int signal)
  284. {
  285. int rc;
  286. rc = smc_ism_write(conn->lgr->smcd, conn->peer_token,
  287. conn->peer_rmbe_idx, signal, conn->tx_off + offset,
  288. data, len);
  289. if (rc)
  290. conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
  291. return rc;
  292. }
  293. /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
  294. static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
  295. int num_sges, struct ib_rdma_wr *rdma_wr)
  296. {
  297. struct smc_link_group *lgr = conn->lgr;
  298. struct smc_link *link = conn->lnk;
  299. int rc;
  300. rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
  301. rdma_wr->wr.num_sge = num_sges;
  302. rdma_wr->remote_addr =
  303. lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
  304. /* RMBE within RMB */
  305. conn->tx_off +
  306. /* offset within RMBE */
  307. peer_rmbe_offset;
  308. rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
  309. rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
  310. if (rc)
  311. smcr_link_down_cond_sched(link);
  312. return rc;
  313. }
  /* sndbuf consumer: account for @len bytes that were handed to the peer.
   *
   * @conn: connection the data was sent on
   * @prod: local copy of the producer cursor for the peer RMBE; advanced by
   *        @len modulo peer_rmbe_size
   * @sent: local copy of the "sent" cursor of the send buffer; advanced by
   *        @len modulo the send buffer length
   * @len:  number of bytes transferred
   *
   * Only the cursor copies passed in are changed; the caller is responsible
   * for storing them back into @conn. peer_rmbe_space is reduced in place.
   */
  static inline void smc_tx_advance_cursors(struct smc_connection *conn,
  					  union smc_host_cursor *prod,
  					  union smc_host_cursor *sent,
  					  size_t len)
  {
  	smc_curs_add(conn->peer_rmbe_size, prod, len);
  	/* increased in recv tasklet smc_cdc_msg_rcv() */
  	smp_mb__before_atomic();
  	/* data in flight reduces usable snd_wnd */
  	atomic_sub(len, &conn->peer_rmbe_space);
  	/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
  	smp_mb__after_atomic();
  	smc_curs_add(conn->sndbuf_desc->len, sent, len);
  }
  329. /* SMC-R helper for smc_tx_rdma_writes() */
  330. static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
  331. size_t src_off, size_t src_len,
  332. size_t dst_off, size_t dst_len,
  333. struct smc_rdma_wr *wr_rdma_buf)
  334. {
  335. struct smc_link *link = conn->lnk;
  336. dma_addr_t dma_addr =
  337. sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
  338. u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr;
  339. int src_len_sum = src_len, dst_len_sum = dst_len;
  340. int sent_count = src_off;
  341. int srcchunk, dstchunk;
  342. int num_sges;
  343. int rc;
  344. for (dstchunk = 0; dstchunk < 2; dstchunk++) {
  345. struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk];
  346. struct ib_sge *sge = wr->wr.sg_list;
  347. u64 base_addr = dma_addr;
  348. if (dst_len < link->qp_attr.cap.max_inline_data) {
  349. base_addr = virt_addr;
  350. wr->wr.send_flags |= IB_SEND_INLINE;
  351. } else {
  352. wr->wr.send_flags &= ~IB_SEND_INLINE;
  353. }
  354. num_sges = 0;
  355. for (srcchunk = 0; srcchunk < 2; srcchunk++) {
  356. sge[srcchunk].addr = conn->sndbuf_desc->is_vm ?
  357. (virt_addr + src_off) : (base_addr + src_off);
  358. sge[srcchunk].length = src_len;
  359. if (conn->sndbuf_desc->is_vm)
  360. sge[srcchunk].lkey =
  361. conn->sndbuf_desc->mr[link->link_idx]->lkey;
  362. num_sges++;
  363. src_off += src_len;
  364. if (src_off >= conn->sndbuf_desc->len)
  365. src_off -= conn->sndbuf_desc->len;
  366. /* modulo in send ring */
  367. if (src_len_sum == dst_len)
  368. break; /* either on 1st or 2nd iteration */
  369. /* prepare next (== 2nd) iteration */
  370. src_len = dst_len - src_len; /* remainder */
  371. src_len_sum += src_len;
  372. }
  373. rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr);
  374. if (rc)
  375. return rc;
  376. if (dst_len_sum == len)
  377. break; /* either on 1st or 2nd iteration */
  378. /* prepare next (== 2nd) iteration */
  379. dst_off = 0; /* modulo offset in RMBE ring buffer */
  380. dst_len = len - dst_len; /* remainder */
  381. dst_len_sum += dst_len;
  382. src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
  383. sent_count);
  384. src_len_sum = src_len;
  385. }
  386. return 0;
  387. }
  /* SMC-D helper for smc_tx_rdma_writes()
   *
   * Transfers @len bytes with up to two destination chunks (peer RMBE ring),
   * each fed by up to two source chunks (local send ring); every source chunk
   * is written with its own smcd_tx_ism_write() call.
   *
   * @conn:    connection to transmit on
   * @len:     total number of bytes to transfer
   * @src_off: offset of the first source chunk within the send buffer
   * @src_len: length of the first source chunk
   * @dst_off: offset of the first destination chunk within the peer RMBE
   * @dst_len: length of the first destination chunk
   *
   * Returns 0 on success, otherwise the error of smcd_tx_ism_write().
   */
  static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
  			       size_t src_off, size_t src_len,
  			       size_t dst_off, size_t dst_len)
  {
  	int src_len_sum = src_len, dst_len_sum = dst_len;
  	int srcchunk, dstchunk;
  	int rc;

  	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
  		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
  			void *data = conn->sndbuf_desc->cpu_addr + src_off;

  			/* target offset is shifted by the size of a
  			 * struct smcd_cdc_msg - presumably the CDC message
  			 * occupies the start of the peer buffer; TODO confirm
  			 */
  			rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
  					       sizeof(struct smcd_cdc_msg), 0);
  			if (rc)
  				return rc;
  			/* both rings move on by what was just written */
  			dst_off += src_len;
  			src_off += src_len;
  			if (src_off >= conn->sndbuf_desc->len)
  				src_off -= conn->sndbuf_desc->len;
  						/* modulo in send ring */
  			if (src_len_sum == dst_len)
  				break; /* either on 1st or 2nd iteration */
  			/* prepare next (== 2nd) iteration */
  			src_len = dst_len - src_len; /* remainder */
  			src_len_sum += src_len;
  		}
  		if (dst_len_sum == len)
  			break; /* either on 1st or 2nd iteration */
  		/* prepare next (== 2nd) iteration */
  		dst_off = 0; /* modulo offset in RMBE ring buffer */
  		dst_len = len - dst_len; /* remainder */
  		dst_len_sum += dst_len;
  		/* first source chunk: at most up to the end of the send ring,
  		 * counted from the current (already advanced) src_off
  		 */
  		src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
  		src_len_sum = src_len;
  	}
  	return 0;
  }
  425. /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
  426. * usable snd_wnd as max transmit
  427. */
  428. static int smc_tx_rdma_writes(struct smc_connection *conn,
  429. struct smc_rdma_wr *wr_rdma_buf)
  430. {
  431. size_t len, src_len, dst_off, dst_len; /* current chunk values */
  432. union smc_host_cursor sent, prep, prod, cons;
  433. struct smc_cdc_producer_flags *pflags;
  434. int to_send, rmbespace;
  435. int rc;
  436. /* source: sndbuf */
  437. smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
  438. smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
  439. /* cf. wmem_alloc - (snd_max - snd_una) */
  440. to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
  441. if (to_send <= 0)
  442. return 0;
  443. /* destination: RMBE */
  444. /* cf. snd_wnd */
  445. rmbespace = atomic_read(&conn->peer_rmbe_space);
  446. if (rmbespace <= 0) {
  447. struct smc_sock *smc = container_of(conn, struct smc_sock,
  448. conn);
  449. SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
  450. return 0;
  451. }
  452. smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
  453. smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
  454. /* if usable snd_wnd closes ask peer to advertise once it opens again */
  455. pflags = &conn->local_tx_ctrl.prod_flags;
  456. pflags->write_blocked = (to_send >= rmbespace);
  457. /* cf. usable snd_wnd */
  458. len = min(to_send, rmbespace);
  459. /* initialize variables for first iteration of subsequent nested loop */
  460. dst_off = prod.count;
  461. if (prod.wrap == cons.wrap) {
  462. /* the filled destination area is unwrapped,
  463. * hence the available free destination space is wrapped
  464. * and we need 2 destination chunks of sum len; start with 1st
  465. * which is limited by what's available in sndbuf
  466. */
  467. dst_len = min_t(size_t,
  468. conn->peer_rmbe_size - prod.count, len);
  469. } else {
  470. /* the filled destination area is wrapped,
  471. * hence the available free destination space is unwrapped
  472. * and we need a single destination chunk of entire len
  473. */
  474. dst_len = len;
  475. }
  476. /* dst_len determines the maximum src_len */
  477. if (sent.count + dst_len <= conn->sndbuf_desc->len) {
  478. /* unwrapped src case: single chunk of entire dst_len */
  479. src_len = dst_len;
  480. } else {
  481. /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
  482. src_len = conn->sndbuf_desc->len - sent.count;
  483. }
  484. if (conn->lgr->is_smcd)
  485. rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
  486. dst_off, dst_len);
  487. else
  488. rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
  489. dst_off, dst_len, wr_rdma_buf);
  490. if (rc)
  491. return rc;
  492. if (conn->urg_tx_pend && len == to_send)
  493. pflags->urg_data_present = 1;
  494. smc_tx_advance_cursors(conn, &prod, &sent, len);
  495. /* update connection's cursors with advanced local cursors */
  496. smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
  497. /* dst: peer RMBE */
  498. smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
  499. return 0;
  500. }
  501. /* Wakeup sndbuf consumers from any context (IRQ or process)
  502. * since there is more data to transmit; usable snd_wnd as max transmit
  503. */
  504. static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
  505. {
  506. struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
  507. struct smc_link *link = conn->lnk;
  508. struct smc_rdma_wr *wr_rdma_buf;
  509. struct smc_cdc_tx_pend *pend;
  510. struct smc_wr_buf *wr_buf;
  511. int rc;
  512. if (!link || !smc_wr_tx_link_hold(link))
  513. return -ENOLINK;
  514. rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
  515. if (rc < 0) {
  516. smc_wr_tx_link_put(link);
  517. if (rc == -EBUSY) {
  518. struct smc_sock *smc =
  519. container_of(conn, struct smc_sock, conn);
  520. if (smc->sk.sk_err == ECONNABORTED)
  521. return sock_error(&smc->sk);
  522. if (conn->killed)
  523. return -EPIPE;
  524. rc = 0;
  525. mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
  526. SMC_TX_WORK_DELAY);
  527. }
  528. return rc;
  529. }
  530. spin_lock_bh(&conn->send_lock);
  531. if (link != conn->lnk) {
  532. /* link of connection changed, tx_work will restart */
  533. smc_wr_tx_put_slot(link,
  534. (struct smc_wr_tx_pend_priv *)pend);
  535. rc = -ENOLINK;
  536. goto out_unlock;
  537. }
  538. if (!pflags->urg_data_present) {
  539. rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
  540. if (rc) {
  541. smc_wr_tx_put_slot(link,
  542. (struct smc_wr_tx_pend_priv *)pend);
  543. goto out_unlock;
  544. }
  545. }
  546. rc = smc_cdc_msg_send(conn, wr_buf, pend);
  547. if (!rc && pflags->urg_data_present) {
  548. pflags->urg_data_pending = 0;
  549. pflags->urg_data_present = 0;
  550. }
  551. out_unlock:
  552. spin_unlock_bh(&conn->send_lock);
  553. smc_wr_tx_link_put(link);
  554. return rc;
  555. }
  556. static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
  557. {
  558. struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
  559. int rc = 0;
  560. spin_lock_bh(&conn->send_lock);
  561. if (!pflags->urg_data_present)
  562. rc = smc_tx_rdma_writes(conn, NULL);
  563. if (!rc)
  564. rc = smcd_cdc_msg_send(conn);
  565. if (!rc && pflags->urg_data_present) {
  566. pflags->urg_data_pending = 0;
  567. pflags->urg_data_present = 0;
  568. }
  569. spin_unlock_bh(&conn->send_lock);
  570. return rc;
  571. }
  572. static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
  573. {
  574. struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
  575. int rc = 0;
  576. /* No data in the send queue */
  577. if (unlikely(smc_tx_prepared_sends(conn) <= 0))
  578. goto out;
  579. /* Peer don't have RMBE space */
  580. if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
  581. SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
  582. goto out;
  583. }
  584. if (conn->killed ||
  585. conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
  586. rc = -EPIPE; /* connection being aborted */
  587. goto out;
  588. }
  589. if (conn->lgr->is_smcd)
  590. rc = smcd_tx_sndbuf_nonempty(conn);
  591. else
  592. rc = smcr_tx_sndbuf_nonempty(conn);
  593. if (!rc) {
  594. /* trigger socket release if connection is closing */
  595. smc_close_wake_tx_prepared(smc);
  596. }
  597. out:
  598. return rc;
  599. }
  /* Push prepared send buffer data to the peer, allowing only one pusher per
   * connection at a time.
   *
   * conn->tx_pushing serves as a lock-free gate: whoever raises it from 0
   * becomes the pusher; anybody else just leaves the raised counter behind
   * and returns 0 at once. The pusher detects such a raised counter after its
   * push attempt and repeats the attempt on behalf of the latecomers.
   *
   * Returns 0 if another context is pushing already, otherwise the result of
   * the last __smc_tx_sndbuf_nonempty() call.
   */
  int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
  {
  	int rc;

  	/* This makes sure only one context can send at a time, to prevent
  	 * wasting CPU cycles and CDC slots.
  	 * Record whether someone has tried to push while we are pushing.
  	 */
  	if (atomic_inc_return(&conn->tx_pushing) > 1)
  		return 0;

  again:
  	/* reset to 1: push requests counted so far are served by this round */
  	atomic_set(&conn->tx_pushing, 1);
  	smp_wmb(); /* Make sure tx_pushing is 1 before real send */
  	rc = __smc_tx_sndbuf_nonempty(conn);

  	/* We need to check whether someone else has added some data to the
  	 * send queue and tried to push, but failed, after the atomic_set()
  	 * while we were pushing.
  	 * If so, we need to push again to prevent that data from hanging in
  	 * the send queue.
  	 */
  	if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
  		goto again;

  	return rc;
  }
  623. /* Wakeup sndbuf consumers from process context
  624. * since there is more data to transmit. The caller
  625. * must hold sock lock.
  626. */
  627. void smc_tx_pending(struct smc_connection *conn)
  628. {
  629. struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
  630. int rc;
  631. if (smc->sk.sk_err)
  632. return;
  633. rc = smc_tx_sndbuf_nonempty(conn);
  634. if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
  635. !atomic_read(&conn->bytes_to_rcv))
  636. conn->local_rx_ctrl.prod_flags.write_blocked = 0;
  637. }
  638. /* Wakeup sndbuf consumers from process context
  639. * since there is more data to transmit in locked
  640. * sock.
  641. */
  642. void smc_tx_work(struct work_struct *work)
  643. {
  644. struct smc_connection *conn = container_of(to_delayed_work(work),
  645. struct smc_connection,
  646. tx_work);
  647. struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
  648. lock_sock(&smc->sk);
  649. smc_tx_pending(conn);
  650. release_sock(&smc->sk);
  651. }
  652. void smc_tx_consumer_update(struct smc_connection *conn, bool force)
  653. {
  654. union smc_host_cursor cfed, cons, prod;
  655. int sender_free = conn->rmb_desc->len;
  656. int to_confirm;
  657. smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
  658. smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
  659. to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
  660. if (to_confirm > conn->rmbe_update_limit) {
  661. smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
  662. sender_free = conn->rmb_desc->len -
  663. smc_curs_diff_large(conn->rmb_desc->len,
  664. &cfed, &prod);
  665. }
  666. if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
  667. force ||
  668. ((to_confirm > conn->rmbe_update_limit) &&
  669. ((sender_free <= (conn->rmb_desc->len / 2)) ||
  670. conn->local_rx_ctrl.prod_flags.write_blocked))) {
  671. if (conn->killed ||
  672. conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
  673. return;
  674. if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
  675. !conn->killed) {
  676. queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
  677. SMC_TX_WORK_DELAY);
  678. return;
  679. }
  680. }
  681. if (conn->local_rx_ctrl.prod_flags.write_blocked &&
  682. !atomic_read(&conn->bytes_to_rcv))
  683. conn->local_rx_ctrl.prod_flags.write_blocked = 0;
  684. }
  685. /***************************** send initialize *******************************/
  686. /* Initialize send properties on connection establishment. NB: not __init! */
  687. void smc_tx_init(struct smc_sock *smc)
  688. {
  689. smc->sk.sk_write_space = smc_tx_write_space;
  690. }