svc_rdma_recvfrom.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868
  1. // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
  2. /*
  3. * Copyright (c) 2016-2018 Oracle. All rights reserved.
  4. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
  5. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
  6. *
  7. * This software is available to you under a choice of one of two
  8. * licenses. You may choose to be licensed under the terms of the GNU
  9. * General Public License (GPL) Version 2, available from the file
  10. * COPYING in the main directory of this source tree, or the BSD-type
  11. * license below:
  12. *
  13. * Redistribution and use in source and binary forms, with or without
  14. * modification, are permitted provided that the following conditions
  15. * are met:
  16. *
  17. * Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. *
  20. * Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials provided
  23. * with the distribution.
  24. *
  25. * Neither the name of the Network Appliance, Inc. nor the names of
  26. * its contributors may be used to endorse or promote products
  27. * derived from this software without specific prior written
  28. * permission.
  29. *
  30. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  31. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  32. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  33. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  34. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  35. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  36. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  37. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  38. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  39. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  40. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  41. *
  42. * Author: Tom Tucker <[email protected]>
  43. */
  44. /* Operation
  45. *
  46. * The main entry point is svc_rdma_recvfrom. This is called from
  47. * svc_recv when the transport indicates there is incoming data to
  48. * be read. "Data Ready" is signaled when an RDMA Receive completes,
  49. * or when a set of RDMA Reads complete.
  50. *
  51. * An svc_rqst is passed in. This structure contains an array of
  52. * free pages (rq_pages) that will contain the incoming RPC message.
  53. *
  54. * Short messages are moved directly into svc_rqst::rq_arg, and
  55. * the RPC Call is ready to be processed by the Upper Layer.
  56. * svc_rdma_recvfrom returns the length of the RPC Call message,
  57. * completing the reception of the RPC Call.
  58. *
  59. * However, when an incoming message has Read chunks,
  60. * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's
  61. * data payload from the client. svc_rdma_recvfrom sets up the
  62. * RDMA Reads using pages in svc_rqst::rq_pages, which are
  63. * transferred to an svc_rdma_recv_ctxt for the duration of the
  64. * I/O. svc_rdma_recvfrom then returns zero, since the RPC message
  65. * is still not yet ready.
  66. *
  67. * When the Read chunk payloads have become available on the
  68. * server, "Data Ready" is raised again, and svc_recv calls
  69. * svc_rdma_recvfrom again. This second call may use a different
  70. * svc_rqst than the first one, thus any information that needs
  71. * to be preserved across these two calls is kept in an
  72. * svc_rdma_recv_ctxt.
  73. *
  74. * The second call to svc_rdma_recvfrom performs final assembly
  75. * of the RPC Call message, using the RDMA Read sink pages kept in
  76. * the svc_rdma_recv_ctxt. The xdr_buf is copied from the
  77. * svc_rdma_recv_ctxt to the second svc_rqst. The second call returns
  78. * the length of the completed RPC Call message.
  79. *
  80. * Page Management
  81. *
  82. * Pages under I/O must be transferred from the first svc_rqst to an
  83. * svc_rdma_recv_ctxt before the first svc_rdma_recvfrom call returns.
  84. *
  85. * The first svc_rqst supplies pages for RDMA Reads. These are moved
  86. * from rqstp::rq_pages into ctxt::pages. The consumed elements of
  87. * the rq_pages array are set to NULL and refilled when the first
  88. * svc_rdma_recvfrom call returns.
  89. *
  90. * During the second svc_rdma_recvfrom call, RDMA Read sink pages
  91. * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst.
  92. */
  93. #include <linux/slab.h>
  94. #include <linux/spinlock.h>
  95. #include <asm/unaligned.h>
  96. #include <rdma/ib_verbs.h>
  97. #include <rdma/rdma_cm.h>
  98. #include <linux/sunrpc/xdr.h>
  99. #include <linux/sunrpc/debug.h>
  100. #include <linux/sunrpc/rpc_rdma.h>
  101. #include <linux/sunrpc/svc_rdma.h>
  102. #include "xprt_rdma.h"
  103. #include <trace/events/rpcrdma.h>
  104. static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
  105. static inline struct svc_rdma_recv_ctxt *
  106. svc_rdma_next_recv_ctxt(struct list_head *list)
  107. {
  108. return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt,
  109. rc_list);
  110. }
  111. static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
  112. struct rpc_rdma_cid *cid)
  113. {
  114. cid->ci_queue_id = rdma->sc_rq_cq->res.id;
  115. cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
  116. }
  117. static struct svc_rdma_recv_ctxt *
  118. svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
  119. {
  120. struct svc_rdma_recv_ctxt *ctxt;
  121. dma_addr_t addr;
  122. void *buffer;
  123. ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
  124. if (!ctxt)
  125. goto fail0;
  126. buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
  127. if (!buffer)
  128. goto fail1;
  129. addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
  130. rdma->sc_max_req_size, DMA_FROM_DEVICE);
  131. if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
  132. goto fail2;
  133. svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
  134. pcl_init(&ctxt->rc_call_pcl);
  135. pcl_init(&ctxt->rc_read_pcl);
  136. pcl_init(&ctxt->rc_write_pcl);
  137. pcl_init(&ctxt->rc_reply_pcl);
  138. ctxt->rc_recv_wr.next = NULL;
  139. ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
  140. ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
  141. ctxt->rc_recv_wr.num_sge = 1;
  142. ctxt->rc_cqe.done = svc_rdma_wc_receive;
  143. ctxt->rc_recv_sge.addr = addr;
  144. ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
  145. ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
  146. ctxt->rc_recv_buf = buffer;
  147. ctxt->rc_temp = false;
  148. return ctxt;
  149. fail2:
  150. kfree(buffer);
  151. fail1:
  152. kfree(ctxt);
  153. fail0:
  154. return NULL;
  155. }
  156. static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
  157. struct svc_rdma_recv_ctxt *ctxt)
  158. {
  159. ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
  160. ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
  161. kfree(ctxt->rc_recv_buf);
  162. kfree(ctxt);
  163. }
  164. /**
  165. * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
  166. * @rdma: svcxprt_rdma being torn down
  167. *
  168. */
  169. void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
  170. {
  171. struct svc_rdma_recv_ctxt *ctxt;
  172. struct llist_node *node;
  173. while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
  174. ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
  175. svc_rdma_recv_ctxt_destroy(rdma, ctxt);
  176. }
  177. }
  178. /**
  179. * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt
  180. * @rdma: controlling svcxprt_rdma
  181. *
  182. * Returns a recv_ctxt or (rarely) NULL if none are available.
  183. */
  184. struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
  185. {
  186. struct svc_rdma_recv_ctxt *ctxt;
  187. struct llist_node *node;
  188. node = llist_del_first(&rdma->sc_recv_ctxts);
  189. if (!node)
  190. goto out_empty;
  191. ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
  192. out:
  193. ctxt->rc_page_count = 0;
  194. return ctxt;
  195. out_empty:
  196. ctxt = svc_rdma_recv_ctxt_alloc(rdma);
  197. if (!ctxt)
  198. return NULL;
  199. goto out;
  200. }
  201. /**
  202. * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list
  203. * @rdma: controlling svcxprt_rdma
  204. * @ctxt: object to return to the free list
  205. *
  206. */
  207. void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
  208. struct svc_rdma_recv_ctxt *ctxt)
  209. {
  210. pcl_free(&ctxt->rc_call_pcl);
  211. pcl_free(&ctxt->rc_read_pcl);
  212. pcl_free(&ctxt->rc_write_pcl);
  213. pcl_free(&ctxt->rc_reply_pcl);
  214. if (!ctxt->rc_temp)
  215. llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
  216. else
  217. svc_rdma_recv_ctxt_destroy(rdma, ctxt);
  218. }
  219. /**
  220. * svc_rdma_release_ctxt - Release transport-specific per-rqst resources
  221. * @xprt: the transport which owned the context
  222. * @vctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt
  223. *
  224. * Ensure that the recv_ctxt is released whether or not a Reply
  225. * was sent. For example, the client could close the connection,
  226. * or svc_process could drop an RPC, before the Reply is sent.
  227. */
  228. void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt)
  229. {
  230. struct svc_rdma_recv_ctxt *ctxt = vctxt;
  231. struct svcxprt_rdma *rdma =
  232. container_of(xprt, struct svcxprt_rdma, sc_xprt);
  233. if (ctxt)
  234. svc_rdma_recv_ctxt_put(rdma, ctxt);
  235. }
  236. static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
  237. unsigned int wanted, bool temp)
  238. {
  239. const struct ib_recv_wr *bad_wr = NULL;
  240. struct svc_rdma_recv_ctxt *ctxt;
  241. struct ib_recv_wr *recv_chain;
  242. int ret;
  243. if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
  244. return false;
  245. recv_chain = NULL;
  246. while (wanted--) {
  247. ctxt = svc_rdma_recv_ctxt_get(rdma);
  248. if (!ctxt)
  249. break;
  250. trace_svcrdma_post_recv(ctxt);
  251. ctxt->rc_temp = temp;
  252. ctxt->rc_recv_wr.next = recv_chain;
  253. recv_chain = &ctxt->rc_recv_wr;
  254. rdma->sc_pending_recvs++;
  255. }
  256. if (!recv_chain)
  257. return false;
  258. ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
  259. if (ret)
  260. goto err_free;
  261. return true;
  262. err_free:
  263. trace_svcrdma_rq_post_err(rdma, ret);
  264. while (bad_wr) {
  265. ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt,
  266. rc_recv_wr);
  267. bad_wr = bad_wr->next;
  268. svc_rdma_recv_ctxt_put(rdma, ctxt);
  269. }
  270. /* Since we're destroying the xprt, no need to reset
  271. * sc_pending_recvs. */
  272. return false;
  273. }
  274. /**
  275. * svc_rdma_post_recvs - Post initial set of Recv WRs
  276. * @rdma: fresh svcxprt_rdma
  277. *
  278. * Returns true if successful, otherwise false.
  279. */
  280. bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
  281. {
  282. return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
  283. }
  284. /**
  285. * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
  286. * @cq: Completion Queue context
  287. * @wc: Work Completion object
  288. *
  289. */
  290. static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
  291. {
  292. struct svcxprt_rdma *rdma = cq->cq_context;
  293. struct ib_cqe *cqe = wc->wr_cqe;
  294. struct svc_rdma_recv_ctxt *ctxt;
  295. rdma->sc_pending_recvs--;
  296. /* WARNING: Only wc->wr_cqe and wc->status are reliable */
  297. ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
  298. if (wc->status != IB_WC_SUCCESS)
  299. goto flushed;
  300. trace_svcrdma_wc_recv(wc, &ctxt->rc_cid);
  301. /* If receive posting fails, the connection is about to be
  302. * lost anyway. The server will not be able to send a reply
  303. * for this RPC, and the client will retransmit this RPC
  304. * anyway when it reconnects.
  305. *
  306. * Therefore we drop the Receive, even if status was SUCCESS
  307. * to reduce the likelihood of replayed requests once the
  308. * client reconnects.
  309. */
  310. if (rdma->sc_pending_recvs < rdma->sc_max_requests)
  311. if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
  312. goto dropped;
  313. /* All wc fields are now known to be valid */
  314. ctxt->rc_byte_len = wc->byte_len;
  315. spin_lock(&rdma->sc_rq_dto_lock);
  316. list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
  317. /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
  318. set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
  319. spin_unlock(&rdma->sc_rq_dto_lock);
  320. if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
  321. svc_xprt_enqueue(&rdma->sc_xprt);
  322. return;
  323. flushed:
  324. if (wc->status == IB_WC_WR_FLUSH_ERR)
  325. trace_svcrdma_wc_recv_flush(wc, &ctxt->rc_cid);
  326. else
  327. trace_svcrdma_wc_recv_err(wc, &ctxt->rc_cid);
  328. dropped:
  329. svc_rdma_recv_ctxt_put(rdma, ctxt);
  330. svc_xprt_deferred_close(&rdma->sc_xprt);
  331. }
  332. /**
  333. * svc_rdma_flush_recv_queues - Drain pending Receive work
  334. * @rdma: svcxprt_rdma being shut down
  335. *
  336. */
  337. void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
  338. {
  339. struct svc_rdma_recv_ctxt *ctxt;
  340. while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
  341. list_del(&ctxt->rc_list);
  342. svc_rdma_recv_ctxt_put(rdma, ctxt);
  343. }
  344. }
  345. static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
  346. struct svc_rdma_recv_ctxt *ctxt)
  347. {
  348. struct xdr_buf *arg = &rqstp->rq_arg;
  349. arg->head[0].iov_base = ctxt->rc_recv_buf;
  350. arg->head[0].iov_len = ctxt->rc_byte_len;
  351. arg->tail[0].iov_base = NULL;
  352. arg->tail[0].iov_len = 0;
  353. arg->page_len = 0;
  354. arg->page_base = 0;
  355. arg->buflen = ctxt->rc_byte_len;
  356. arg->len = ctxt->rc_byte_len;
  357. }
  358. /**
  359. * xdr_count_read_segments - Count number of Read segments in Read list
  360. * @rctxt: Ingress receive context
  361. * @p: Start of an un-decoded Read list
  362. *
  363. * Before allocating anything, ensure the ingress Read list is safe
  364. * to use.
  365. *
  366. * The segment count is limited to how many segments can fit in the
  367. * transport header without overflowing the buffer. That's about 40
  368. * Read segments for a 1KB inline threshold.
  369. *
  370. * Return values:
  371. * %true: Read list is valid. @rctxt's xdr_stream is updated to point
  372. * to the first byte past the Read list. rc_read_pcl and
  373. * rc_call_pcl cl_count fields are set to the number of
  374. * Read segments in the list.
  375. * %false: Read list is corrupt. @rctxt's xdr_stream is left in an
  376. * unknown state.
  377. */
  378. static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
  379. {
  380. rctxt->rc_call_pcl.cl_count = 0;
  381. rctxt->rc_read_pcl.cl_count = 0;
  382. while (xdr_item_is_present(p)) {
  383. u32 position, handle, length;
  384. u64 offset;
  385. p = xdr_inline_decode(&rctxt->rc_stream,
  386. rpcrdma_readseg_maxsz * sizeof(*p));
  387. if (!p)
  388. return false;
  389. xdr_decode_read_segment(p, &position, &handle,
  390. &length, &offset);
  391. if (position) {
  392. if (position & 3)
  393. return false;
  394. ++rctxt->rc_read_pcl.cl_count;
  395. } else {
  396. ++rctxt->rc_call_pcl.cl_count;
  397. }
  398. p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
  399. if (!p)
  400. return false;
  401. }
  402. return true;
  403. }
  404. /* Sanity check the Read list.
  405. *
  406. * Sanity checks:
  407. * - Read list does not overflow Receive buffer.
  408. * - Chunk size limited by largest NFS data payload.
  409. *
  410. * Return values:
  411. * %true: Read list is valid. @rctxt's xdr_stream is updated
  412. * to point to the first byte past the Read list.
  413. * %false: Read list is corrupt. @rctxt's xdr_stream is left
  414. * in an unknown state.
  415. */
  416. static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
  417. {
  418. __be32 *p;
  419. p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
  420. if (!p)
  421. return false;
  422. if (!xdr_count_read_segments(rctxt, p))
  423. return false;
  424. if (!pcl_alloc_call(rctxt, p))
  425. return false;
  426. return pcl_alloc_read(rctxt, p);
  427. }
  428. static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
  429. {
  430. u32 segcount;
  431. __be32 *p;
  432. if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
  433. return false;
  434. /* A bogus segcount causes this buffer overflow check to fail. */
  435. p = xdr_inline_decode(&rctxt->rc_stream,
  436. segcount * rpcrdma_segment_maxsz * sizeof(*p));
  437. return p != NULL;
  438. }
  439. /**
  440. * xdr_count_write_chunks - Count number of Write chunks in Write list
  441. * @rctxt: Received header and decoding state
  442. * @p: start of an un-decoded Write list
  443. *
  444. * Before allocating anything, ensure the ingress Write list is
  445. * safe to use.
  446. *
  447. * Return values:
  448. * %true: Write list is valid. @rctxt's xdr_stream is updated
  449. * to point to the first byte past the Write list, and
  450. * the number of Write chunks is in rc_write_pcl.cl_count.
  451. * %false: Write list is corrupt. @rctxt's xdr_stream is left
  452. * in an indeterminate state.
  453. */
  454. static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
  455. {
  456. rctxt->rc_write_pcl.cl_count = 0;
  457. while (xdr_item_is_present(p)) {
  458. if (!xdr_check_write_chunk(rctxt))
  459. return false;
  460. ++rctxt->rc_write_pcl.cl_count;
  461. p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
  462. if (!p)
  463. return false;
  464. }
  465. return true;
  466. }
  467. /* Sanity check the Write list.
  468. *
  469. * Implementation limits:
  470. * - This implementation currently supports only one Write chunk.
  471. *
  472. * Sanity checks:
  473. * - Write list does not overflow Receive buffer.
  474. * - Chunk size limited by largest NFS data payload.
  475. *
  476. * Return values:
  477. * %true: Write list is valid. @rctxt's xdr_stream is updated
  478. * to point to the first byte past the Write list.
  479. * %false: Write list is corrupt. @rctxt's xdr_stream is left
  480. * in an unknown state.
  481. */
  482. static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
  483. {
  484. __be32 *p;
  485. p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
  486. if (!p)
  487. return false;
  488. if (!xdr_count_write_chunks(rctxt, p))
  489. return false;
  490. if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p))
  491. return false;
  492. rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl);
  493. return true;
  494. }
  495. /* Sanity check the Reply chunk.
  496. *
  497. * Sanity checks:
  498. * - Reply chunk does not overflow Receive buffer.
  499. * - Chunk size limited by largest NFS data payload.
  500. *
  501. * Return values:
  502. * %true: Reply chunk is valid. @rctxt's xdr_stream is updated
  503. * to point to the first byte past the Reply chunk.
  504. * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left
  505. * in an unknown state.
  506. */
  507. static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
  508. {
  509. __be32 *p;
  510. p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
  511. if (!p)
  512. return false;
  513. if (!xdr_item_is_present(p))
  514. return true;
  515. if (!xdr_check_write_chunk(rctxt))
  516. return false;
  517. rctxt->rc_reply_pcl.cl_count = 1;
  518. return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p);
  519. }
  520. /* RPC-over-RDMA Version One private extension: Remote Invalidation.
  521. * Responder's choice: requester signals it can handle Send With
  522. * Invalidate, and responder chooses one R_key to invalidate.
  523. *
  524. * If there is exactly one distinct R_key in the received transport
  525. * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
  526. */
  527. static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
  528. struct svc_rdma_recv_ctxt *ctxt)
  529. {
  530. struct svc_rdma_segment *segment;
  531. struct svc_rdma_chunk *chunk;
  532. u32 inv_rkey;
  533. ctxt->rc_inv_rkey = 0;
  534. if (!rdma->sc_snd_w_inv)
  535. return;
  536. inv_rkey = 0;
  537. pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) {
  538. pcl_for_each_segment(segment, chunk) {
  539. if (inv_rkey == 0)
  540. inv_rkey = segment->rs_handle;
  541. else if (inv_rkey != segment->rs_handle)
  542. return;
  543. }
  544. }
  545. pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) {
  546. pcl_for_each_segment(segment, chunk) {
  547. if (inv_rkey == 0)
  548. inv_rkey = segment->rs_handle;
  549. else if (inv_rkey != segment->rs_handle)
  550. return;
  551. }
  552. }
  553. pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) {
  554. pcl_for_each_segment(segment, chunk) {
  555. if (inv_rkey == 0)
  556. inv_rkey = segment->rs_handle;
  557. else if (inv_rkey != segment->rs_handle)
  558. return;
  559. }
  560. }
  561. pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) {
  562. pcl_for_each_segment(segment, chunk) {
  563. if (inv_rkey == 0)
  564. inv_rkey = segment->rs_handle;
  565. else if (inv_rkey != segment->rs_handle)
  566. return;
  567. }
  568. }
  569. ctxt->rc_inv_rkey = inv_rkey;
  570. }
  571. /**
  572. * svc_rdma_xdr_decode_req - Decode the transport header
  573. * @rq_arg: xdr_buf containing ingress RPC/RDMA message
  574. * @rctxt: state of decoding
  575. *
  576. * On entry, xdr->head[0].iov_base points to first byte of the
  577. * RPC-over-RDMA transport header.
  578. *
  579. * On successful exit, head[0] points to first byte past the
  580. * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
  581. *
  582. * The length of the RPC-over-RDMA header is returned.
  583. *
  584. * Assumptions:
  585. * - The transport header is entirely contained in the head iovec.
  586. */
  587. static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
  588. struct svc_rdma_recv_ctxt *rctxt)
  589. {
  590. __be32 *p, *rdma_argp;
  591. unsigned int hdr_len;
  592. rdma_argp = rq_arg->head[0].iov_base;
  593. xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL);
  594. p = xdr_inline_decode(&rctxt->rc_stream,
  595. rpcrdma_fixed_maxsz * sizeof(*p));
  596. if (unlikely(!p))
  597. goto out_short;
  598. p++;
  599. if (*p != rpcrdma_version)
  600. goto out_version;
  601. p += 2;
  602. rctxt->rc_msgtype = *p;
  603. switch (rctxt->rc_msgtype) {
  604. case rdma_msg:
  605. break;
  606. case rdma_nomsg:
  607. break;
  608. case rdma_done:
  609. goto out_drop;
  610. case rdma_error:
  611. goto out_drop;
  612. default:
  613. goto out_proc;
  614. }
  615. if (!xdr_check_read_list(rctxt))
  616. goto out_inval;
  617. if (!xdr_check_write_list(rctxt))
  618. goto out_inval;
  619. if (!xdr_check_reply_chunk(rctxt))
  620. goto out_inval;
  621. rq_arg->head[0].iov_base = rctxt->rc_stream.p;
  622. hdr_len = xdr_stream_pos(&rctxt->rc_stream);
  623. rq_arg->head[0].iov_len -= hdr_len;
  624. rq_arg->len -= hdr_len;
  625. trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len);
  626. return hdr_len;
  627. out_short:
  628. trace_svcrdma_decode_short_err(rctxt, rq_arg->len);
  629. return -EINVAL;
  630. out_version:
  631. trace_svcrdma_decode_badvers_err(rctxt, rdma_argp);
  632. return -EPROTONOSUPPORT;
  633. out_drop:
  634. trace_svcrdma_decode_drop_err(rctxt, rdma_argp);
  635. return 0;
  636. out_proc:
  637. trace_svcrdma_decode_badproc_err(rctxt, rdma_argp);
  638. return -EINVAL;
  639. out_inval:
  640. trace_svcrdma_decode_parse_err(rctxt, rdma_argp);
  641. return -EINVAL;
  642. }
  /* Send an error message for @status in response to the message held
   * in @rctxt. Nothing is sent if no send_ctxt can be obtained.
   */
  static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
                                  struct svc_rdma_recv_ctxt *rctxt,
                                  int status)
  {
          struct svc_rdma_send_ctxt *sctxt = svc_rdma_send_ctxt_get(rdma);

          if (sctxt)
                  svc_rdma_send_error_msg(rdma, sctxt, rctxt, status);
  }
  653. /* By convention, backchannel calls arrive via rdma_msg type
  654. * messages, and never populate the chunk lists. This makes
  655. * the RPC/RDMA header small and fixed in size, so it is
  656. * straightforward to check the RPC header's direction field.
  657. */
  658. static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
  659. struct svc_rdma_recv_ctxt *rctxt)
  660. {
  661. __be32 *p = rctxt->rc_recv_buf;
  662. if (!xprt->xpt_bc_xprt)
  663. return false;
  664. if (rctxt->rc_msgtype != rdma_msg)
  665. return false;
  666. if (!pcl_is_empty(&rctxt->rc_call_pcl))
  667. return false;
  668. if (!pcl_is_empty(&rctxt->rc_read_pcl))
  669. return false;
  670. if (!pcl_is_empty(&rctxt->rc_write_pcl))
  671. return false;
  672. if (!pcl_is_empty(&rctxt->rc_reply_pcl))
  673. return false;
  674. /* RPC call direction */
  675. if (*(p + 8) == cpu_to_be32(RPC_CALL))
  676. return false;
  677. return true;
  678. }
  679. /**
  680. * svc_rdma_recvfrom - Receive an RPC call
  681. * @rqstp: request structure into which to receive an RPC Call
  682. *
  683. * Returns:
  684. * The positive number of bytes in the RPC Call message,
  685. * %0 if there were no Calls ready to return,
  686. * %-EINVAL if the Read chunk data is too large,
  687. * %-ENOMEM if rdma_rw context pool was exhausted,
  688. * %-ENOTCONN if posting failed (connection is lost),
  689. * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
  690. *
  691. * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only
  692. * when there are no remaining ctxt's to process.
  693. *
  694. * The next ctxt is removed from the "receive" lists.
  695. *
  696. * - If the ctxt completes a Read, then finish assembling the Call
  697. * message and return the number of bytes in the message.
  698. *
  699. * - If the ctxt completes a Receive, then construct the Call
  700. * message from the contents of the Receive buffer.
  701. *
  702. * - If there are no Read chunks in this message, then finish
  703. * assembling the Call message and return the number of bytes
  704. * in the message.
  705. *
  706. * - If there are Read chunks in this message, post Read WRs to
  707. * pull that payload and return 0.
  708. */
  709. int svc_rdma_recvfrom(struct svc_rqst *rqstp)
  710. {
  711. struct svc_xprt *xprt = rqstp->rq_xprt;
  712. struct svcxprt_rdma *rdma_xprt =
  713. container_of(xprt, struct svcxprt_rdma, sc_xprt);
  714. struct svc_rdma_recv_ctxt *ctxt;
  715. int ret;
  716. /* Prevent svc_xprt_release() from releasing pages in rq_pages
  717. * when returning 0 or an error.
  718. */
  719. rqstp->rq_respages = rqstp->rq_pages;
  720. rqstp->rq_next_page = rqstp->rq_respages;
  721. rqstp->rq_xprt_ctxt = NULL;
  722. ctxt = NULL;
  723. spin_lock(&rdma_xprt->sc_rq_dto_lock);
  724. ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
  725. if (ctxt)
  726. list_del(&ctxt->rc_list);
  727. else
  728. /* No new incoming requests, terminate the loop */
  729. clear_bit(XPT_DATA, &xprt->xpt_flags);
  730. spin_unlock(&rdma_xprt->sc_rq_dto_lock);
  731. /* Unblock the transport for the next receive */
  732. svc_xprt_received(xprt);
  733. if (!ctxt)
  734. return 0;
  735. percpu_counter_inc(&svcrdma_stat_recv);
  736. ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
  737. ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
  738. DMA_FROM_DEVICE);
  739. svc_rdma_build_arg_xdr(rqstp, ctxt);
  740. ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
  741. if (ret < 0)
  742. goto out_err;
  743. if (ret == 0)
  744. goto out_drop;
  745. if (svc_rdma_is_reverse_direction_reply(xprt, ctxt))
  746. goto out_backchannel;
  747. svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
  748. if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
  749. !pcl_is_empty(&ctxt->rc_call_pcl)) {
  750. ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
  751. if (ret < 0)
  752. goto out_readfail;
  753. }
  754. rqstp->rq_xprt_ctxt = ctxt;
  755. rqstp->rq_prot = IPPROTO_MAX;
  756. svc_xprt_copy_addrs(rqstp, xprt);
  757. return rqstp->rq_arg.len;
  758. out_err:
  759. svc_rdma_send_error(rdma_xprt, ctxt, ret);
  760. svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
  761. return 0;
  762. out_readfail:
  763. if (ret == -EINVAL)
  764. svc_rdma_send_error(rdma_xprt, ctxt, ret);
  765. svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
  766. svc_xprt_deferred_close(xprt);
  767. return -ENOTCONN;
  768. out_backchannel:
  769. svc_rdma_handle_bc_reply(rqstp, ctxt);
  770. out_drop:
  771. svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
  772. return 0;
  773. }