rpc_rdma.c

// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2014-2020, Oracle and/or its affiliates.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include <linux/highmem.h>

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
/* Returns size of largest RPC-over-RDMA header in a Call message
 *
 * The largest Call header contains a full-size Read list and a
 * minimal Reply chunk.
 */
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Read list size */
	size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);

	/* Minimal Read chunk size */
	size += sizeof(__be32);	/* segment count */
	size += rpcrdma_segment_maxsz * sizeof(__be32);
	size += sizeof(__be32);	/* list discriminator */

	return size;
}

/* Returns size of largest RPC-over-RDMA header in a Reply message
 *
 * There is only one Write list or one Reply chunk per Reply
 * message. The larger list is the Write list.
 */
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Write list size */
	size += sizeof(__be32);	/* segment count */
	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
	size += sizeof(__be32);	/* list discriminator */

	return size;
}

/**
 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
 * @ep: endpoint to initialize
 *
 * The max_inline fields contain the maximum size of an RPC message
 * so the marshaling code doesn't have to repeat this calculation
 * for every RPC.
 */
void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
	unsigned int maxsegs = ep->re_max_rdma_segs;

	ep->re_max_inline_send =
		ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
	ep->re_max_inline_recv =
		ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
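
/* Illustrative note (editorial, not part of the original source):
 * both budgets reduce to "negotiated inline size minus worst-case
 * header". For example, if re_inline_send is 4096 bytes and the
 * worst-case Call header for this device's re_max_rdma_segs works
 * out to 244 bytes, then re_max_inline_send is 3852 bytes, and any
 * Call body larger than that must be moved by a Read chunk (see
 * rpcrdma_args_inline below).
 */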
/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * a Read chunk for this operation.
 *
 * A Read chunk is also required if sending the RPC call inline would
 * exceed this device's max_sge limit.
 */
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
				struct rpc_rqst *rqst)
{
	struct xdr_buf *xdr = &rqst->rq_snd_buf;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	unsigned int count, remaining, offset;

	if (xdr->len > ep->re_max_inline_send)
		return false;

	if (xdr->page_len) {
		remaining = xdr->page_len;
		offset = offset_in_page(xdr->page_base);
		count = RPCRDMA_MIN_SEND_SGES;
		while (remaining) {
			remaining -= min_t(unsigned int,
					   PAGE_SIZE - offset, remaining);
			offset = 0;
			if (++count > ep->re_attr.cap.max_send_sge)
				return false;
		}
	}

	return true;
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
				   struct rpc_rqst *rqst)
{
	return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}

/* The client is required to provide a Reply chunk if the maximum
 * size of the non-payload part of the RPC Reply is larger than
 * the inline threshold.
 */
static bool
rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
			  const struct rpc_rqst *rqst)
{
	const struct xdr_buf *buf = &rqst->rq_rcv_buf;

	return (buf->head[0].iov_len + buf->tail[0].iov_len) <
		r_xprt->rx_ep->re_max_inline_recv;
}

/* ACL likes to be lazy in allocating pages. For TCP, these
 * pages can be allocated during receive processing. Not true
 * for RDMA, which must always provision receive buffers
 * up front.
 */
static noinline int
rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
{
	struct page **ppages;
	int len;

	len = buf->page_len;
	ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
	while (len > 0) {
		if (!*ppages)
			*ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
		if (!*ppages)
			return -ENOBUFS;
		ppages++;
		len -= PAGE_SIZE;
	}

	return 0;
}
/* Convert @vec to a single SGL element.
 *
 * Returns pointer to next available SGE, and bumps the total number
 * of SGEs consumed.
 */
static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
		     unsigned int *n)
{
	seg->mr_page = virt_to_page(vec->iov_base);
	seg->mr_offset = offset_in_page(vec->iov_base);
	seg->mr_len = vec->iov_len;
	++seg;
	++(*n);
	return seg;
}

/* Convert @xdrbuf into SGEs no larger than a page each. As they
 * are registered, these SGEs are then coalesced into RDMA segments
 * when the selected memreg mode supports it.
 *
 * Returns positive number of SGEs consumed, or a negative errno.
 */
static int
rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
		     unsigned int pos, enum rpcrdma_chunktype type,
		     struct rpcrdma_mr_seg *seg)
{
	unsigned long page_base;
	unsigned int len, n;
	struct page **ppages;

	n = 0;
	if (pos == 0)
		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdrbuf->page_base);
	while (len) {
		seg->mr_page = *ppages;
		seg->mr_offset = page_base;
		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		len -= seg->mr_len;
		++ppages;
		++seg;
		++n;
		page_base = 0;
	}

	if (type == rpcrdma_readch || type == rpcrdma_writech)
		goto out;

	if (xdrbuf->tail[0].iov_len)
		rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);

out:
	if (unlikely(n > RPCRDMA_MAX_SEGS))
		return -EIO;
	return n;
}
static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EMSGSIZE;

	xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset);
	return 0;
}

static int
encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
		    u32 position)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 6 * sizeof(*p));
	if (unlikely(!p))
		return -EMSGSIZE;

	*p++ = xdr_one;	/* Item present */
	xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length,
				mr->mr_offset);
	return 0;
}

static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
						 struct rpcrdma_req *req,
						 struct rpcrdma_mr_seg *seg,
						 int nsegs, bool writing,
						 struct rpcrdma_mr **mr)
{
	*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
	if (!*mr) {
		*mr = rpcrdma_mr_get(r_xprt);
		if (!*mr)
			goto out_getmr_err;
		(*mr)->mr_req = req;
	}

	rpcrdma_mr_push(*mr, &req->rl_registered);
	return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);

out_getmr_err:
	trace_xprtrdma_nomrs_err(r_xprt, req);
	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
	rpcrdma_mrs_refresh(r_xprt);
	return ERR_PTR(-EAGAIN);
}
/* Register and XDR encode the Read list. Supports encoding a list of read
 * segments that belong to a single read chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single @pos value is currently supported.
 */
static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct rpc_rqst *rqst,
				    enum rpcrdma_chunktype rtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	unsigned int pos;
	int nsegs;

	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
		goto done;

	pos = rqst->rq_snd_buf.head[0].iov_len;
	if (rtype == rpcrdma_areadch)
		pos = 0;
	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
				     rtype, seg);
	if (nsegs < 0)
		return nsegs;

	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_read_segment(xdr, mr, pos) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
		r_xprt->rx_stats.read_chunk_count++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

done:
	if (xdr_stream_encode_item_absent(xdr) < 0)
		return -EMSGSIZE;
	return 0;
}
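
/* Illustrative example (editorial, not part of the original source):
 * an RPC Call whose argument payload required two MRs, all at XDR
 * stream position P, produces this Read list on the wire, following
 * the PHLOO key above:
 *
 *    1, P, H1, L1, O1, 1, P, H2, L2, O2, 0
 *
 * Each leading 1 marks "another read segment follows"; the final 0
 * (written at the done: label) terminates the Read list.
 */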
/* Register and XDR encode the Write list. Supports encoding a list
 * containing one array of plain segments that belong to a single
 * write chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single Write chunk is currently supported.
 */
static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     struct rpc_rqst *rqst,
				     enum rpcrdma_chunktype wtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	int nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_writech)
		goto done;

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
				     rqst->rq_rcv_buf.head[0].iov_len,
				     wtype, seg);
	if (nsegs < 0)
		return nsegs;

	if (xdr_stream_encode_item_present(xdr) < 0)
		return -EMSGSIZE;
	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
	if (unlikely(!segcount))
		return -EMSGSIZE;
	/* Actual value encoded below */

	nchunks = 0;
	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_rdma_segment(xdr, mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

	if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
		if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
					nsegs);
		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	}

	/* Update count of segments in this Write chunk */
	*segcount = cpu_to_be32(nchunks);

done:
	if (xdr_stream_encode_item_absent(xdr) < 0)
		return -EMSGSIZE;
	return 0;
}
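
/* Illustrative example (editorial, not part of the original source):
 * a Reply payload registered as two MRs, with no XDR pad segment
 * needed, is advertised as a single Write chunk:
 *
 *    1, 2, H1, L1, O1, H2, L2, O2, 0
 *
 * The segment count (2 here) is back-filled through *segcount once
 * the number of registered segments is known; the trailing 0
 * terminates the Write list.
 */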
/* Register and XDR encode the Reply chunk. Supports encoding an array
 * of plain segments that belong to a single write (reply) chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 */
static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
				      struct rpcrdma_req *req,
				      struct rpc_rqst *rqst,
				      enum rpcrdma_chunktype wtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	int nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_replych) {
		if (xdr_stream_encode_item_absent(xdr) < 0)
			return -EMSGSIZE;
		return 0;
	}

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
	if (nsegs < 0)
		return nsegs;

	if (xdr_stream_encode_item_present(xdr) < 0)
		return -EMSGSIZE;
	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
	if (unlikely(!segcount))
		return -EMSGSIZE;
	/* Actual value encoded below */

	nchunks = 0;
	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_rdma_segment(xdr, mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
		r_xprt->rx_stats.reply_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

	/* Update count of segments in the Reply chunk */
	*segcount = cpu_to_be32(nchunks);

	return 0;
}
static void rpcrdma_sendctx_done(struct kref *kref)
{
	struct rpcrdma_req *req =
		container_of(kref, struct rpcrdma_req, rl_kref);
	struct rpcrdma_rep *rep = req->rl_reply;

	rpcrdma_complete_rqst(rep);
	rep->rr_rxprt->rx_stats.reply_waits_for_send++;
}

/**
 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
 * @sc: sendctx containing SGEs to unmap
 *
 */
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
	struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
	struct ib_sge *sge;

	if (!sc->sc_unmap_count)
		return;

	/* The first two SGEs contain the transport header and
	 * the inline buffer. These are always left mapped so
	 * they can be cheaply re-used.
	 */
	for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
	     ++sge, --sc->sc_unmap_count)
		ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
				  DMA_TO_DEVICE);

	kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
}
/* Prepare an SGE for the RPC-over-RDMA transport header.
 */
static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req, u32 len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];

	sge->addr = rdmab_addr(rb);
	sge->length = len;
	sge->lkey = rdmab_lkey(rb);

	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
				      DMA_TO_DEVICE);
}

/* The head iovec is straightforward, as it is usually already
 * DMA-mapped. Sync the content that has changed.
 */
static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, unsigned int len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;

	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
		return false;

	sge->addr = rdmab_addr(rb);
	sge->length = len;
	sge->lkey = rdmab_lkey(rb);

	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
				      DMA_TO_DEVICE);
	return true;
}

/* If there is a page list present, DMA map and prepare an
 * SGE for each page to be sent.
 */
static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
				     struct xdr_buf *xdr)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	unsigned int page_base, len, remaining;
	struct page **ppages;
	struct ib_sge *sge;

	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		sge = &sc->sc_sges[req->rl_wr.num_sge++];
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
					    page_base, len, DMA_TO_DEVICE);
		if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
			goto out_mapping_err;

		sge->length = len;
		sge->lkey = rdmab_lkey(rb);

		sc->sc_unmap_count++;
		ppages++;
		remaining -= len;
		page_base = 0;
	}

	return true;

out_mapping_err:
	trace_xprtrdma_dma_maperr(sge->addr);
	return false;
}
/* The tail iovec may include an XDR pad for the page list,
 * as well as additional content, and may not reside in the
 * same page as the head iovec.
 */
static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
				     struct xdr_buf *xdr,
				     unsigned int page_base, unsigned int len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	struct page *page = virt_to_page(xdr->tail[0].iov_base);

	sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
				    DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
		goto out_mapping_err;

	sge->length = len;
	sge->lkey = rdmab_lkey(rb);
	++sc->sc_unmap_count;
	return true;

out_mapping_err:
	trace_xprtrdma_dma_maperr(sge->addr);
	return false;
}

/* Copy the tail to the end of the head buffer.
 */
static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned char *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len + xdr->page_len;
	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
}

/* Copy pagelist content into the head buffer.
 */
static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned int len, page_base, remaining;
	struct page **ppages;
	unsigned char *src, *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len;
	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		src = page_address(*ppages);
		src += page_base;
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		memcpy(dst, src, len);
		r_xprt->rx_stats.pullup_copy_count += len;

		ppages++;
		dst += len;
		remaining -= len;
		page_base = 0;
	}
}
/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
 * When the head, pagelist, and tail are small, a pull-up copy
 * is considerably less costly than DMA mapping the components
 * of @xdr.
 *
 * Assumptions:
 *  - the caller has already verified that the total length
 *    of the RPC Call body will fit into @rl_sendbuf.
 */
static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	if (unlikely(xdr->tail[0].iov_len))
		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);

	if (unlikely(xdr->page_len))
		rpcrdma_pullup_pagelist(r_xprt, req, xdr);

	/* The whole RPC message resides in the head iovec now */
	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
}

static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	struct kvec *tail = &xdr->tail[0];

	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
		return false;
	if (xdr->page_len)
		if (!rpcrdma_prepare_pagelist(req, xdr))
			return false;
	if (tail->iov_len)
		if (!rpcrdma_prepare_tail_iov(req, xdr,
					      offset_in_page(tail->iov_base),
					      tail->iov_len))
			return false;

	if (req->rl_sendctx->sc_unmap_count)
		kref_get(&req->rl_kref);
	return true;
}

static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
				   struct rpcrdma_req *req,
				   struct xdr_buf *xdr)
{
	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
		return false;

	/* If there is a Read chunk, the page list is being handled
	 * via explicit RDMA, and thus is skipped here.
	 */

	/* Do not include the tail if it is only an XDR pad */
	if (xdr->tail[0].iov_len > 3) {
		unsigned int page_base, len;

		/* If the content in the page list is an odd length,
		 * xdr_write_pages() adds a pad at the beginning of
		 * the tail iovec. Force the tail's non-pad content to
		 * land at the next XDR position in the Send message.
		 */
		page_base = offset_in_page(xdr->tail[0].iov_base);
		len = xdr->tail[0].iov_len;
		page_base += len & 3;
		len -= len & 3;
		if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
			return false;
		kref_get(&req->rl_kref);
	}

	return true;
}
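
/* Illustrative example (editorial, not part of the original source):
 * if the tail iovec above is 7 bytes long and begins with a 3-byte
 * XDR pad left by xdr_write_pages(), then len & 3 == 3, so page_base
 * is advanced past the pad and only the remaining 4 XDR-aligned
 * bytes of real tail content are DMA-mapped into the Send.
 */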
/**
 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
 * @r_xprt: controlling transport
 * @req: context of RPC Call being marshalled
 * @hdrlen: size of transport header, in bytes
 * @xdr: xdr_buf containing RPC Call
 * @rtype: chunk type being encoded
 *
 * Returns 0 on success; otherwise a negative errno is returned.
 */
inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, u32 hdrlen,
				     struct xdr_buf *xdr,
				     enum rpcrdma_chunktype rtype)
{
	int ret;

	ret = -EAGAIN;
	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
	if (!req->rl_sendctx)
		goto out_nosc;
	req->rl_sendctx->sc_unmap_count = 0;
	req->rl_sendctx->sc_req = req;
	kref_init(&req->rl_kref);
	req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
	req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
	req->rl_wr.num_sge = 0;
	req->rl_wr.opcode = IB_WR_SEND;

	rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);

	ret = -EIO;
	switch (rtype) {
	case rpcrdma_noch_pullup:
		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_noch_mapped:
		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_readch:
		if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_areadch:
		break;
	default:
		goto out_unmap;
	}

	return 0;

out_unmap:
	rpcrdma_sendctx_unmap(req->rl_sendctx);
out_nosc:
	trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
	return ret;
}
/**
 * rpcrdma_marshal_req - Marshal and send one RPC request
 * @r_xprt: controlling transport
 * @rqst: RPC request to be marshaled
 *
 * For the RPC in "rqst", this function:
 *  - Chooses the transfer mode (e.g., RDMA_MSG or RDMA_NOMSG)
 *  - Registers Read, Write, and Reply chunks
 *  - Constructs the transport header
 *  - Posts a Send WR to send the transport header and request
 *
 * Returns:
 *	%0 if the RPC was sent successfully,
 *	%-ENOTCONN if the connection was lost,
 *	%-EAGAIN if the caller should call again with the same arguments,
 *	%-ENOBUFS if the caller should call again after a delay,
 *	%-EMSGSIZE if the transport header is too small,
 *	%-EIO if a permanent problem occurred while marshaling.
 */
int
rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct xdr_stream *xdr = &req->rl_stream;
	enum rpcrdma_chunktype rtype, wtype;
	struct xdr_buf *buf = &rqst->rq_snd_buf;
	bool ddp_allowed;
	__be32 *p;
	int ret;

	if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
		ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
		if (ret)
			return ret;
	}

	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
	xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
			rqst);

	/* Fixed header fields */
	ret = -EMSGSIZE;
	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
	if (!p)
		goto out_err;
	*p++ = rqst->rq_xid;
	*p++ = rpcrdma_version;
	*p++ = r_xprt->rx_buf.rb_max_requests;

	/* When the ULP employs a GSS flavor that guarantees integrity
	 * or privacy, direct data placement of individual data items
	 * is not allowed.
	 */
	ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH,
				&rqst->rq_cred->cr_auth->au_flags);

	/*
	 * Chunks needed for results?
	 *
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline.
	 * o Large read ops return data as write chunk(s), header as
	 *   inline.
	 * o Large non-read ops return as a single reply chunk.
	 */
	if (rpcrdma_results_inline(r_xprt, rqst))
		wtype = rpcrdma_noch;
	else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
		 rpcrdma_nonpayload_inline(r_xprt, rqst))
		wtype = rpcrdma_writech;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 *
	 * This assumes that the upper layer does not present a request
	 * that both has a data payload, and whose non-data arguments
	 * by themselves are larger than the inline threshold.
	 */
	if (rpcrdma_args_inline(r_xprt, rqst)) {
		*p++ = rdma_msg;
		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
		*p++ = rdma_msg;
		rtype = rpcrdma_readch;
	} else {
		r_xprt->rx_stats.nomsg_call_count++;
		*p++ = rdma_nomsg;
		rtype = rpcrdma_areadch;
	}

	/* This implementation supports the following combinations
	 * of chunk lists in one RPC-over-RDMA Call message:
	 *
	 *   - Read list
	 *   - Write list
	 *   - Reply chunk
	 *   - Read list + Reply chunk
	 *
	 * It might not yet support the following combinations:
	 *
	 *   - Read list + Write list
	 *
	 * It does not support the following combinations:
	 *
	 *   - Write list + Reply chunk
	 *   - Read list + Write list + Reply chunk
	 *
	 * This implementation supports only a single chunk in each
	 * Read or Write list. Thus for example the client cannot
	 * send a Call message with a Position Zero Read chunk and a
	 * regular Read chunk at the same time.
	 */
	ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
	if (ret)
		goto out_err;
	ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
	if (ret)
		goto out_err;
	ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
	if (ret)
		goto out_err;

	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
					buf, rtype);
	if (ret)
		goto out_err;

	trace_xprtrdma_marshal(req, rtype, wtype);
	return 0;

out_err:
	trace_xprtrdma_marshal_failed(rqst, ret);
	r_xprt->rx_stats.failed_marshal_count++;
	frwr_reset(req);
	return ret;
}
static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
					 struct rpcrdma_buffer *buf,
					 u32 grant)
{
	buf->rb_credits = grant;
	xprt->cwnd = grant << RPC_CWNDSHIFT;
}

static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock(&xprt->transport_lock);
	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
	spin_unlock(&xprt->transport_lock);
}

/**
 * rpcrdma_reset_cwnd - Reset the xprt's congestion window
 * @r_xprt: controlling transport instance
 *
 * Prepare @r_xprt for the next connection by reinitializing
 * its credit grant to one (see RFC 8166, Section 3.3.3).
 */
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock(&xprt->transport_lock);
	xprt->cong = 0;
	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
	spin_unlock(&xprt->transport_lock);
}
/**
 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	unsigned long fixup_copy_count;
	int i, npages, curlen;
	char *destp;
	struct page **ppages;
	int page_base;

	/* The head iovec is redirected to the RPC reply message
	 * in the receive buffer, to avoid a memcopy.
	 */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	rqst->rq_private_buf.head[0].iov_base = srcp;

	/* The contents of the receive buffer that follow
	 * head.iov_len bytes are copied into the page list.
	 */
	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len)
		curlen = copy_len;
	srcp += curlen;
	copy_len -= curlen;

	ppages = rqst->rq_rcv_buf.pages +
		(rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
	page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
	fixup_copy_count = 0;
	if (copy_len && rqst->rq_rcv_buf.page_len) {
		int pagelist_len;

		pagelist_len = rqst->rq_rcv_buf.page_len;
		if (pagelist_len > copy_len)
			pagelist_len = copy_len;
		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
		for (i = 0; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > pagelist_len)
				curlen = pagelist_len;

			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			fixup_copy_count += curlen;
			pagelist_len -= curlen;
			if (!pagelist_len)
				break;
			page_base = 0;
		}

		/* Implicit padding for the last segment in a Write
		 * chunk is inserted inline at the front of the tail
		 * iovec. The upper layer ignores the content of
		 * the pad. Simply ensure inline content in the tail
		 * that follows the Write chunk is properly aligned.
		 */
		if (pad)
			srcp -= pad;
	}

	/* The tail iovec is redirected to the remaining data
	 * in the receive buffer, to avoid a memcopy.
	 */
	if (copy_len || pad) {
		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
		rqst->rq_private_buf.tail[0].iov_base = srcp;
	}

	if (fixup_copy_count)
		trace_xprtrdma_fixup(rqst, fixup_copy_count);
	return fixup_copy_count;
}
/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct xdr_stream *xdr = &rep->rr_stream;
	__be32 *p;

	if (rep->rr_proc != rdma_msg)
		return false;

	/* Peek at stream contents without advancing. */
	p = xdr_inline_decode(xdr, 0);

	/* Chunk lists */
	if (xdr_item_is_present(p++))
		return false;
	if (xdr_item_is_present(p++))
		return false;
	if (xdr_item_is_present(p++))
		return false;

	/* RPC header */
	if (*p++ != rep->rr_xid)
		return false;
	if (*p != cpu_to_be32(RPC_CALL))
		return false;

	/* No bc service. */
	if (xprt->bc_serv == NULL)
		return false;

	/* Now that we are sure this is a backchannel call,
	 * advance to the RPC header.
	 */
	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
	if (unlikely(!p))
		return true;

	rpcrdma_bc_receive_call(r_xprt, rep);
	return true;
}
#else	/* CONFIG_SUNRPC_BACKCHANNEL */
{
	return false;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */
static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
{
	u32 handle;
	u64 offset;
	__be32 *p;

	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	xdr_decode_rdma_segment(p, &handle, length, &offset);
	trace_xprtrdma_decode_seg(handle, *length, offset);
	return 0;
}

static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
{
	u32 segcount, seglength;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	*length = 0;
	segcount = be32_to_cpup(p);
	while (segcount--) {
		if (decode_rdma_segment(xdr, &seglength))
			return -EIO;
		*length += seglength;
	}

	return 0;
}
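
/* Illustrative example (editorial, not part of the original source):
 * decoding a Write chunk advertised as three segments of 4096, 4096,
 * and 1024 bytes leaves *length == 9216, i.e. the total number of
 * bytes the responder may have written via RDMA for this chunk.
 */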
/* In RPC-over-RDMA Version One replies, a Read list is never
 * expected. This decoder is a stub that returns an error if
 * a Read list is present.
 */
static int decode_read_list(struct xdr_stream *xdr)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;
	if (unlikely(xdr_item_is_present(p)))
		return -EIO;
	return 0;
}

/* Supports only one Write chunk in the Write list
 */
static int decode_write_list(struct xdr_stream *xdr, u32 *length)
{
	u32 chunklen;
	bool first;
	__be32 *p;

	*length = 0;
	first = true;
	do {
		p = xdr_inline_decode(xdr, sizeof(*p));
		if (unlikely(!p))
			return -EIO;
		if (xdr_item_is_absent(p))
			break;
		if (!first)
			return -EIO;

		if (decode_write_chunk(xdr, &chunklen))
			return -EIO;
		*length += chunklen;
		first = false;
	} while (true);
	return 0;
}

static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	*length = 0;
	if (xdr_item_is_present(p))
		if (decode_write_chunk(xdr, length))
			return -EIO;
	return 0;
}
static int
rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		   struct rpc_rqst *rqst)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	u32 writelist, replychunk, rpclen;
	char *base;

	/* Decode the chunk lists */
	if (decode_read_list(xdr))
		return -EIO;
	if (decode_write_list(xdr, &writelist))
		return -EIO;
	if (decode_reply_chunk(xdr, &replychunk))
		return -EIO;

	/* RDMA_MSG sanity checks */
	if (unlikely(replychunk))
		return -EIO;

	/* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
	base = (char *)xdr_inline_decode(xdr, 0);
	rpclen = xdr_stream_remaining(xdr);
	r_xprt->rx_stats.fixup_copy_count +=
		rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);

	r_xprt->rx_stats.total_rdma_reply += writelist;
	return rpclen + xdr_align_size(writelist);
}

static noinline int
rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	u32 writelist, replychunk;

	/* Decode the chunk lists */
	if (decode_read_list(xdr))
		return -EIO;
	if (decode_write_list(xdr, &writelist))
		return -EIO;
	if (decode_reply_chunk(xdr, &replychunk))
		return -EIO;

	/* RDMA_NOMSG sanity checks */
	if (unlikely(writelist))
		return -EIO;
	if (unlikely(!replychunk))
		return -EIO;

	/* Reply chunk buffer already is the reply vector */
	r_xprt->rx_stats.total_rdma_reply += replychunk;
	return replychunk;
}

static noinline int
rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		     struct rpc_rqst *rqst)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	switch (*p) {
	case err_vers:
		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
		if (!p)
			break;
		trace_xprtrdma_err_vers(rqst, p, p + 1);
		break;
	case err_chunk:
		trace_xprtrdma_err_chunk(rqst);
		break;
	default:
		trace_xprtrdma_err_unrecognized(rqst, p);
	}

	return -EIO;
}
/**
 * rpcrdma_unpin_rqst - Release rqst without completing it
 * @rep: RPC/RDMA Receive context
 *
 * This is done when a connection is lost so that a Reply
 * can be dropped and its matching Call can be subsequently
 * retransmitted on a new connection.
 */
void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
{
	struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
	struct rpc_rqst *rqst = rep->rr_rqst;
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);

	req->rl_reply = NULL;
	rep->rr_rqst = NULL;

	spin_lock(&xprt->queue_lock);
	xprt_unpin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);
}

/**
 * rpcrdma_complete_rqst - Pass completed rqst back to RPC
 * @rep: RPC/RDMA Receive context
 *
 * Reconstruct the RPC reply and complete the transaction
 * while @rqst is still pinned to ensure the rep, rqst, and
 * rq_task pointers remain stable.
 */
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
{
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct rpc_rqst *rqst = rep->rr_rqst;
	int status;

	switch (rep->rr_proc) {
	case rdma_msg:
		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
		break;
	case rdma_nomsg:
		status = rpcrdma_decode_nomsg(r_xprt, rep);
		break;
	case rdma_error:
		status = rpcrdma_decode_error(r_xprt, rep, rqst);
		break;
	default:
		status = -EIO;
	}
	if (status < 0)
		goto out_badheader;

out:
	spin_lock(&xprt->queue_lock);
	xprt_complete_rqst(rqst->rq_task, status);
	xprt_unpin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);
	return;

out_badheader:
	trace_xprtrdma_reply_hdr_err(rep);
	r_xprt->rx_stats.bad_reply_count++;
	rqst->rq_task->tk_status = status;
	status = 0;
	goto out;
}

static void rpcrdma_reply_done(struct kref *kref)
{
	struct rpcrdma_req *req =
		container_of(kref, struct rpcrdma_req, rl_kref);

	rpcrdma_complete_rqst(req->rl_reply);
}
/**
 * rpcrdma_reply_handler - Process received RPC/RDMA messages
 * @rep: Incoming rpcrdma_rep object to process
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	u32 credits;
	__be32 *p;

	/* Any data means we had a useful conversation, so
	 * then we don't need to delay the next reconnect.
	 */
	if (xprt->reestablish_timeout)
		xprt->reestablish_timeout = 0;

	/* Fixed transport header fields */
	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
			rep->rr_hdrbuf.head[0].iov_base, NULL);
	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
	if (unlikely(!p))
		goto out_shortreply;
	rep->rr_xid = *p++;
	rep->rr_vers = *p++;
	credits = be32_to_cpu(*p++);
	rep->rr_proc = *p++;

	if (rep->rr_vers != rpcrdma_version)
		goto out_badversion;

	if (rpcrdma_is_bcall(r_xprt, rep))
		return;

	/* Match incoming rpcrdma_rep to an rpcrdma_req to
	 * get context for handling any incoming chunks.
	 */
	spin_lock(&xprt->queue_lock);
	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
	if (!rqst)
		goto out_norqst;
	xprt_pin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);

	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > r_xprt->rx_ep->re_max_requests)
		credits = r_xprt->rx_ep->re_max_requests;
	rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
			   false);
	if (buf->rb_credits != credits)
		rpcrdma_update_cwnd(r_xprt, credits);

	req = rpcr_to_rdmar(rqst);
	if (unlikely(req->rl_reply))
		rpcrdma_rep_put(buf, req->rl_reply);
	req->rl_reply = rep;
	rep->rr_rqst = rqst;

	trace_xprtrdma_reply(rqst->rq_task, rep, credits);

	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
		frwr_reminv(rep, &req->rl_registered);
	if (!list_empty(&req->rl_registered))
		frwr_unmap_async(r_xprt, req);
		/* LocalInv completion will complete the RPC */
	else
		kref_put(&req->rl_kref, rpcrdma_reply_done);
	return;

out_badversion:
	trace_xprtrdma_reply_vers_err(rep);
	goto out;

out_norqst:
	spin_unlock(&xprt->queue_lock);
	trace_xprtrdma_reply_rqst_err(rep);
	goto out;

out_shortreply:
	trace_xprtrdma_reply_short_err(rep);

out:
	rpcrdma_rep_put(buf, rep);
}