rdma.c
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * NVMe over Fabrics RDMA host code.
  4. * Copyright (c) 2015-2016 HGST, a Western Digital Company.
  5. */
  6. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7. #include <linux/module.h>
  8. #include <linux/init.h>
  9. #include <linux/slab.h>
  10. #include <rdma/mr_pool.h>
  11. #include <linux/err.h>
  12. #include <linux/string.h>
  13. #include <linux/atomic.h>
  14. #include <linux/blk-mq.h>
  15. #include <linux/blk-mq-rdma.h>
  16. #include <linux/blk-integrity.h>
  17. #include <linux/types.h>
  18. #include <linux/list.h>
  19. #include <linux/mutex.h>
  20. #include <linux/scatterlist.h>
  21. #include <linux/nvme.h>
  22. #include <asm/unaligned.h>
  23. #include <rdma/ib_verbs.h>
  24. #include <rdma/rdma_cm.h>
  25. #include <linux/nvme-rdma.h>
  26. #include "nvme.h"
  27. #include "fabrics.h"
28. #define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 seconds */
  29. #define NVME_RDMA_MAX_SEGMENTS 256
  30. #define NVME_RDMA_MAX_INLINE_SEGMENTS 4
  31. #define NVME_RDMA_DATA_SGL_SIZE \
  32. (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
  33. #define NVME_RDMA_METADATA_SGL_SIZE \
  34. (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)
  35. struct nvme_rdma_device {
  36. struct ib_device *dev;
  37. struct ib_pd *pd;
  38. struct kref ref;
  39. struct list_head entry;
  40. unsigned int num_inline_segments;
  41. };
  42. struct nvme_rdma_qe {
  43. struct ib_cqe cqe;
  44. void *data;
  45. u64 dma;
  46. };
  47. struct nvme_rdma_sgl {
  48. int nents;
  49. struct sg_table sg_table;
  50. };
  51. struct nvme_rdma_queue;
  52. struct nvme_rdma_request {
  53. struct nvme_request req;
  54. struct ib_mr *mr;
  55. struct nvme_rdma_qe sqe;
  56. union nvme_result result;
  57. __le16 status;
  58. refcount_t ref;
  59. struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
  60. u32 num_sge;
  61. struct ib_reg_wr reg_wr;
  62. struct ib_cqe reg_cqe;
  63. struct nvme_rdma_queue *queue;
  64. struct nvme_rdma_sgl data_sgl;
  65. struct nvme_rdma_sgl *metadata_sgl;
  66. bool use_sig_mr;
  67. };
  68. enum nvme_rdma_queue_flags {
  69. NVME_RDMA_Q_ALLOCATED = 0,
  70. NVME_RDMA_Q_LIVE = 1,
  71. NVME_RDMA_Q_TR_READY = 2,
  72. };
  73. struct nvme_rdma_queue {
  74. struct nvme_rdma_qe *rsp_ring;
  75. int queue_size;
  76. size_t cmnd_capsule_len;
  77. struct nvme_rdma_ctrl *ctrl;
  78. struct nvme_rdma_device *device;
  79. struct ib_cq *ib_cq;
  80. struct ib_qp *qp;
  81. unsigned long flags;
  82. struct rdma_cm_id *cm_id;
  83. int cm_error;
  84. struct completion cm_done;
  85. bool pi_support;
  86. int cq_size;
  87. struct mutex queue_lock;
  88. };
  89. struct nvme_rdma_ctrl {
  90. /* read only in the hot path */
  91. struct nvme_rdma_queue *queues;
  92. /* other member variables */
  93. struct blk_mq_tag_set tag_set;
  94. struct work_struct err_work;
  95. struct nvme_rdma_qe async_event_sqe;
  96. struct delayed_work reconnect_work;
  97. struct list_head list;
  98. struct blk_mq_tag_set admin_tag_set;
  99. struct nvme_rdma_device *device;
  100. u32 max_fr_pages;
  101. struct sockaddr_storage addr;
  102. struct sockaddr_storage src_addr;
  103. struct nvme_ctrl ctrl;
  104. bool use_inline_data;
  105. u32 io_queues[HCTX_MAX_TYPES];
  106. };
  107. static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
  108. {
  109. return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
  110. }
  111. static LIST_HEAD(device_list);
  112. static DEFINE_MUTEX(device_list_mutex);
  113. static LIST_HEAD(nvme_rdma_ctrl_list);
  114. static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
  115. /*
116. * Disabling this option makes small I/O go faster, but is fundamentally
  117. * unsafe. With it turned off we will have to register a global rkey that
  118. * allows read and write access to all physical memory.
  119. */
  120. static bool register_always = true;
  121. module_param(register_always, bool, 0444);
  122. MODULE_PARM_DESC(register_always,
  123. "Use memory registration even for contiguous memory regions");
  124. static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
  125. struct rdma_cm_event *event);
  126. static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
  127. static void nvme_rdma_complete_rq(struct request *rq);
  128. static const struct blk_mq_ops nvme_rdma_mq_ops;
  129. static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
  130. static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
  131. {
  132. return queue - queue->ctrl->queues;
  133. }
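/*
 * I/O queues are laid out as default (write) queues, then read queues,
 * then poll queues; any queue index beyond the default + read ranges
 * belongs to a poll queue.
 */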
  134. static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
  135. {
  136. return nvme_rdma_queue_idx(queue) >
  137. queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
  138. queue->ctrl->io_queues[HCTX_TYPE_READ];
  139. }
  140. static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
  141. {
  142. return queue->cmnd_capsule_len - sizeof(struct nvme_command);
  143. }
  144. static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
  145. size_t capsule_size, enum dma_data_direction dir)
  146. {
  147. ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
  148. kfree(qe->data);
  149. }
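/* Allocate a queue element and DMA-map it as a command/response buffer. */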
  150. static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
  151. size_t capsule_size, enum dma_data_direction dir)
  152. {
  153. qe->data = kzalloc(capsule_size, GFP_KERNEL);
  154. if (!qe->data)
  155. return -ENOMEM;
  156. qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
  157. if (ib_dma_mapping_error(ibdev, qe->dma)) {
  158. kfree(qe->data);
  159. qe->data = NULL;
  160. return -ENOMEM;
  161. }
  162. return 0;
  163. }
  164. static void nvme_rdma_free_ring(struct ib_device *ibdev,
  165. struct nvme_rdma_qe *ring, size_t ib_queue_size,
  166. size_t capsule_size, enum dma_data_direction dir)
  167. {
  168. int i;
  169. for (i = 0; i < ib_queue_size; i++)
  170. nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
  171. kfree(ring);
  172. }
  173. static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
  174. size_t ib_queue_size, size_t capsule_size,
  175. enum dma_data_direction dir)
  176. {
  177. struct nvme_rdma_qe *ring;
  178. int i;
  179. ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
  180. if (!ring)
  181. return NULL;
  182. /*
  183. * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
184. * lifetime. It's safe, since any change in the underlying RDMA device
  185. * will issue error recovery and queue re-creation.
  186. */
  187. for (i = 0; i < ib_queue_size; i++) {
  188. if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
  189. goto out_free_ring;
  190. }
  191. return ring;
  192. out_free_ring:
  193. nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
  194. return NULL;
  195. }
  196. static void nvme_rdma_qp_event(struct ib_event *event, void *context)
  197. {
  198. pr_debug("QP event %s (%d)\n",
  199. ib_event_msg(event->event), event->event);
  200. }
  201. static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
  202. {
  203. int ret;
  204. ret = wait_for_completion_interruptible(&queue->cm_done);
  205. if (ret)
  206. return ret;
  207. WARN_ON_ONCE(queue->cm_error > 0);
  208. return queue->cm_error;
  209. }
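/*
 * Create an RC QP on the queue's CM ID. 'factor' scales the send WR
 * budget (MR, SEND and LOCAL_INV per request); one extra WR is reserved
 * on each side for draining the QP.
 */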
  210. static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
  211. {
  212. struct nvme_rdma_device *dev = queue->device;
  213. struct ib_qp_init_attr init_attr;
  214. int ret;
  215. memset(&init_attr, 0, sizeof(init_attr));
  216. init_attr.event_handler = nvme_rdma_qp_event;
  217. /* +1 for drain */
  218. init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
  219. /* +1 for drain */
  220. init_attr.cap.max_recv_wr = queue->queue_size + 1;
  221. init_attr.cap.max_recv_sge = 1;
  222. init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
  223. init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
  224. init_attr.qp_type = IB_QPT_RC;
  225. init_attr.send_cq = queue->ib_cq;
  226. init_attr.recv_cq = queue->ib_cq;
  227. if (queue->pi_support)
  228. init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
  229. init_attr.qp_context = queue;
  230. ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
  231. queue->qp = queue->cm_id->qp;
  232. return ret;
  233. }
  234. static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
  235. struct request *rq, unsigned int hctx_idx)
  236. {
  237. struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
  238. kfree(req->sqe.data);
  239. }
  240. static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
  241. struct request *rq, unsigned int hctx_idx,
  242. unsigned int numa_node)
  243. {
  244. struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
  245. struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
  246. int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
  247. struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
  248. nvme_req(rq)->ctrl = &ctrl->ctrl;
  249. req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
  250. if (!req->sqe.data)
  251. return -ENOMEM;
  252. /* metadata nvme_rdma_sgl struct is located after command's data SGL */
  253. if (queue->pi_support)
  254. req->metadata_sgl = (void *)nvme_req(rq) +
  255. sizeof(struct nvme_rdma_request) +
  256. NVME_RDMA_DATA_SGL_SIZE;
  257. req->queue = queue;
  258. nvme_req(rq)->cmd = req->sqe.data;
  259. return 0;
  260. }
  261. static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
  262. unsigned int hctx_idx)
  263. {
  264. struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data);
  265. struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
  266. BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
  267. hctx->driver_data = queue;
  268. return 0;
  269. }
  270. static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
  271. unsigned int hctx_idx)
  272. {
  273. struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data);
  274. struct nvme_rdma_queue *queue = &ctrl->queues[0];
  275. BUG_ON(hctx_idx != 0);
  276. hctx->driver_data = queue;
  277. return 0;
  278. }
  279. static void nvme_rdma_free_dev(struct kref *ref)
  280. {
  281. struct nvme_rdma_device *ndev =
  282. container_of(ref, struct nvme_rdma_device, ref);
  283. mutex_lock(&device_list_mutex);
  284. list_del(&ndev->entry);
  285. mutex_unlock(&device_list_mutex);
  286. ib_dealloc_pd(ndev->pd);
  287. kfree(ndev);
  288. }
  289. static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
  290. {
  291. kref_put(&dev->ref, nvme_rdma_free_dev);
  292. }
  293. static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
  294. {
  295. return kref_get_unless_zero(&dev->ref);
  296. }
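/*
 * Find the nvme_rdma_device for this CM ID's ib_device (matched by node
 * GUID) and take a reference, or allocate a new one with its PD on first
 * use. The device list is protected by device_list_mutex.
 */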
  297. static struct nvme_rdma_device *
  298. nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
  299. {
  300. struct nvme_rdma_device *ndev;
  301. mutex_lock(&device_list_mutex);
  302. list_for_each_entry(ndev, &device_list, entry) {
  303. if (ndev->dev->node_guid == cm_id->device->node_guid &&
  304. nvme_rdma_dev_get(ndev))
  305. goto out_unlock;
  306. }
  307. ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
  308. if (!ndev)
  309. goto out_err;
  310. ndev->dev = cm_id->device;
  311. kref_init(&ndev->ref);
  312. ndev->pd = ib_alloc_pd(ndev->dev,
  313. register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
  314. if (IS_ERR(ndev->pd))
  315. goto out_free_dev;
  316. if (!(ndev->dev->attrs.device_cap_flags &
  317. IB_DEVICE_MEM_MGT_EXTENSIONS)) {
  318. dev_err(&ndev->dev->dev,
  319. "Memory registrations not supported.\n");
  320. goto out_free_pd;
  321. }
  322. ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
  323. ndev->dev->attrs.max_send_sge - 1);
  324. list_add(&ndev->entry, &device_list);
  325. out_unlock:
  326. mutex_unlock(&device_list_mutex);
  327. return ndev;
  328. out_free_pd:
  329. ib_dealloc_pd(ndev->pd);
  330. out_free_dev:
  331. kfree(ndev);
  332. out_err:
  333. mutex_unlock(&device_list_mutex);
  334. return NULL;
  335. }
  336. static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
  337. {
  338. if (nvme_rdma_poll_queue(queue))
  339. ib_free_cq(queue->ib_cq);
  340. else
  341. ib_cq_pool_put(queue->ib_cq, queue->cq_size);
  342. }
  343. static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
  344. {
  345. struct nvme_rdma_device *dev;
  346. struct ib_device *ibdev;
  347. if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
  348. return;
  349. dev = queue->device;
  350. ibdev = dev->dev;
  351. if (queue->pi_support)
  352. ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
  353. ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
  354. /*
355. * The cm_id object might have been destroyed during the RDMA connection
356. * establishment error flow (to avoid getting further cma events), so
357. * the QP must not be destroyed through the rdma_cm API.
  358. */
  359. ib_destroy_qp(queue->qp);
  360. nvme_rdma_free_cq(queue);
  361. nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
  362. sizeof(struct nvme_completion), DMA_FROM_DEVICE);
  363. nvme_rdma_dev_put(dev);
  364. }
  365. static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
  366. {
  367. u32 max_page_list_len;
  368. if (pi_support)
  369. max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
  370. else
  371. max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;
  372. return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
  373. }
  374. static int nvme_rdma_create_cq(struct ib_device *ibdev,
  375. struct nvme_rdma_queue *queue)
  376. {
  377. int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
  378. enum ib_poll_context poll_ctx;
  379. /*
380. * Spread I/O queue completion vectors according to their queue index.
  381. * Admin queues can always go on completion vector 0.
  382. */
  383. comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
  384. /* Polling queues need direct cq polling context */
  385. if (nvme_rdma_poll_queue(queue)) {
  386. poll_ctx = IB_POLL_DIRECT;
  387. queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
  388. comp_vector, poll_ctx);
  389. } else {
  390. poll_ctx = IB_POLL_SOFTIRQ;
  391. queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
  392. comp_vector, poll_ctx);
  393. }
  394. if (IS_ERR(queue->ib_cq)) {
  395. ret = PTR_ERR(queue->ib_cq);
  396. return ret;
  397. }
  398. return 0;
  399. }
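/*
 * Set up the RDMA resources of a queue: take a device reference, create
 * the CQ and QP, allocate the response ring and initialize the MR pool
 * (plus a PI MR pool when metadata offload is supported).
 */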
  400. static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
  401. {
  402. struct ib_device *ibdev;
  403. const int send_wr_factor = 3; /* MR, SEND, INV */
  404. const int cq_factor = send_wr_factor + 1; /* + RECV */
  405. int ret, pages_per_mr;
  406. queue->device = nvme_rdma_find_get_device(queue->cm_id);
  407. if (!queue->device) {
  408. dev_err(queue->cm_id->device->dev.parent,
  409. "no client data found!\n");
  410. return -ECONNREFUSED;
  411. }
  412. ibdev = queue->device->dev;
  413. /* +1 for ib_stop_cq */
  414. queue->cq_size = cq_factor * queue->queue_size + 1;
  415. ret = nvme_rdma_create_cq(ibdev, queue);
  416. if (ret)
  417. goto out_put_dev;
  418. ret = nvme_rdma_create_qp(queue, send_wr_factor);
  419. if (ret)
  420. goto out_destroy_ib_cq;
  421. queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
  422. sizeof(struct nvme_completion), DMA_FROM_DEVICE);
  423. if (!queue->rsp_ring) {
  424. ret = -ENOMEM;
  425. goto out_destroy_qp;
  426. }
  427. /*
428. * Currently we don't use SG_GAPS MRs, so if the first entry is
429. * misaligned we'll end up using two entries for a single data page;
430. * hence one additional entry is required.
  431. */
  432. pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
  433. ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
  434. queue->queue_size,
  435. IB_MR_TYPE_MEM_REG,
  436. pages_per_mr, 0);
  437. if (ret) {
  438. dev_err(queue->ctrl->ctrl.device,
  439. "failed to initialize MR pool sized %d for QID %d\n",
  440. queue->queue_size, nvme_rdma_queue_idx(queue));
  441. goto out_destroy_ring;
  442. }
  443. if (queue->pi_support) {
  444. ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
  445. queue->queue_size, IB_MR_TYPE_INTEGRITY,
  446. pages_per_mr, pages_per_mr);
  447. if (ret) {
  448. dev_err(queue->ctrl->ctrl.device,
  449. "failed to initialize PI MR pool sized %d for QID %d\n",
  450. queue->queue_size, nvme_rdma_queue_idx(queue));
  451. goto out_destroy_mr_pool;
  452. }
  453. }
  454. set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
  455. return 0;
  456. out_destroy_mr_pool:
  457. ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
  458. out_destroy_ring:
  459. nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
  460. sizeof(struct nvme_completion), DMA_FROM_DEVICE);
  461. out_destroy_qp:
  462. rdma_destroy_qp(queue->cm_id);
  463. out_destroy_ib_cq:
  464. nvme_rdma_free_cq(queue);
  465. out_put_dev:
  466. nvme_rdma_dev_put(queue->device);
  467. return ret;
  468. }
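/*
 * Allocate one queue: create its CM ID, start resolving the controller
 * address and wait for the CM state machine to complete connection
 * establishment.
 */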
  469. static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
  470. int idx, size_t queue_size)
  471. {
  472. struct nvme_rdma_queue *queue;
  473. struct sockaddr *src_addr = NULL;
  474. int ret;
  475. queue = &ctrl->queues[idx];
  476. mutex_init(&queue->queue_lock);
  477. queue->ctrl = ctrl;
  478. if (idx && ctrl->ctrl.max_integrity_segments)
  479. queue->pi_support = true;
  480. else
  481. queue->pi_support = false;
  482. init_completion(&queue->cm_done);
  483. if (idx > 0)
  484. queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
  485. else
  486. queue->cmnd_capsule_len = sizeof(struct nvme_command);
  487. queue->queue_size = queue_size;
  488. queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
  489. RDMA_PS_TCP, IB_QPT_RC);
  490. if (IS_ERR(queue->cm_id)) {
  491. dev_info(ctrl->ctrl.device,
  492. "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
  493. ret = PTR_ERR(queue->cm_id);
  494. goto out_destroy_mutex;
  495. }
  496. if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
  497. src_addr = (struct sockaddr *)&ctrl->src_addr;
  498. queue->cm_error = -ETIMEDOUT;
  499. ret = rdma_resolve_addr(queue->cm_id, src_addr,
  500. (struct sockaddr *)&ctrl->addr,
  501. NVME_RDMA_CM_TIMEOUT_MS);
  502. if (ret) {
  503. dev_info(ctrl->ctrl.device,
  504. "rdma_resolve_addr failed (%d).\n", ret);
  505. goto out_destroy_cm_id;
  506. }
  507. ret = nvme_rdma_wait_for_cm(queue);
  508. if (ret) {
  509. dev_info(ctrl->ctrl.device,
  510. "rdma connection establishment failed (%d)\n", ret);
  511. goto out_destroy_cm_id;
  512. }
  513. set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
  514. return 0;
  515. out_destroy_cm_id:
  516. rdma_destroy_id(queue->cm_id);
  517. nvme_rdma_destroy_queue_ib(queue);
  518. out_destroy_mutex:
  519. mutex_destroy(&queue->queue_lock);
  520. return ret;
  521. }
  522. static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
  523. {
  524. rdma_disconnect(queue->cm_id);
  525. ib_drain_qp(queue->qp);
  526. }
  527. static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
  528. {
  529. if (!test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
  530. return;
  531. mutex_lock(&queue->queue_lock);
  532. if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
  533. __nvme_rdma_stop_queue(queue);
  534. mutex_unlock(&queue->queue_lock);
  535. }
  536. static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
  537. {
  538. if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
  539. return;
  540. rdma_destroy_id(queue->cm_id);
  541. nvme_rdma_destroy_queue_ib(queue);
  542. mutex_destroy(&queue->queue_lock);
  543. }
  544. static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
  545. {
  546. int i;
  547. for (i = 1; i < ctrl->ctrl.queue_count; i++)
  548. nvme_rdma_free_queue(&ctrl->queues[i]);
  549. }
  550. static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
  551. {
  552. int i;
  553. for (i = 1; i < ctrl->ctrl.queue_count; i++)
  554. nvme_rdma_stop_queue(&ctrl->queues[i]);
  555. }
  556. static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
  557. {
  558. struct nvme_rdma_queue *queue = &ctrl->queues[idx];
  559. int ret;
  560. if (idx)
  561. ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
  562. else
  563. ret = nvmf_connect_admin_queue(&ctrl->ctrl);
  564. if (!ret) {
  565. set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
  566. } else {
  567. if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
  568. __nvme_rdma_stop_queue(queue);
  569. dev_info(ctrl->ctrl.device,
  570. "failed to connect queue: %d ret=%d\n", idx, ret);
  571. }
  572. return ret;
  573. }
  574. static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl,
  575. int first, int last)
  576. {
  577. int i, ret = 0;
  578. for (i = first; i < last; i++) {
  579. ret = nvme_rdma_start_queue(ctrl, i);
  580. if (ret)
  581. goto out_stop_queues;
  582. }
  583. return 0;
  584. out_stop_queues:
  585. for (i--; i >= first; i--)
  586. nvme_rdma_stop_queue(&ctrl->queues[i]);
  587. return ret;
  588. }
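/*
 * Negotiate the I/O queue count with the controller and split it between
 * default (write), read and poll queues before allocating each queue.
 */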
  589. static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
  590. {
  591. struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
  592. struct ib_device *ibdev = ctrl->device->dev;
  593. unsigned int nr_io_queues, nr_default_queues;
  594. unsigned int nr_read_queues, nr_poll_queues;
  595. int i, ret;
  596. nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
  597. min(opts->nr_io_queues, num_online_cpus()));
  598. nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
  599. min(opts->nr_write_queues, num_online_cpus()));
  600. nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
  601. nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
  602. ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
  603. if (ret)
  604. return ret;
  605. if (nr_io_queues == 0) {
  606. dev_err(ctrl->ctrl.device,
  607. "unable to set any I/O queues\n");
  608. return -ENOMEM;
  609. }
  610. ctrl->ctrl.queue_count = nr_io_queues + 1;
  611. dev_info(ctrl->ctrl.device,
  612. "creating %d I/O queues.\n", nr_io_queues);
  613. if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
  614. /*
  615. * separate read/write queues
  616. * hand out dedicated default queues only after we have
  617. * sufficient read queues.
  618. */
  619. ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
  620. nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
  621. ctrl->io_queues[HCTX_TYPE_DEFAULT] =
  622. min(nr_default_queues, nr_io_queues);
  623. nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
  624. } else {
  625. /*
  626. * shared read/write queues
  627. * either no write queues were requested, or we don't have
  628. * sufficient queue count to have dedicated default queues.
  629. */
  630. ctrl->io_queues[HCTX_TYPE_DEFAULT] =
  631. min(nr_read_queues, nr_io_queues);
  632. nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
  633. }
  634. if (opts->nr_poll_queues && nr_io_queues) {
  635. /* map dedicated poll queues only if we have queues left */
  636. ctrl->io_queues[HCTX_TYPE_POLL] =
  637. min(nr_poll_queues, nr_io_queues);
  638. }
  639. for (i = 1; i < ctrl->ctrl.queue_count; i++) {
  640. ret = nvme_rdma_alloc_queue(ctrl, i,
  641. ctrl->ctrl.sqsize + 1);
  642. if (ret)
  643. goto out_free_queues;
  644. }
  645. return 0;
  646. out_free_queues:
  647. for (i--; i >= 1; i--)
  648. nvme_rdma_free_queue(&ctrl->queues[i]);
  649. return ret;
  650. }
  651. static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl)
  652. {
  653. unsigned int cmd_size = sizeof(struct nvme_rdma_request) +
  654. NVME_RDMA_DATA_SGL_SIZE;
  655. if (ctrl->max_integrity_segments)
  656. cmd_size += sizeof(struct nvme_rdma_sgl) +
  657. NVME_RDMA_METADATA_SGL_SIZE;
  658. return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set,
  659. &nvme_rdma_mq_ops,
  660. ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
  661. cmd_size);
  662. }
  663. static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
  664. {
  665. if (ctrl->async_event_sqe.data) {
  666. cancel_work_sync(&ctrl->ctrl.async_event_work);
  667. nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
  668. sizeof(struct nvme_command), DMA_TO_DEVICE);
  669. ctrl->async_event_sqe.data = NULL;
  670. }
  671. nvme_rdma_free_queue(&ctrl->queues[0]);
  672. }
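/*
 * Bring up the admin queue: allocate it, derive the MR page limits from
 * the device, map the async event SQE, allocate the admin tag set for a
 * new controller, connect the queue and finish controller initialization.
 */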
  673. static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
  674. bool new)
  675. {
  676. bool pi_capable = false;
  677. int error;
  678. error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
  679. if (error)
  680. return error;
  681. ctrl->device = ctrl->queues[0].device;
  682. ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);
  683. /* T10-PI support */
  684. if (ctrl->device->dev->attrs.kernel_cap_flags &
  685. IBK_INTEGRITY_HANDOVER)
  686. pi_capable = true;
  687. ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
  688. pi_capable);
  689. /*
  690. * Bind the async event SQE DMA mapping to the admin queue lifetime.
691. * It's safe, since any change in the underlying RDMA device will issue
  692. * error recovery and queue re-creation.
  693. */
  694. error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
  695. sizeof(struct nvme_command), DMA_TO_DEVICE);
  696. if (error)
  697. goto out_free_queue;
  698. if (new) {
  699. error = nvme_alloc_admin_tag_set(&ctrl->ctrl,
  700. &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops,
  701. sizeof(struct nvme_rdma_request) +
  702. NVME_RDMA_DATA_SGL_SIZE);
  703. if (error)
  704. goto out_free_async_qe;
  705. }
  706. error = nvme_rdma_start_queue(ctrl, 0);
  707. if (error)
  708. goto out_remove_admin_tag_set;
  709. error = nvme_enable_ctrl(&ctrl->ctrl);
  710. if (error)
  711. goto out_stop_queue;
  712. ctrl->ctrl.max_segments = ctrl->max_fr_pages;
  713. ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
  714. if (pi_capable)
  715. ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
  716. else
  717. ctrl->ctrl.max_integrity_segments = 0;
  718. nvme_start_admin_queue(&ctrl->ctrl);
  719. error = nvme_init_ctrl_finish(&ctrl->ctrl);
  720. if (error)
  721. goto out_quiesce_queue;
  722. return 0;
  723. out_quiesce_queue:
  724. nvme_stop_admin_queue(&ctrl->ctrl);
  725. blk_sync_queue(ctrl->ctrl.admin_q);
  726. out_stop_queue:
  727. nvme_rdma_stop_queue(&ctrl->queues[0]);
  728. nvme_cancel_admin_tagset(&ctrl->ctrl);
  729. out_remove_admin_tag_set:
  730. if (new)
  731. nvme_remove_admin_tag_set(&ctrl->ctrl);
  732. out_free_async_qe:
  733. if (ctrl->async_event_sqe.data) {
  734. nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
  735. sizeof(struct nvme_command), DMA_TO_DEVICE);
  736. ctrl->async_event_sqe.data = NULL;
  737. }
  738. out_free_queue:
  739. nvme_rdma_free_queue(&ctrl->queues[0]);
  740. return error;
  741. }
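/*
 * Bring up the I/O queues: allocate them, create the tag set for a new
 * controller, connect the queues, and on reconnect freeze/unfreeze the
 * namespaces while resizing the tag set to the current queue count.
 */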
  742. static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
  743. {
  744. int ret, nr_queues;
  745. ret = nvme_rdma_alloc_io_queues(ctrl);
  746. if (ret)
  747. return ret;
  748. if (new) {
  749. ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
  750. if (ret)
  751. goto out_free_io_queues;
  752. }
  753. /*
  754. * Only start IO queues for which we have allocated the tagset
755. * and limited it to the available queues. On reconnects, the
  756. * queue number might have changed.
  757. */
  758. nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count);
  759. ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues);
  760. if (ret)
  761. goto out_cleanup_tagset;
  762. if (!new) {
  763. nvme_start_freeze(&ctrl->ctrl);
  764. nvme_start_queues(&ctrl->ctrl);
  765. if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
  766. /*
  767. * If we timed out waiting for freeze we are likely to
  768. * be stuck. Fail the controller initialization just
  769. * to be safe.
  770. */
  771. ret = -ENODEV;
  772. nvme_unfreeze(&ctrl->ctrl);
  773. goto out_wait_freeze_timed_out;
  774. }
  775. blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
  776. ctrl->ctrl.queue_count - 1);
  777. nvme_unfreeze(&ctrl->ctrl);
  778. }
  779. /*
  780. * If the number of queues has increased (reconnect case)
  781. * start all new queues now.
  782. */
  783. ret = nvme_rdma_start_io_queues(ctrl, nr_queues,
  784. ctrl->tag_set.nr_hw_queues + 1);
  785. if (ret)
  786. goto out_wait_freeze_timed_out;
  787. return 0;
  788. out_wait_freeze_timed_out:
  789. nvme_stop_queues(&ctrl->ctrl);
  790. nvme_sync_io_queues(&ctrl->ctrl);
  791. nvme_rdma_stop_io_queues(ctrl);
  792. out_cleanup_tagset:
  793. nvme_cancel_tagset(&ctrl->ctrl);
  794. if (new)
  795. nvme_remove_io_tag_set(&ctrl->ctrl);
  796. out_free_io_queues:
  797. nvme_rdma_free_io_queues(ctrl);
  798. return ret;
  799. }
  800. static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
  801. bool remove)
  802. {
  803. nvme_stop_admin_queue(&ctrl->ctrl);
  804. blk_sync_queue(ctrl->ctrl.admin_q);
  805. nvme_rdma_stop_queue(&ctrl->queues[0]);
  806. nvme_cancel_admin_tagset(&ctrl->ctrl);
  807. if (remove) {
  808. nvme_start_admin_queue(&ctrl->ctrl);
  809. nvme_remove_admin_tag_set(&ctrl->ctrl);
  810. }
  811. nvme_rdma_destroy_admin_queue(ctrl);
  812. }
  813. static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
  814. bool remove)
  815. {
  816. if (ctrl->ctrl.queue_count > 1) {
  817. nvme_stop_queues(&ctrl->ctrl);
  818. nvme_sync_io_queues(&ctrl->ctrl);
  819. nvme_rdma_stop_io_queues(ctrl);
  820. nvme_cancel_tagset(&ctrl->ctrl);
  821. if (remove) {
  822. nvme_start_queues(&ctrl->ctrl);
  823. nvme_remove_io_tag_set(&ctrl->ctrl);
  824. }
  825. nvme_rdma_free_io_queues(ctrl);
  826. }
  827. }
  828. static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
  829. {
  830. struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
  831. flush_work(&ctrl->err_work);
  832. cancel_delayed_work_sync(&ctrl->reconnect_work);
  833. }
  834. static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
  835. {
  836. struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
  837. if (list_empty(&ctrl->list))
  838. goto free_ctrl;
  839. mutex_lock(&nvme_rdma_ctrl_mutex);
  840. list_del(&ctrl->list);
  841. mutex_unlock(&nvme_rdma_ctrl_mutex);
  842. nvmf_free_options(nctrl->opts);
  843. free_ctrl:
  844. kfree(ctrl->queues);
  845. kfree(ctrl);
  846. }
  847. static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
  848. {
  849. /* If we are resetting/deleting then do nothing */
  850. if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
  851. WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
  852. ctrl->ctrl.state == NVME_CTRL_LIVE);
  853. return;
  854. }
  855. if (nvmf_should_reconnect(&ctrl->ctrl)) {
  856. dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
  857. ctrl->ctrl.opts->reconnect_delay);
  858. queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
  859. ctrl->ctrl.opts->reconnect_delay * HZ);
  860. } else {
  861. nvme_delete_ctrl(&ctrl->ctrl);
  862. }
  863. }
  864. static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
  865. {
  866. int ret;
  867. bool changed;
  868. ret = nvme_rdma_configure_admin_queue(ctrl, new);
  869. if (ret)
  870. return ret;
  871. if (ctrl->ctrl.icdoff) {
  872. ret = -EOPNOTSUPP;
  873. dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
  874. goto destroy_admin;
  875. }
  876. if (!(ctrl->ctrl.sgls & (1 << 2))) {
  877. ret = -EOPNOTSUPP;
  878. dev_err(ctrl->ctrl.device,
  879. "Mandatory keyed sgls are not supported!\n");
  880. goto destroy_admin;
  881. }
  882. if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
  883. dev_warn(ctrl->ctrl.device,
  884. "queue_size %zu > ctrl sqsize %u, clamping down\n",
  885. ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
  886. }
  887. if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
  888. dev_warn(ctrl->ctrl.device,
  889. "ctrl sqsize %u > max queue size %u, clamping down\n",
  890. ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
  891. ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
  892. }
  893. if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
  894. dev_warn(ctrl->ctrl.device,
  895. "sqsize %u > ctrl maxcmd %u, clamping down\n",
  896. ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
  897. ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
  898. }
  899. if (ctrl->ctrl.sgls & (1 << 20))
  900. ctrl->use_inline_data = true;
  901. if (ctrl->ctrl.queue_count > 1) {
  902. ret = nvme_rdma_configure_io_queues(ctrl, new);
  903. if (ret)
  904. goto destroy_admin;
  905. }
  906. changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
  907. if (!changed) {
  908. /*
  909. * state change failure is ok if we started ctrl delete,
910. * unless we're in the middle of creating a new controller,
911. * in order to avoid races with the teardown flow.
  912. */
  913. WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
  914. ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
  915. WARN_ON_ONCE(new);
  916. ret = -EINVAL;
  917. goto destroy_io;
  918. }
  919. nvme_start_ctrl(&ctrl->ctrl);
  920. return 0;
  921. destroy_io:
  922. if (ctrl->ctrl.queue_count > 1) {
  923. nvme_stop_queues(&ctrl->ctrl);
  924. nvme_sync_io_queues(&ctrl->ctrl);
  925. nvme_rdma_stop_io_queues(ctrl);
  926. nvme_cancel_tagset(&ctrl->ctrl);
  927. if (new)
  928. nvme_remove_io_tag_set(&ctrl->ctrl);
  929. nvme_rdma_free_io_queues(ctrl);
  930. }
  931. destroy_admin:
  932. nvme_stop_admin_queue(&ctrl->ctrl);
  933. blk_sync_queue(ctrl->ctrl.admin_q);
  934. nvme_rdma_stop_queue(&ctrl->queues[0]);
  935. nvme_cancel_admin_tagset(&ctrl->ctrl);
  936. if (new)
  937. nvme_remove_admin_tag_set(&ctrl->ctrl);
  938. nvme_rdma_destroy_admin_queue(ctrl);
  939. return ret;
  940. }
  941. static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
  942. {
  943. struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
  944. struct nvme_rdma_ctrl, reconnect_work);
  945. ++ctrl->ctrl.nr_reconnects;
  946. if (nvme_rdma_setup_ctrl(ctrl, false))
  947. goto requeue;
  948. dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
  949. ctrl->ctrl.nr_reconnects);
  950. ctrl->ctrl.nr_reconnects = 0;
  951. return;
  952. requeue:
  953. dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
  954. ctrl->ctrl.nr_reconnects);
  955. nvme_rdma_reconnect_or_remove(ctrl);
  956. }
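/*
 * Error recovery: stop keep-alive, tear down the I/O and admin queues
 * while keeping the tag sets, then move to CONNECTING and either schedule
 * a reconnect or delete the controller.
 */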
  957. static void nvme_rdma_error_recovery_work(struct work_struct *work)
  958. {
  959. struct nvme_rdma_ctrl *ctrl = container_of(work,
  960. struct nvme_rdma_ctrl, err_work);
  961. nvme_stop_keep_alive(&ctrl->ctrl);
  962. flush_work(&ctrl->ctrl.async_event_work);
  963. nvme_rdma_teardown_io_queues(ctrl, false);
  964. nvme_start_queues(&ctrl->ctrl);
  965. nvme_rdma_teardown_admin_queue(ctrl, false);
  966. nvme_start_admin_queue(&ctrl->ctrl);
  967. nvme_auth_stop(&ctrl->ctrl);
  968. if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
  969. /* state change failure is ok if we started ctrl delete */
  970. WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
  971. ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
  972. return;
  973. }
  974. nvme_rdma_reconnect_or_remove(ctrl);
  975. }
  976. static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
  977. {
  978. if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
  979. return;
  980. dev_warn(ctrl->ctrl.device, "starting error recovery\n");
  981. queue_work(nvme_reset_wq, &ctrl->err_work);
  982. }
  983. static void nvme_rdma_end_request(struct nvme_rdma_request *req)
  984. {
  985. struct request *rq = blk_mq_rq_from_pdu(req);
  986. if (!refcount_dec_and_test(&req->ref))
  987. return;
  988. if (!nvme_try_complete_req(rq, req->status, req->result))
  989. nvme_rdma_complete_rq(rq);
  990. }
  991. static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
  992. const char *op)
  993. {
  994. struct nvme_rdma_queue *queue = wc->qp->qp_context;
  995. struct nvme_rdma_ctrl *ctrl = queue->ctrl;
  996. if (ctrl->ctrl.state == NVME_CTRL_LIVE)
  997. dev_info(ctrl->ctrl.device,
  998. "%s for CQE 0x%p failed with status %s (%d)\n",
  999. op, wc->wr_cqe,
  1000. ib_wc_status_msg(wc->status), wc->status);
  1001. nvme_rdma_error_recovery(ctrl);
  1002. }
  1003. static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
  1004. {
  1005. if (unlikely(wc->status != IB_WC_SUCCESS))
  1006. nvme_rdma_wr_error(cq, wc, "MEMREG");
  1007. }
  1008. static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
  1009. {
  1010. struct nvme_rdma_request *req =
  1011. container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
  1012. if (unlikely(wc->status != IB_WC_SUCCESS))
  1013. nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
  1014. else
  1015. nvme_rdma_end_request(req);
  1016. }
  1017. static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
  1018. struct nvme_rdma_request *req)
  1019. {
  1020. struct ib_send_wr wr = {
  1021. .opcode = IB_WR_LOCAL_INV,
  1022. .next = NULL,
  1023. .num_sge = 0,
  1024. .send_flags = IB_SEND_SIGNALED,
  1025. .ex.invalidate_rkey = req->mr->rkey,
  1026. };
  1027. req->reg_cqe.done = nvme_rdma_inv_rkey_done;
  1028. wr.wr_cqe = &req->reg_cqe;
  1029. return ib_post_send(queue->qp, &wr, NULL);
  1030. }
  1031. static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq)
  1032. {
  1033. struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
  1034. if (blk_integrity_rq(rq)) {
  1035. ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
  1036. req->metadata_sgl->nents, rq_dma_dir(rq));
  1037. sg_free_table_chained(&req->metadata_sgl->sg_table,
  1038. NVME_INLINE_METADATA_SG_CNT);
  1039. }
  1040. ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
  1041. rq_dma_dir(rq));
  1042. sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
  1043. }
  1044. static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
  1045. struct request *rq)
  1046. {
  1047. struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
  1048. struct nvme_rdma_device *dev = queue->device;
  1049. struct ib_device *ibdev = dev->dev;
  1050. struct list_head *pool = &queue->qp->rdma_mrs;
  1051. if (!blk_rq_nr_phys_segments(rq))
  1052. return;
  1053. if (req->use_sig_mr)
  1054. pool = &queue->qp->sig_mrs;
  1055. if (req->mr) {
  1056. ib_mr_pool_put(queue->qp, pool, req->mr);
  1057. req->mr = NULL;
  1058. }
  1059. nvme_rdma_dma_unmap_req(ibdev, rq);
  1060. }
  1061. static int nvme_rdma_set_sg_null(struct nvme_command *c)
  1062. {
  1063. struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
  1064. sg->addr = 0;
  1065. put_unaligned_le24(0, sg->length);
  1066. put_unaligned_le32(0, sg->key);
  1067. sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
  1068. return 0;
  1069. }
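/*
 * Inline small write payloads in the command capsule: each mapped data
 * SGE follows the command SGE and the capsule carries an offset SGL
 * descriptor for it.
 */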
  1070. static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
  1071. struct nvme_rdma_request *req, struct nvme_command *c,
  1072. int count)
  1073. {
  1074. struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
  1075. struct ib_sge *sge = &req->sge[1];
  1076. struct scatterlist *sgl;
  1077. u32 len = 0;
  1078. int i;
  1079. for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) {
  1080. sge->addr = sg_dma_address(sgl);
  1081. sge->length = sg_dma_len(sgl);
  1082. sge->lkey = queue->device->pd->local_dma_lkey;
  1083. len += sge->length;
  1084. sge++;
  1085. }
  1086. sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
  1087. sg->length = cpu_to_le32(len);
  1088. sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
  1089. req->num_sge += count;
  1090. return 0;
  1091. }
  1092. static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
  1093. struct nvme_rdma_request *req, struct nvme_command *c)
  1094. {
  1095. struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
  1096. sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
  1097. put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
  1098. put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
  1099. sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
  1100. return 0;
  1101. }
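/*
 * Map the data SG list through a fast-registration MR from the queue's
 * MR pool and reference it with a keyed SGL that requests remote
 * invalidation of the rkey.
 */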
  1102. static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
  1103. struct nvme_rdma_request *req, struct nvme_command *c,
  1104. int count)
  1105. {
  1106. struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
  1107. int nr;
  1108. req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
  1109. if (WARN_ON_ONCE(!req->mr))
  1110. return -EAGAIN;
  1111. /*
  1112. * Align the MR to a 4K page size to match the ctrl page size and
  1113. * the block virtual boundary.
  1114. */
  1115. nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
  1116. SZ_4K);
  1117. if (unlikely(nr < count)) {
  1118. ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
  1119. req->mr = NULL;
  1120. if (nr < 0)
  1121. return nr;
  1122. return -EINVAL;
  1123. }
  1124. ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
  1125. req->reg_cqe.done = nvme_rdma_memreg_done;
  1126. memset(&req->reg_wr, 0, sizeof(req->reg_wr));
  1127. req->reg_wr.wr.opcode = IB_WR_REG_MR;
  1128. req->reg_wr.wr.wr_cqe = &req->reg_cqe;
  1129. req->reg_wr.wr.num_sge = 0;
  1130. req->reg_wr.mr = req->mr;
  1131. req->reg_wr.key = req->mr->rkey;
  1132. req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
  1133. IB_ACCESS_REMOTE_READ |
  1134. IB_ACCESS_REMOTE_WRITE;
  1135. sg->addr = cpu_to_le64(req->mr->iova);
  1136. put_unaligned_le24(req->mr->length, sg->length);
  1137. put_unaligned_le32(req->mr->rkey, sg->key);
  1138. sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
  1139. NVME_SGL_FMT_INVALIDATE;
  1140. return 0;
  1141. }
  1142. static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
  1143. struct nvme_command *cmd, struct ib_sig_domain *domain,
  1144. u16 control, u8 pi_type)
  1145. {
  1146. domain->sig_type = IB_SIG_TYPE_T10_DIF;
  1147. domain->sig.dif.bg_type = IB_T10DIF_CRC;
  1148. domain->sig.dif.pi_interval = 1 << bi->interval_exp;
  1149. domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
  1150. if (control & NVME_RW_PRINFO_PRCHK_REF)
  1151. domain->sig.dif.ref_remap = true;
  1152. domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
  1153. domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
  1154. domain->sig.dif.app_escape = true;
  1155. if (pi_type == NVME_NS_DPS_PI_TYPE3)
  1156. domain->sig.dif.ref_escape = true;
  1157. }
  1158. static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
  1159. struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
  1160. u8 pi_type)
  1161. {
  1162. u16 control = le16_to_cpu(cmd->rw.control);
  1163. memset(sig_attrs, 0, sizeof(*sig_attrs));
  1164. if (control & NVME_RW_PRINFO_PRACT) {
  1165. /* for WRITE_INSERT/READ_STRIP no memory domain */
  1166. sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
  1167. nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
  1168. pi_type);
  1169. /* Clear the PRACT bit since HCA will generate/verify the PI */
  1170. control &= ~NVME_RW_PRINFO_PRACT;
  1171. cmd->rw.control = cpu_to_le16(control);
  1172. } else {
  1173. /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
  1174. nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
  1175. pi_type);
  1176. nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
  1177. pi_type);
  1178. }
  1179. }
  1180. static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
  1181. {
  1182. *mask = 0;
  1183. if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
  1184. *mask |= IB_SIG_CHECK_REFTAG;
  1185. if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
  1186. *mask |= IB_SIG_CHECK_GUARD;
  1187. }
  1188. static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
  1189. {
  1190. if (unlikely(wc->status != IB_WC_SUCCESS))
  1191. nvme_rdma_wr_error(cq, wc, "SIG");
  1192. }
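/*
 * T10-PI path: register the data and metadata SG lists through an
 * integrity MR, program the signature domains and protection checks, and
 * reference the region with a keyed SGL.
 */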
  1193. static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
  1194. struct nvme_rdma_request *req, struct nvme_command *c,
  1195. int count, int pi_count)
  1196. {
  1197. struct nvme_rdma_sgl *sgl = &req->data_sgl;
  1198. struct ib_reg_wr *wr = &req->reg_wr;
  1199. struct request *rq = blk_mq_rq_from_pdu(req);
  1200. struct nvme_ns *ns = rq->q->queuedata;
  1201. struct bio *bio = rq->bio;
  1202. struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
  1203. int nr;
  1204. req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
  1205. if (WARN_ON_ONCE(!req->mr))
  1206. return -EAGAIN;
  1207. nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
  1208. req->metadata_sgl->sg_table.sgl, pi_count, NULL,
  1209. SZ_4K);
  1210. if (unlikely(nr))
  1211. goto mr_put;
  1212. nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
  1213. req->mr->sig_attrs, ns->pi_type);
  1214. nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
  1215. ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
  1216. req->reg_cqe.done = nvme_rdma_sig_done;
  1217. memset(wr, 0, sizeof(*wr));
  1218. wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
  1219. wr->wr.wr_cqe = &req->reg_cqe;
  1220. wr->wr.num_sge = 0;
  1221. wr->wr.send_flags = 0;
  1222. wr->mr = req->mr;
  1223. wr->key = req->mr->rkey;
  1224. wr->access = IB_ACCESS_LOCAL_WRITE |
  1225. IB_ACCESS_REMOTE_READ |
  1226. IB_ACCESS_REMOTE_WRITE;
  1227. sg->addr = cpu_to_le64(req->mr->iova);
  1228. put_unaligned_le24(req->mr->length, sg->length);
  1229. put_unaligned_le32(req->mr->rkey, sg->key);
  1230. sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
  1231. return 0;
  1232. mr_put:
  1233. ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
  1234. req->mr = NULL;
  1235. if (nr < 0)
  1236. return nr;
  1237. return -EINVAL;
  1238. }
static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq,
                int *count, int *pi_count)
{
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        int ret;

        req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
        ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
                        blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
                        NVME_INLINE_SG_CNT);
        if (ret)
                return -ENOMEM;

        req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
                                            req->data_sgl.sg_table.sgl);

        *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
                               req->data_sgl.nents, rq_dma_dir(rq));
        if (unlikely(*count <= 0)) {
                ret = -EIO;
                goto out_free_table;
        }

        if (blk_integrity_rq(rq)) {
                req->metadata_sgl->sg_table.sgl =
                        (struct scatterlist *)(req->metadata_sgl + 1);
                ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
                                blk_rq_count_integrity_sg(rq->q, rq->bio),
                                req->metadata_sgl->sg_table.sgl,
                                NVME_INLINE_METADATA_SG_CNT);
                if (unlikely(ret)) {
                        ret = -ENOMEM;
                        goto out_unmap_sg;
                }

                req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
                                rq->bio, req->metadata_sgl->sg_table.sgl);
                *pi_count = ib_dma_map_sg(ibdev,
                                          req->metadata_sgl->sg_table.sgl,
                                          req->metadata_sgl->nents,
                                          rq_dma_dir(rq));
                if (unlikely(*pi_count <= 0)) {
                        ret = -EIO;
                        goto out_free_pi_table;
                }
        }

        return 0;

out_free_pi_table:
        sg_free_table_chained(&req->metadata_sgl->sg_table,
                              NVME_INLINE_METADATA_SG_CNT);
out_unmap_sg:
        ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
                        rq_dma_dir(rq));
out_free_table:
        sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
        return ret;
}

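/*
 * Build the command's data descriptor.  Depending on the transfer, this uses
 * a PI-protected signature MR, inline data (small writes on I/O queues), a
 * single keyed SGL entry (one segment with a global rkey), or a
 * fast-registration MR as the general fallback.
 */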
static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
                struct request *rq, struct nvme_command *c)
{
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_device *dev = queue->device;
        struct ib_device *ibdev = dev->dev;
        int pi_count = 0;
        int count, ret;

        req->num_sge = 1;
        refcount_set(&req->ref, 2); /* send and recv completions */

        c->common.flags |= NVME_CMD_SGL_METABUF;

        if (!blk_rq_nr_phys_segments(rq))
                return nvme_rdma_set_sg_null(c);

        ret = nvme_rdma_dma_map_req(ibdev, rq, &count, &pi_count);
        if (unlikely(ret))
                return ret;

        if (req->use_sig_mr) {
                ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
                goto out;
        }

        if (count <= dev->num_inline_segments) {
                if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
                    queue->ctrl->use_inline_data &&
                    blk_rq_payload_bytes(rq) <=
                                nvme_rdma_inline_data_size(queue)) {
                        ret = nvme_rdma_map_sg_inline(queue, req, c, count);
                        goto out;
                }

                if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
                        ret = nvme_rdma_map_sg_single(queue, req, c);
                        goto out;
                }
        }

        ret = nvme_rdma_map_sg_fr(queue, req, c, count);
out:
        if (unlikely(ret))
                goto out_dma_unmap_req;

        return 0;

out_dma_unmap_req:
        nvme_rdma_dma_unmap_req(ibdev, rq);
        return ret;
}

static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvme_rdma_qe *qe =
                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
        struct nvme_rdma_request *req =
                container_of(qe, struct nvme_rdma_request, sqe);

        if (unlikely(wc->status != IB_WC_SUCCESS))
                nvme_rdma_wr_error(cq, wc, "SEND");
        else
                nvme_rdma_end_request(req);
}

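/*
 * Post a single-SGE SEND carrying the NVMe command capsule.  When @first is
 * given (e.g. an MR registration WR), the SEND is chained behind it so both
 * are posted with one ib_post_send() call.
 */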
static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
                struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
                struct ib_send_wr *first)
{
        struct ib_send_wr wr;
        int ret;

        sge->addr = qe->dma;
        sge->length = sizeof(struct nvme_command);
        sge->lkey = queue->device->pd->local_dma_lkey;

        wr.next = NULL;
        wr.wr_cqe = &qe->cqe;
        wr.sg_list = sge;
        wr.num_sge = num_sge;
        wr.opcode = IB_WR_SEND;
        wr.send_flags = IB_SEND_SIGNALED;

        if (first)
                first->next = &wr;
        else
                first = &wr;

        ret = ib_post_send(queue->qp, first, NULL);
        if (unlikely(ret)) {
                dev_err(queue->ctrl->ctrl.device,
                        "%s failed with error code %d\n", __func__, ret);
        }
        return ret;
}

static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
                struct nvme_rdma_qe *qe)
{
        struct ib_recv_wr wr;
        struct ib_sge list;
        int ret;

        list.addr = qe->dma;
        list.length = sizeof(struct nvme_completion);
        list.lkey = queue->device->pd->local_dma_lkey;

        qe->cqe.done = nvme_rdma_recv_done;

        wr.next = NULL;
        wr.wr_cqe = &qe->cqe;
        wr.sg_list = &list;
        wr.num_sge = 1;

        ret = ib_post_recv(queue->qp, &wr, NULL);
        if (unlikely(ret)) {
                dev_err(queue->ctrl->ctrl.device,
                        "%s failed with error code %d\n", __func__, ret);
        }
        return ret;
}

static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
{
        u32 queue_idx = nvme_rdma_queue_idx(queue);

        if (queue_idx == 0)
                return queue->ctrl->admin_tag_set.tags[queue_idx];
        return queue->ctrl->tag_set.tags[queue_idx - 1];
}

static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
{
        if (unlikely(wc->status != IB_WC_SUCCESS))
                nvme_rdma_wr_error(cq, wc, "ASYNC");
}

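/*
 * Submit an Asynchronous Event Request on the admin queue.  AEN commands
 * bypass blk-mq, so the command is built directly in the pre-allocated
 * async_event_sqe and posted with a NULL data SGL.
 */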
static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
        struct nvme_rdma_queue *queue = &ctrl->queues[0];
        struct ib_device *dev = queue->device->dev;
        struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
        struct nvme_command *cmd = sqe->data;
        struct ib_sge sge;
        int ret;

        ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);

        memset(cmd, 0, sizeof(*cmd));
        cmd->common.opcode = nvme_admin_async_event;
        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
        cmd->common.flags |= NVME_CMD_SGL_METABUF;
        nvme_rdma_set_sg_null(cmd);

        sqe->cqe.done = nvme_rdma_async_done;

        ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
                        DMA_TO_DEVICE);

        ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
        WARN_ON_ONCE(ret);
}

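/*
 * Match a received completion to its request by command_id.  If the target
 * did not invalidate our rkey remotely (IB_WC_WITH_INVALIDATE), a local
 * invalidation is queued and the request is completed from the
 * invalidation's completion instead.
 */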
static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
                struct nvme_completion *cqe, struct ib_wc *wc)
{
        struct request *rq;
        struct nvme_rdma_request *req;

        rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
        if (!rq) {
                dev_err(queue->ctrl->ctrl.device,
                        "got bad command_id %#x on QP %#x\n",
                        cqe->command_id, queue->qp->qp_num);
                nvme_rdma_error_recovery(queue->ctrl);
                return;
        }
        req = blk_mq_rq_to_pdu(rq);

        req->status = cqe->status;
        req->result = cqe->result;

        if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
                if (unlikely(!req->mr ||
                             wc->ex.invalidate_rkey != req->mr->rkey)) {
                        dev_err(queue->ctrl->ctrl.device,
                                "Bogus remote invalidation for rkey %#x\n",
                                req->mr ? req->mr->rkey : 0);
                        nvme_rdma_error_recovery(queue->ctrl);
                }
        } else if (req->mr) {
                int ret;

                ret = nvme_rdma_inv_rkey(queue, req);
                if (unlikely(ret < 0)) {
                        dev_err(queue->ctrl->ctrl.device,
                                "Queueing INV WR for rkey %#x failed (%d)\n",
                                req->mr->rkey, ret);
                        nvme_rdma_error_recovery(queue->ctrl);
                }
                /* the local invalidation completion will end the request */
                return;
        }

        nvme_rdma_end_request(req);
}

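/*
 * RECV completion handler: validate the completion length, dispatch AEN
 * completions to the core, hand everything else to
 * nvme_rdma_process_nvme_rsp(), then re-post the receive buffer.
 */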
static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvme_rdma_qe *qe =
                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
        struct nvme_rdma_queue *queue = wc->qp->qp_context;
        struct ib_device *ibdev = queue->device->dev;
        struct nvme_completion *cqe = qe->data;
        const size_t len = sizeof(struct nvme_completion);

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                nvme_rdma_wr_error(cq, wc, "RECV");
                return;
        }

        /* sanity checking for received data length */
        if (unlikely(wc->byte_len < len)) {
                dev_err(queue->ctrl->ctrl.device,
                        "Unexpected nvme completion length(%d)\n", wc->byte_len);
                nvme_rdma_error_recovery(queue->ctrl);
                return;
        }

        ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
        /*
         * AEN requests are special as they don't time out and can
         * survive any kind of queue freeze and often don't respond to
         * aborts.  We don't even bother to allocate a struct request
         * for them but rather special case them here.
         */
        if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
                                     cqe->command_id)))
                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
                                &cqe->result);
        else
                nvme_rdma_process_nvme_rsp(queue, cqe, wc);
        ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);

        nvme_rdma_post_recv(queue, qe);
}

static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
{
        int ret, i;

        for (i = 0; i < queue->queue_size; i++) {
                ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
                if (ret)
                        return ret;
        }

        return 0;
}

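/*
 * Decode an RDMA CM connection reject.  If the target supplied NVMe/RDMA CM
 * private data, report the embedded NVMe status as well.
 */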
static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
                struct rdma_cm_event *ev)
{
        struct rdma_cm_id *cm_id = queue->cm_id;
        int status = ev->status;
        const char *rej_msg;
        const struct nvme_rdma_cm_rej *rej_data;
        u8 rej_data_len;

        rej_msg = rdma_reject_msg(cm_id, status);
        rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);

        if (rej_data && rej_data_len >= sizeof(u16)) {
                u16 sts = le16_to_cpu(rej_data->sts);

                dev_err(queue->ctrl->ctrl.device,
                        "Connect rejected: status %d (%s) nvme status %d (%s).\n",
                        status, rej_msg, sts, nvme_rdma_cm_msg(sts));
        } else {
                dev_err(queue->ctrl->ctrl.device,
                        "Connect rejected: status %d (%s).\n", status, rej_msg);
        }

        return -ECONNRESET;
}

static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
        struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
        int ret;

        ret = nvme_rdma_create_queue_ib(queue);
        if (ret)
                return ret;

        if (ctrl->opts->tos >= 0)
                rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
        ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS);
        if (ret) {
                dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
                        queue->cm_error);
                goto out_destroy_queue;
        }

        return 0;

out_destroy_queue:
        nvme_rdma_destroy_queue_ib(queue);
        return ret;
}

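/*
 * Route is resolved: build the NVMe/RDMA CM request private data (record
 * format, qid, queue sizes) and initiate the connection.
 */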
static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
{
        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
        struct rdma_conn_param param = { };
        struct nvme_rdma_cm_req priv = { };
        int ret;

        param.qp_num = queue->qp->qp_num;
        param.flow_control = 1;

        param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
        /* maximum retry count */
        param.retry_count = 7;
        param.rnr_retry_count = 7;
        param.private_data = &priv;
        param.private_data_len = sizeof(priv);

        priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
        priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
        /*
         * set the admin queue depth to the minimum size
         * specified by the Fabrics standard.
         */
        if (priv.qid == 0) {
                priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH);
                priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1);
        } else {
                /*
                 * current interpretation of the fabrics spec
                 * is at minimum you make hrqsize sqsize+1, or a
                 * 1's based representation of sqsize.
                 */
                priv.hrqsize = cpu_to_le16(queue->queue_size);
                priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
        }

        ret = rdma_connect_locked(queue->cm_id, &param);
        if (ret) {
                dev_err(ctrl->ctrl.device,
                        "rdma_connect_locked failed (%d).\n", ret);
                return ret;
        }

        return 0;
}

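/*
 * Central RDMA CM event handler.  Address/route resolution, establishment
 * and rejection feed their status into cm_error and complete cm_done so the
 * connect path can wait on it; fatal events after establishment trigger
 * error recovery.
 */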
static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *ev)
{
        struct nvme_rdma_queue *queue = cm_id->context;
        int cm_error = 0;

        dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
                rdma_event_msg(ev->event), ev->event,
                ev->status, cm_id);

        switch (ev->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
                cm_error = nvme_rdma_addr_resolved(queue);
                break;
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                cm_error = nvme_rdma_route_resolved(queue);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
                queue->cm_error = nvme_rdma_conn_established(queue);
                /* complete cm_done regardless of success/failure */
                complete(&queue->cm_done);
                return 0;
        case RDMA_CM_EVENT_REJECTED:
                cm_error = nvme_rdma_conn_rejected(queue, ev);
                break;
        case RDMA_CM_EVENT_ROUTE_ERROR:
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_ADDR_ERROR:
                dev_dbg(queue->ctrl->ctrl.device,
                        "CM error event %d\n", ev->event);
                cm_error = -ECONNRESET;
                break;
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_ADDR_CHANGE:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                dev_dbg(queue->ctrl->ctrl.device,
                        "disconnect received - connection closed\n");
                nvme_rdma_error_recovery(queue->ctrl);
                break;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                /* device removal is handled via the ib_client API */
                break;
        default:
                dev_err(queue->ctrl->ctrl.device,
                        "Unexpected RDMA CM event (%d)\n", ev->event);
                nvme_rdma_error_recovery(queue->ctrl);
                break;
        }

        if (cm_error) {
                queue->cm_error = cm_error;
                complete(&queue->cm_done);
        }

        return 0;
}

static void nvme_rdma_complete_timed_out(struct request *rq)
{
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_queue *queue = req->queue;

        nvme_rdma_stop_queue(queue);
        nvmf_complete_timed_out_request(rq);
}

static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq)
{
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_queue *queue = req->queue;
        struct nvme_rdma_ctrl *ctrl = queue->ctrl;

        dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
                 rq->tag, nvme_rdma_queue_idx(queue));

        if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
                /*
                 * If we are resetting, connecting or deleting we should
                 * complete immediately because we may block controller
                 * teardown or setup sequence
                 * - ctrl disable/shutdown fabrics requests
                 * - connect requests
                 * - initialization admin requests
                 * - I/O requests that entered after unquiescing and
                 *   the controller stopped responding
                 *
                 * All other requests should be cancelled by the error
                 * recovery work, so it's fine that we fail it here.
                 */
                nvme_rdma_complete_timed_out(rq);
                return BLK_EH_DONE;
        }

        /*
         * LIVE state should trigger the normal error recovery which will
         * handle completing this request.
         */
        nvme_rdma_error_recovery(ctrl);
        return BLK_EH_RESET_TIMER;
}

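/*
 * blk-mq .queue_rq: map the command capsule for DMA, set up the data mapping
 * (with a signature MR when PI is in use for reads/writes), and post the
 * SEND, chained behind the MR registration WR when one exists.
 */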
static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
{
        struct nvme_ns *ns = hctx->queue->queuedata;
        struct nvme_rdma_queue *queue = hctx->driver_data;
        struct request *rq = bd->rq;
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_qe *sqe = &req->sqe;
        struct nvme_command *c = nvme_req(rq)->cmd;
        struct ib_device *dev;
        bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags);
        blk_status_t ret;
        int err;

        WARN_ON_ONCE(rq->tag < 0);

        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);

        dev = queue->device->dev;

        req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
                                         sizeof(struct nvme_command),
                                         DMA_TO_DEVICE);
        err = ib_dma_mapping_error(dev, req->sqe.dma);
        if (unlikely(err))
                return BLK_STS_RESOURCE;

        ib_dma_sync_single_for_cpu(dev, sqe->dma,
                        sizeof(struct nvme_command), DMA_TO_DEVICE);

        ret = nvme_setup_cmd(ns, rq);
        if (ret)
                goto unmap_qe;

        blk_mq_start_request(rq);

        if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
            queue->pi_support &&
            (c->common.opcode == nvme_cmd_write ||
             c->common.opcode == nvme_cmd_read) &&
            nvme_ns_has_pi(ns))
                req->use_sig_mr = true;
        else
                req->use_sig_mr = false;

        err = nvme_rdma_map_data(queue, rq, c);
        if (unlikely(err < 0)) {
                dev_err(queue->ctrl->ctrl.device,
                        "Failed to map data (%d)\n", err);
                goto err;
        }

        sqe->cqe.done = nvme_rdma_send_done;

        ib_dma_sync_single_for_device(dev, sqe->dma,
                        sizeof(struct nvme_command), DMA_TO_DEVICE);

        err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
                        req->mr ? &req->reg_wr.wr : NULL);
        if (unlikely(err))
                goto err_unmap;

        return BLK_STS_OK;

err_unmap:
        nvme_rdma_unmap_data(queue, rq);
err:
        if (err == -EIO)
                ret = nvme_host_path_error(rq);
        else if (err == -ENOMEM || err == -EAGAIN)
                ret = BLK_STS_RESOURCE;
        else
                ret = BLK_STS_IOERR;
        nvme_cleanup_cmd(rq);
unmap_qe:
        ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
                            DMA_TO_DEVICE);
        return ret;
}

static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
        struct nvme_rdma_queue *queue = hctx->driver_data;

        return ib_process_cq_direct(queue->ib_cq, -1);
}

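/*
 * After a PI-protected transfer completes, query the signature MR and
 * translate any guard/reftag/apptag mismatch into the corresponding NVMe
 * protection-information status code.
 */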
static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
{
        struct request *rq = blk_mq_rq_from_pdu(req);
        struct ib_mr_status mr_status;
        int ret;

        ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
        if (ret) {
                pr_err("ib_check_mr_status failed, ret %d\n", ret);
                nvme_req(rq)->status = NVME_SC_INVALID_PI;
                return;
        }

        if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
                switch (mr_status.sig_err.err_type) {
                case IB_SIG_BAD_GUARD:
                        nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
                        break;
                case IB_SIG_BAD_REFTAG:
                        nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
                        break;
                case IB_SIG_BAD_APPTAG:
                        nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
                        break;
                }
                pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
                       mr_status.sig_err.err_type, mr_status.sig_err.expected,
                       mr_status.sig_err.actual);
        }
}

static void nvme_rdma_complete_rq(struct request *rq)
{
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_queue *queue = req->queue;
        struct ib_device *ibdev = queue->device->dev;

        if (req->use_sig_mr)
                nvme_rdma_check_pi_status(req);

        nvme_rdma_unmap_data(queue, rq);
        ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
                            DMA_TO_DEVICE);
        nvme_complete_rq(rq);
}

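/*
 * Map blk-mq hardware contexts to controller I/O queues: default and read
 * queues are laid out first (separate or shared depending on
 * nr_write_queues), followed by dedicated poll queues when configured.
 */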
static void nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;

        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
                /* separate read/write queues */
                set->map[HCTX_TYPE_DEFAULT].nr_queues =
                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
                set->map[HCTX_TYPE_READ].nr_queues =
                        ctrl->io_queues[HCTX_TYPE_READ];
                set->map[HCTX_TYPE_READ].queue_offset =
                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
        } else {
                /* shared read/write queues */
                set->map[HCTX_TYPE_DEFAULT].nr_queues =
                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
                set->map[HCTX_TYPE_READ].nr_queues =
                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
                set->map[HCTX_TYPE_READ].queue_offset = 0;
        }
        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
                               ctrl->device->dev, 0);
        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
                               ctrl->device->dev, 0);

        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
                /* map dedicated poll queues only if we have queues left */
                set->map[HCTX_TYPE_POLL].nr_queues =
                        ctrl->io_queues[HCTX_TYPE_POLL];
                set->map[HCTX_TYPE_POLL].queue_offset =
                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
                        ctrl->io_queues[HCTX_TYPE_READ];
                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
        }

        dev_info(ctrl->ctrl.device,
                "mapped %d/%d/%d default/read/poll queues.\n",
                ctrl->io_queues[HCTX_TYPE_DEFAULT],
                ctrl->io_queues[HCTX_TYPE_READ],
                ctrl->io_queues[HCTX_TYPE_POLL]);
}

static const struct blk_mq_ops nvme_rdma_mq_ops = {
        .queue_rq = nvme_rdma_queue_rq,
        .complete = nvme_rdma_complete_rq,
        .init_request = nvme_rdma_init_request,
        .exit_request = nvme_rdma_exit_request,
        .init_hctx = nvme_rdma_init_hctx,
        .timeout = nvme_rdma_timeout,
        .map_queues = nvme_rdma_map_queues,
        .poll = nvme_rdma_poll,
};

static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
        .queue_rq = nvme_rdma_queue_rq,
        .complete = nvme_rdma_complete_rq,
        .init_request = nvme_rdma_init_request,
        .exit_request = nvme_rdma_exit_request,
        .init_hctx = nvme_rdma_init_admin_hctx,
        .timeout = nvme_rdma_timeout,
};

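/*
 * Tear down I/O queues, then shut down (or merely disable) the controller
 * and tear down the admin queue.  @shutdown selects a full NVMe shutdown
 * versus a disable, as used by the reset path.
 */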
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
        nvme_rdma_teardown_io_queues(ctrl, shutdown);
        nvme_stop_admin_queue(&ctrl->ctrl);
        if (shutdown)
                nvme_shutdown_ctrl(&ctrl->ctrl);
        else
                nvme_disable_ctrl(&ctrl->ctrl);
        nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}

static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
{
        nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
}

static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl =
                container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);

        nvme_stop_ctrl(&ctrl->ctrl);
        nvme_rdma_shutdown_ctrl(ctrl, false);

        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure should never happen */
                WARN_ON_ONCE(1);
                return;
        }

        if (nvme_rdma_setup_ctrl(ctrl, false))
                goto out_fail;

        return;

out_fail:
        ++ctrl->ctrl.nr_reconnects;
        nvme_rdma_reconnect_or_remove(ctrl);
}

static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
        .name = "rdma",
        .module = THIS_MODULE,
        .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
        .reg_read32 = nvmf_reg_read32,
        .reg_read64 = nvmf_reg_read64,
        .reg_write32 = nvmf_reg_write32,
        .free_ctrl = nvme_rdma_free_ctrl,
        .submit_async_event = nvme_rdma_submit_async_event,
        .delete_ctrl = nvme_rdma_delete_ctrl,
        .get_address = nvmf_get_address,
        .stop_ctrl = nvme_rdma_stop_ctrl,
};

/*
 * Fails a connection request if it matches an existing controller
 * (association) with the same tuple:
 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
 *
 * if local address is not specified in the request, it will match an
 * existing controller with all the other parameters the same and no
 * local port address specified as well.
 *
 * The ports don't need to be compared as they are intrinsically
 * already matched by the port pointers supplied.
 */
static bool
nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
{
        struct nvme_rdma_ctrl *ctrl;
        bool found = false;

        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
                if (found)
                        break;
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        return found;
}

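/*
 * Parse the connect options, allocate the controller and its queue array,
 * register it with the core, and run the initial setup.  A default transport
 * service ID is filled in when none was supplied.
 */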
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
{
        struct nvme_rdma_ctrl *ctrl;
        int ret;
        bool changed;

        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                return ERR_PTR(-ENOMEM);
        ctrl->ctrl.opts = opts;
        INIT_LIST_HEAD(&ctrl->list);

        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
                opts->trsvcid =
                        kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
                if (!opts->trsvcid) {
                        ret = -ENOMEM;
                        goto out_free_ctrl;
                }
                opts->mask |= NVMF_OPT_TRSVCID;
        }

        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
                        opts->traddr, opts->trsvcid, &ctrl->addr);
        if (ret) {
                pr_err("malformed address passed: %s:%s\n",
                       opts->traddr, opts->trsvcid);
                goto out_free_ctrl;
        }

        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
                        opts->host_traddr, NULL, &ctrl->src_addr);
                if (ret) {
                        pr_err("malformed src address passed: %s\n",
                               opts->host_traddr);
                        goto out_free_ctrl;
                }
        }

        if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
                ret = -EALREADY;
                goto out_free_ctrl;
        }

        INIT_DELAYED_WORK(&ctrl->reconnect_work,
                        nvme_rdma_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
        INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);

        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
                                opts->nr_poll_queues + 1;
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;

        ret = -ENOMEM;
        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
                                GFP_KERNEL);
        if (!ctrl->queues)
                goto out_free_ctrl;

        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
                                0 /* no quirks, we're perfect! */);
        if (ret)
                goto out_kfree_queues;

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
        WARN_ON_ONCE(!changed);

        ret = nvme_rdma_setup_ctrl(ctrl, true);
        if (ret)
                goto out_uninit_ctrl;

        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
                nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);

        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        return &ctrl->ctrl;

out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
        nvme_put_ctrl(&ctrl->ctrl);
        if (ret > 0)
                ret = -EIO;
        return ERR_PTR(ret);
out_kfree_queues:
        kfree(ctrl->queues);
out_free_ctrl:
        kfree(ctrl);
        return ERR_PTR(ret);
}

static struct nvmf_transport_ops nvme_rdma_transport = {
        .name = "rdma",
        .module = THIS_MODULE,
        .required_opts = NVMF_OPT_TRADDR,
        .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
                        NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
                        NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
                        NVMF_OPT_TOS,
        .create_ctrl = nvme_rdma_create_ctrl,
};

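/*
 * ib_client removal callback: when an IB device that we are using goes away,
 * delete every controller bound to it and wait for the deletions to finish.
 */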
static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
        struct nvme_rdma_ctrl *ctrl;
        struct nvme_rdma_device *ndev;
        bool found = false;

        mutex_lock(&device_list_mutex);
        list_for_each_entry(ndev, &device_list, entry) {
                if (ndev->dev == ib_device) {
                        found = true;
                        break;
                }
        }
        mutex_unlock(&device_list_mutex);

        if (!found)
                return;

        /* Delete all controllers using this device */
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
                if (ctrl->device->dev != ib_device)
                        continue;
                nvme_delete_ctrl(&ctrl->ctrl);
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        flush_workqueue(nvme_delete_wq);
}

static struct ib_client nvme_rdma_ib_client = {
        .name = "nvme_rdma",
        .remove = nvme_rdma_remove_one
};

static int __init nvme_rdma_init_module(void)
{
        int ret;

        ret = ib_register_client(&nvme_rdma_ib_client);
        if (ret)
                return ret;

        ret = nvmf_register_transport(&nvme_rdma_transport);
        if (ret)
                goto err_unreg_client;

        return 0;

err_unreg_client:
        ib_unregister_client(&nvme_rdma_ib_client);
        return ret;
}

static void __exit nvme_rdma_cleanup_module(void)
{
        struct nvme_rdma_ctrl *ctrl;

        nvmf_unregister_transport(&nvme_rdma_transport);
        ib_unregister_client(&nvme_rdma_ib_client);

        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
                nvme_delete_ctrl(&ctrl->ctrl);
        mutex_unlock(&nvme_rdma_ctrl_mutex);
        flush_workqueue(nvme_delete_wq);
}

module_init(nvme_rdma_init_module);
module_exit(nvme_rdma_cleanup_module);

MODULE_LICENSE("GPL v2");