rtrs-clt.c 83 KB

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * RDMA Transport Layer
  4. *
  5. * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
  6. * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
  7. * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
  8. */
  9. #undef pr_fmt
  10. #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
  11. #include <linux/module.h>
  12. #include <linux/rculist.h>
  13. #include <linux/random.h>
  14. #include "rtrs-clt.h"
  15. #include "rtrs-log.h"
  16. #include "rtrs-clt-trace.h"
  17. #define RTRS_CONNECT_TIMEOUT_MS 30000
  18. /*
  19. * Wait a bit before trying to reconnect after a failure
  20. * in order to give the server time to finish clean up, which
  21. * otherwise leads to "false positive" failed reconnect attempts
  22. */
  23. #define RTRS_RECONNECT_BACKOFF 1000
  24. /*
  25. * Wait for additional random time between 0 and 8 seconds
  26. * before starting to reconnect to avoid clients reconnecting
  27. * all at once in case of a major network outage
  28. */
  29. #define RTRS_RECONNECT_SEED 8
  30. #define FIRST_CONN 0x01
  31. /* limit to 128 * 4k = 512k max IO */
  32. #define RTRS_MAX_SEGMENTS 128
  33. MODULE_DESCRIPTION("RDMA Transport Client");
  34. MODULE_LICENSE("GPL");
  35. static const struct rtrs_rdma_dev_pd_ops dev_pd_ops;
  36. static struct rtrs_rdma_dev_pd dev_pd = {
  37. .ops = &dev_pd_ops
  38. };
  39. static struct workqueue_struct *rtrs_wq;
  40. static struct class *rtrs_clt_dev_class;
  41. static inline bool rtrs_clt_is_connected(const struct rtrs_clt_sess *clt)
  42. {
  43. struct rtrs_clt_path *clt_path;
  44. bool connected = false;
  45. rcu_read_lock();
  46. list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry)
  47. if (READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTED) {
  48. connected = true;
  49. break;
  50. }
  51. rcu_read_unlock();
  52. return connected;
  53. }
  54. static struct rtrs_permit *
  55. __rtrs_get_permit(struct rtrs_clt_sess *clt, enum rtrs_clt_con_type con_type)
  56. {
  57. size_t max_depth = clt->queue_depth;
  58. struct rtrs_permit *permit;
  59. int bit;
  60. /*
  61. * Adapted from null_blk get_tag(). Callers from different cpus may
  62. * grab the same bit, since find_first_zero_bit is not atomic.
  63. * But then the test_and_set_bit_lock will fail for all the
  64. * callers but one, so that they will loop again.
  65. * This way an explicit spinlock is not required.
  66. */
  67. do {
  68. bit = find_first_zero_bit(clt->permits_map, max_depth);
  69. if (bit >= max_depth)
  70. return NULL;
  71. } while (test_and_set_bit_lock(bit, clt->permits_map));
  72. permit = get_permit(clt, bit);
  73. WARN_ON(permit->mem_id != bit);
  74. permit->cpu_id = raw_smp_processor_id();
  75. permit->con_type = con_type;
  76. return permit;
  77. }
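/*
 * Self-contained userspace analogue of the lockless permit allocation
 * above (illustration only, not part of the driver; the demo_* names
 * are hypothetical).  The mapping is: find_first_zero_bit() -> relaxed
 * scan, test_and_set_bit_lock() -> exchange with acquire ordering,
 * clear_bit_unlock() -> store with release ordering.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_QUEUE_DEPTH 64

static atomic_bool demo_slots[DEMO_QUEUE_DEPTH];

/* Returns a claimed slot index, or -1 if every slot is busy. */
static int demo_get_slot(void)
{
	int i;

	for (i = 0; i < DEMO_QUEUE_DEPTH; i++) {
		/* Cheap racy scan first, like find_first_zero_bit() */
		if (atomic_load_explicit(&demo_slots[i], memory_order_relaxed))
			continue;
		/* Only one of the racing claimants succeeds here */
		if (!atomic_exchange_explicit(&demo_slots[i], true,
					      memory_order_acquire))
			return i;
		/* Lost the race for this slot, keep scanning */
	}
	return -1;
}

static void demo_put_slot(int i)
{
	/* Pairs with the acquire in demo_get_slot() */
	atomic_store_explicit(&demo_slots[i], false, memory_order_release);
}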
  78. static inline void __rtrs_put_permit(struct rtrs_clt_sess *clt,
  79. struct rtrs_permit *permit)
  80. {
  81. clear_bit_unlock(permit->mem_id, clt->permits_map);
  82. }
  83. /**
  84. * rtrs_clt_get_permit() - allocates permit for future RDMA operation
  85. * @clt: Current session
  86. * @con_type: Type of connection to use with the permit
  87. * @can_wait: Wait type
  88. *
  89. * Description:
  90. * Allocates permit for the following RDMA operation. Permit is used
  91. * to preallocate all resources and to propagate memory pressure
  92. * up earlier.
  93. *
  94. * Context:
  95. * Can sleep if @can_wait == RTRS_PERMIT_WAIT
  96. */
  97. struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt_sess *clt,
  98. enum rtrs_clt_con_type con_type,
  99. enum wait_type can_wait)
  100. {
  101. struct rtrs_permit *permit;
  102. DEFINE_WAIT(wait);
  103. permit = __rtrs_get_permit(clt, con_type);
  104. if (permit || !can_wait)
  105. return permit;
  106. do {
  107. prepare_to_wait(&clt->permits_wait, &wait,
  108. TASK_UNINTERRUPTIBLE);
  109. permit = __rtrs_get_permit(clt, con_type);
  110. if (permit)
  111. break;
  112. io_schedule();
  113. } while (1);
  114. finish_wait(&clt->permits_wait, &wait);
  115. return permit;
  116. }
  117. EXPORT_SYMBOL(rtrs_clt_get_permit);
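/*
 * Hedged usage sketch (not part of this file): how an upper layer such
 * as a block driver typically pairs rtrs_clt_get_permit() with
 * rtrs_clt_put_permit(), which is defined just below.  demo_issue_io()
 * is hypothetical; RTRS_IO_CON and the wait_type values come from the
 * rtrs headers.
 */
static int demo_issue_io(struct rtrs_clt_sess *clt, bool can_block)
{
	struct rtrs_permit *permit;

	permit = rtrs_clt_get_permit(clt, RTRS_IO_CON,
				     can_block ? RTRS_PERMIT_WAIT :
						 RTRS_PERMIT_NOWAIT);
	if (!permit)
		return -EBUSY;	/* queue full and the caller must not sleep */

	/*
	 * ... build and post the request using this permit.  In a real
	 * driver the permit is released from the IO completion callback;
	 * it is put immediately here only to keep the sketch short.
	 */
	rtrs_clt_put_permit(clt, permit);
	return 0;
}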
  118. /**
  119. * rtrs_clt_put_permit() - puts allocated permit
  120. * @clt: Current session
  121. * @permit: Permit to be freed
  122. *
  123. * Context:
  124. * Does not matter
  125. */
  126. void rtrs_clt_put_permit(struct rtrs_clt_sess *clt,
  127. struct rtrs_permit *permit)
  128. {
  129. if (WARN_ON(!test_bit(permit->mem_id, clt->permits_map)))
  130. return;
  131. __rtrs_put_permit(clt, permit);
  132. /*
  133. * rtrs_clt_get_permit() adds itself to the &clt->permits_wait list
  134. * before calling schedule(). So if rtrs_clt_get_permit() is sleeping
  135. * it must have added itself to &clt->permits_wait before
  136. * __rtrs_put_permit() finished.
  137. * Hence it is safe to guard wake_up() with a waitqueue_active() test.
  138. */
  139. if (waitqueue_active(&clt->permits_wait))
  140. wake_up(&clt->permits_wait);
  141. }
  142. EXPORT_SYMBOL(rtrs_clt_put_permit);
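/*
 * Generic shape of the waitqueue_active() optimisation relied upon in
 * rtrs_clt_put_permit() above: the waiter enqueues itself *before*
 * re-checking the condition, so a waker that observes an empty
 * waitqueue may legitimately skip wake_up().  Illustration only; the
 * demo_* names are hypothetical, not part of the driver.
 */
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_cond;

static void demo_waiter(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* Enqueue first, then re-check the condition */
		prepare_to_wait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(demo_cond))
			break;
		schedule();
	}
	finish_wait(&demo_wq, &wait);
}

static void demo_waker(void)
{
	WRITE_ONCE(demo_cond, true);
	/*
	 * Pairs with the barrier implied by set_current_state() inside
	 * prepare_to_wait(), so the waker cannot miss a waiter that is
	 * about to re-check the condition.
	 */
	smp_mb();
	if (waitqueue_active(&demo_wq))
		wake_up(&demo_wq);
}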
  143. /**
  144. * rtrs_permit_to_clt_con() - returns RDMA connection pointer by the permit
  145. * @clt_path: client path pointer
  146. * @permit: permit for the allocation of the RDMA buffer
  147. * Note:
  148. * IO connections start from 1.
  149. * Connection 0 is reserved for user messages.
  150. */
  151. static
  152. struct rtrs_clt_con *rtrs_permit_to_clt_con(struct rtrs_clt_path *clt_path,
  153. struct rtrs_permit *permit)
  154. {
  155. int id = 0;
  156. if (permit->con_type == RTRS_IO_CON)
  157. id = (permit->cpu_id % (clt_path->s.irq_con_num - 1)) + 1;
  158. return to_clt_con(clt_path->s.con[id]);
  159. }
  160. /**
  161. * rtrs_clt_change_state() - change the session state through the session
  162. * state machine.
  163. *
  164. * @clt_path: client path to change the state of.
  165. * @new_state: state to change to.
  166. *
  167. * Returns true if the session state is changed to the new state, otherwise returns false.
  168. *
  169. * Locks:
  170. * state_wq lock must be held.
  171. */
  172. static bool rtrs_clt_change_state(struct rtrs_clt_path *clt_path,
  173. enum rtrs_clt_state new_state)
  174. {
  175. enum rtrs_clt_state old_state;
  176. bool changed = false;
  177. lockdep_assert_held(&clt_path->state_wq.lock);
  178. old_state = clt_path->state;
  179. switch (new_state) {
  180. case RTRS_CLT_CONNECTING:
  181. switch (old_state) {
  182. case RTRS_CLT_RECONNECTING:
  183. changed = true;
  184. fallthrough;
  185. default:
  186. break;
  187. }
  188. break;
  189. case RTRS_CLT_RECONNECTING:
  190. switch (old_state) {
  191. case RTRS_CLT_CONNECTED:
  192. case RTRS_CLT_CONNECTING_ERR:
  193. case RTRS_CLT_CLOSED:
  194. changed = true;
  195. fallthrough;
  196. default:
  197. break;
  198. }
  199. break;
  200. case RTRS_CLT_CONNECTED:
  201. switch (old_state) {
  202. case RTRS_CLT_CONNECTING:
  203. changed = true;
  204. fallthrough;
  205. default:
  206. break;
  207. }
  208. break;
  209. case RTRS_CLT_CONNECTING_ERR:
  210. switch (old_state) {
  211. case RTRS_CLT_CONNECTING:
  212. changed = true;
  213. fallthrough;
  214. default:
  215. break;
  216. }
  217. break;
  218. case RTRS_CLT_CLOSING:
  219. switch (old_state) {
  220. case RTRS_CLT_CONNECTING:
  221. case RTRS_CLT_CONNECTING_ERR:
  222. case RTRS_CLT_RECONNECTING:
  223. case RTRS_CLT_CONNECTED:
  224. changed = true;
  225. fallthrough;
  226. default:
  227. break;
  228. }
  229. break;
  230. case RTRS_CLT_CLOSED:
  231. switch (old_state) {
  232. case RTRS_CLT_CLOSING:
  233. changed = true;
  234. fallthrough;
  235. default:
  236. break;
  237. }
  238. break;
  239. case RTRS_CLT_DEAD:
  240. switch (old_state) {
  241. case RTRS_CLT_CLOSED:
  242. changed = true;
  243. fallthrough;
  244. default:
  245. break;
  246. }
  247. break;
  248. default:
  249. break;
  250. }
  251. if (changed) {
  252. clt_path->state = new_state;
  253. wake_up_locked(&clt_path->state_wq);
  254. }
  255. return changed;
  256. }
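/*
 * For quick reference, the transitions permitted by the switch above,
 * read directly from its cases (RTRS_CLT_ prefixes dropped; any pair
 * not listed leaves the state unchanged and returns false):
 *
 *   CONNECTING      <-  RECONNECTING
 *   RECONNECTING    <-  CONNECTED, CONNECTING_ERR, CLOSED
 *   CONNECTED       <-  CONNECTING
 *   CONNECTING_ERR  <-  CONNECTING
 *   CLOSING         <-  CONNECTING, CONNECTING_ERR, RECONNECTING, CONNECTED
 *   CLOSED          <-  CLOSING
 *   DEAD            <-  CLOSED
 */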
  257. static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
  258. enum rtrs_clt_state old_state,
  259. enum rtrs_clt_state new_state)
  260. {
  261. bool changed = false;
  262. spin_lock_irq(&clt_path->state_wq.lock);
  263. if (clt_path->state == old_state)
  264. changed = rtrs_clt_change_state(clt_path, new_state);
  265. spin_unlock_irq(&clt_path->state_wq.lock);
  266. return changed;
  267. }
  268. static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
  269. static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
  270. {
  271. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  272. trace_rtrs_rdma_error_recovery(clt_path);
  273. if (rtrs_clt_change_state_from_to(clt_path,
  274. RTRS_CLT_CONNECTED,
  275. RTRS_CLT_RECONNECTING)) {
  276. queue_work(rtrs_wq, &clt_path->err_recovery_work);
  277. } else {
  278. /*
  279. * An error can happen only while establishing a new connection,
  280. * so notify the waiter with the error state; the waiter is responsible
  281. * for cleaning up the rest and reconnecting if needed.
  282. */
  283. rtrs_clt_change_state_from_to(clt_path,
  284. RTRS_CLT_CONNECTING,
  285. RTRS_CLT_CONNECTING_ERR);
  286. }
  287. }
  288. static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc)
  289. {
  290. struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
  291. if (wc->status != IB_WC_SUCCESS) {
  292. rtrs_err(con->c.path, "Failed IB_WR_REG_MR: %s\n",
  293. ib_wc_status_msg(wc->status));
  294. rtrs_rdma_error_recovery(con);
  295. }
  296. }
  297. static struct ib_cqe fast_reg_cqe = {
  298. .done = rtrs_clt_fast_reg_done
  299. };
  300. static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
  301. bool notify, bool can_wait);
  302. static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
  303. {
  304. struct rtrs_clt_io_req *req =
  305. container_of(wc->wr_cqe, typeof(*req), inv_cqe);
  306. struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
  307. if (wc->status != IB_WC_SUCCESS) {
  308. rtrs_err(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n",
  309. ib_wc_status_msg(wc->status));
  310. rtrs_rdma_error_recovery(con);
  311. }
  312. req->need_inv = false;
  313. if (req->need_inv_comp)
  314. complete(&req->inv_comp);
  315. else
  316. /* Complete request from INV callback */
  317. complete_rdma_req(req, req->inv_errno, true, false);
  318. }
  319. static int rtrs_inv_rkey(struct rtrs_clt_io_req *req)
  320. {
  321. struct rtrs_clt_con *con = req->con;
  322. struct ib_send_wr wr = {
  323. .opcode = IB_WR_LOCAL_INV,
  324. .wr_cqe = &req->inv_cqe,
  325. .send_flags = IB_SEND_SIGNALED,
  326. .ex.invalidate_rkey = req->mr->rkey,
  327. };
  328. req->inv_cqe.done = rtrs_clt_inv_rkey_done;
  329. return ib_post_send(con->c.qp, &wr, NULL);
  330. }
  331. static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
  332. bool notify, bool can_wait)
  333. {
  334. struct rtrs_clt_con *con = req->con;
  335. struct rtrs_clt_path *clt_path;
  336. int err;
  337. if (!req->in_use)
  338. return;
  339. if (WARN_ON(!req->con))
  340. return;
  341. clt_path = to_clt_path(con->c.path);
  342. if (req->sg_cnt) {
  343. if (req->dir == DMA_FROM_DEVICE && req->need_inv) {
  344. /*
  345. * We are here to invalidate read requests
  346. * ourselves. In the normal scenario the server should
  347. * send an INV for all read requests, but since
  348. * we got here, one of two things happened:
  349. *
  350. * 1. this is a failover, when errno != 0
  351. * and can_wait == 1,
  352. *
  353. * 2. something went badly wrong and the
  354. * server did not send the INV, so we
  355. * must do it ourselves.
  356. */
  357. if (can_wait) {
  358. req->need_inv_comp = true;
  359. } else {
  360. /* This should be IO path, so always notify */
  361. WARN_ON(!notify);
  362. /* Save errno for INV callback */
  363. req->inv_errno = errno;
  364. }
  365. refcount_inc(&req->ref);
  366. err = rtrs_inv_rkey(req);
  367. if (err) {
  368. rtrs_err(con->c.path, "Send INV WR key=%#x: %d\n",
  369. req->mr->rkey, err);
  370. } else if (can_wait) {
  371. wait_for_completion(&req->inv_comp);
  372. } else {
  373. /*
  374. * Something went wrong, so request will be
  375. * completed from INV callback.
  376. */
  377. WARN_ON_ONCE(1);
  378. return;
  379. }
  380. if (!refcount_dec_and_test(&req->ref))
  381. return;
  382. }
  383. ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
  384. req->sg_cnt, req->dir);
  385. }
  386. if (!refcount_dec_and_test(&req->ref))
  387. return;
  388. if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
  389. atomic_dec(&clt_path->stats->inflight);
  390. req->in_use = false;
  391. req->con = NULL;
  392. if (errno) {
  393. rtrs_err_rl(con->c.path, "IO request failed: error=%d path=%s [%s:%u] notify=%d\n",
  394. errno, kobject_name(&clt_path->kobj), clt_path->hca_name,
  395. clt_path->hca_port, notify);
  396. }
  397. if (notify)
  398. req->conf(req->priv, errno);
  399. }
  400. static int rtrs_post_send_rdma(struct rtrs_clt_con *con,
  401. struct rtrs_clt_io_req *req,
  402. struct rtrs_rbuf *rbuf, u32 off,
  403. u32 imm, struct ib_send_wr *wr)
  404. {
  405. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  406. enum ib_send_flags flags;
  407. struct ib_sge sge;
  408. if (!req->sg_size) {
  409. rtrs_wrn(con->c.path,
  410. "Doing RDMA Write failed, no data supplied\n");
  411. return -EINVAL;
  412. }
  413. /* user data and user message in the first list element */
  414. sge.addr = req->iu->dma_addr;
  415. sge.length = req->sg_size;
  416. sge.lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
  417. /*
  418. * From time to time we have to post signalled sends,
  419. * or send queue will fill up and only QP reset can help.
  420. */
  421. flags = atomic_inc_return(&con->c.wr_cnt) % clt_path->s.signal_interval ?
  422. 0 : IB_SEND_SIGNALED;
  423. ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
  424. req->iu->dma_addr,
  425. req->sg_size, DMA_TO_DEVICE);
  426. return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, &sge, 1,
  427. rbuf->rkey, rbuf->addr + off,
  428. imm, flags, wr, NULL);
  429. }
  430. static void process_io_rsp(struct rtrs_clt_path *clt_path, u32 msg_id,
  431. s16 errno, bool w_inval)
  432. {
  433. struct rtrs_clt_io_req *req;
  434. if (WARN_ON(msg_id >= clt_path->queue_depth))
  435. return;
  436. req = &clt_path->reqs[msg_id];
  437. /* Drop need_inv if server responded with send with invalidation */
  438. req->need_inv &= !w_inval;
  439. complete_rdma_req(req, errno, true, false);
  440. }
  441. static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc)
  442. {
  443. struct rtrs_iu *iu;
  444. int err;
  445. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  446. WARN_ON((clt_path->flags & RTRS_MSG_NEW_RKEY_F) == 0);
  447. iu = container_of(wc->wr_cqe, struct rtrs_iu,
  448. cqe);
  449. err = rtrs_iu_post_recv(&con->c, iu);
  450. if (err) {
  451. rtrs_err(con->c.path, "post iu failed %d\n", err);
  452. rtrs_rdma_error_recovery(con);
  453. }
  454. }
  455. static void rtrs_clt_rkey_rsp_done(struct rtrs_clt_con *con, struct ib_wc *wc)
  456. {
  457. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  458. struct rtrs_msg_rkey_rsp *msg;
  459. u32 imm_type, imm_payload;
  460. bool w_inval = false;
  461. struct rtrs_iu *iu;
  462. u32 buf_id;
  463. int err;
  464. WARN_ON((clt_path->flags & RTRS_MSG_NEW_RKEY_F) == 0);
  465. iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
  466. if (wc->byte_len < sizeof(*msg)) {
  467. rtrs_err(con->c.path, "rkey response is malformed: size %d\n",
  468. wc->byte_len);
  469. goto out;
  470. }
  471. ib_dma_sync_single_for_cpu(clt_path->s.dev->ib_dev, iu->dma_addr,
  472. iu->size, DMA_FROM_DEVICE);
  473. msg = iu->buf;
  474. if (le16_to_cpu(msg->type) != RTRS_MSG_RKEY_RSP) {
  475. rtrs_err(clt_path->clt,
  476. "rkey response is malformed: type %d\n",
  477. le16_to_cpu(msg->type));
  478. goto out;
  479. }
  480. buf_id = le16_to_cpu(msg->buf_id);
  481. if (WARN_ON(buf_id >= clt_path->queue_depth))
  482. goto out;
  483. rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload);
  484. if (imm_type == RTRS_IO_RSP_IMM ||
  485. imm_type == RTRS_IO_RSP_W_INV_IMM) {
  486. u32 msg_id;
  487. w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
  488. rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
  489. if (WARN_ON(buf_id != msg_id))
  490. goto out;
  491. clt_path->rbufs[buf_id].rkey = le32_to_cpu(msg->rkey);
  492. process_io_rsp(clt_path, msg_id, err, w_inval);
  493. }
  494. ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev, iu->dma_addr,
  495. iu->size, DMA_FROM_DEVICE);
  496. return rtrs_clt_recv_done(con, wc);
  497. out:
  498. rtrs_rdma_error_recovery(con);
  499. }
  500. static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc);
  501. static struct ib_cqe io_comp_cqe = {
  502. .done = rtrs_clt_rdma_done
  503. };
  504. /*
  505. * Post x2 empty WRs: first is for this RDMA with IMM,
  506. * second is for RECV with INV, which happened earlier.
  507. */
  508. static int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe)
  509. {
  510. struct ib_recv_wr wr_arr[2], *wr;
  511. int i;
  512. memset(wr_arr, 0, sizeof(wr_arr));
  513. for (i = 0; i < ARRAY_SIZE(wr_arr); i++) {
  514. wr = &wr_arr[i];
  515. wr->wr_cqe = cqe;
  516. if (i)
  517. /* Chain backwards */
  518. wr->next = &wr_arr[i - 1];
  519. }
  520. return ib_post_recv(con->qp, wr, NULL);
  521. }
  522. static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
  523. {
  524. struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
  525. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  526. u32 imm_type, imm_payload;
  527. bool w_inval = false;
  528. int err;
  529. if (wc->status != IB_WC_SUCCESS) {
  530. if (wc->status != IB_WC_WR_FLUSH_ERR) {
  531. rtrs_err(clt_path->clt, "RDMA failed: %s\n",
  532. ib_wc_status_msg(wc->status));
  533. rtrs_rdma_error_recovery(con);
  534. }
  535. return;
  536. }
  537. rtrs_clt_update_wc_stats(con);
  538. switch (wc->opcode) {
  539. case IB_WC_RECV_RDMA_WITH_IMM:
  540. /*
  541. * post_recv() RDMA write completions of IO reqs (read/write)
  542. * and hb
  543. */
  544. if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done))
  545. return;
  546. rtrs_from_imm(be32_to_cpu(wc->ex.imm_data),
  547. &imm_type, &imm_payload);
  548. if (imm_type == RTRS_IO_RSP_IMM ||
  549. imm_type == RTRS_IO_RSP_W_INV_IMM) {
  550. u32 msg_id;
  551. w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
  552. rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
  553. process_io_rsp(clt_path, msg_id, err, w_inval);
  554. } else if (imm_type == RTRS_HB_MSG_IMM) {
  555. WARN_ON(con->c.cid);
  556. rtrs_send_hb_ack(&clt_path->s);
  557. if (clt_path->flags & RTRS_MSG_NEW_RKEY_F)
  558. return rtrs_clt_recv_done(con, wc);
  559. } else if (imm_type == RTRS_HB_ACK_IMM) {
  560. WARN_ON(con->c.cid);
  561. clt_path->s.hb_missed_cnt = 0;
  562. clt_path->s.hb_cur_latency =
  563. ktime_sub(ktime_get(), clt_path->s.hb_last_sent);
  564. if (clt_path->flags & RTRS_MSG_NEW_RKEY_F)
  565. return rtrs_clt_recv_done(con, wc);
  566. } else {
  567. rtrs_wrn(con->c.path, "Unknown IMM type %u\n",
  568. imm_type);
  569. }
  570. if (w_inval)
  571. /*
  572. * Post x2 empty WRs: first is for this RDMA with IMM,
  573. * second is for RECV with INV, which happened earlier.
  574. */
  575. err = rtrs_post_recv_empty_x2(&con->c, &io_comp_cqe);
  576. else
  577. err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
  578. if (err) {
  579. rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n",
  580. err);
  581. rtrs_rdma_error_recovery(con);
  582. }
  583. break;
  584. case IB_WC_RECV:
  585. /*
  586. * Key invalidations from server side
  587. */
  588. WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE ||
  589. wc->wc_flags & IB_WC_WITH_IMM));
  590. WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done);
  591. if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) {
  592. if (wc->wc_flags & IB_WC_WITH_INVALIDATE)
  593. return rtrs_clt_recv_done(con, wc);
  594. return rtrs_clt_rkey_rsp_done(con, wc);
  595. }
  596. break;
  597. case IB_WC_RDMA_WRITE:
  598. /*
  599. * post_send() RDMA write completions of IO reqs (read/write)
  600. * and hb.
  601. */
  602. break;
  603. default:
  604. rtrs_wrn(clt_path->clt, "Unexpected WC type: %d\n", wc->opcode);
  605. return;
  606. }
  607. }
  608. static int post_recv_io(struct rtrs_clt_con *con, size_t q_size)
  609. {
  610. int err, i;
  611. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  612. for (i = 0; i < q_size; i++) {
  613. if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) {
  614. struct rtrs_iu *iu = &con->rsp_ius[i];
  615. err = rtrs_iu_post_recv(&con->c, iu);
  616. } else {
  617. err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
  618. }
  619. if (err)
  620. return err;
  621. }
  622. return 0;
  623. }
  624. static int post_recv_path(struct rtrs_clt_path *clt_path)
  625. {
  626. size_t q_size = 0;
  627. int err, cid;
  628. for (cid = 0; cid < clt_path->s.con_num; cid++) {
  629. if (cid == 0)
  630. q_size = SERVICE_CON_QUEUE_DEPTH;
  631. else
  632. q_size = clt_path->queue_depth;
  633. /*
  634. * x2 for RDMA read responses + FR key invalidations,
  635. * RDMA writes do not require any FR registrations.
  636. */
  637. q_size *= 2;
  638. err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size);
  639. if (err) {
  640. rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n",
  641. err);
  642. return err;
  643. }
  644. }
  645. return 0;
  646. }
  647. struct path_it {
  648. int i;
  649. struct list_head skip_list;
  650. struct rtrs_clt_sess *clt;
  651. struct rtrs_clt_path *(*next_path)(struct path_it *it);
  652. };
  653. /*
  654. * rtrs_clt_get_next_path_or_null - get clt path from the list or return NULL
  655. * @head: the head for the list.
  656. * @clt_path: The element to take the next clt_path from.
  657. *
  658. * The next clt path is returned in round-robin fashion, i.e. the head will
  659. * be skipped; if the list is observed as empty, NULL is returned.
  660. *
  661. * This function may safely run concurrently with the _rcu list-mutation
  662. * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
  663. */
  664. static inline struct rtrs_clt_path *
  665. rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path)
  666. {
  667. return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?:
  668. list_next_or_null_rcu(head,
  669. READ_ONCE((&clt_path->s.entry)->next),
  670. typeof(*clt_path), s.entry);
  671. }
  672. /**
  673. * get_next_path_rr() - Returns path in round-robin fashion.
  674. * @it: the path pointer
  675. *
  676. * Related to @MP_POLICY_RR
  677. *
  678. * Locks:
  679. * rcu_read_lock() must be held.
  680. */
  681. static struct rtrs_clt_path *get_next_path_rr(struct path_it *it)
  682. {
  683. struct rtrs_clt_path __rcu **ppcpu_path;
  684. struct rtrs_clt_path *path;
  685. struct rtrs_clt_sess *clt;
  686. clt = it->clt;
  687. /*
  688. * Here we use two RCU objects: @paths_list and @pcpu_path
  689. * pointer. See rtrs_clt_remove_path_from_arr() for details
  690. * how that is handled.
  691. */
  692. ppcpu_path = this_cpu_ptr(clt->pcpu_path);
  693. path = rcu_dereference(*ppcpu_path);
  694. if (!path)
  695. path = list_first_or_null_rcu(&clt->paths_list,
  696. typeof(*path), s.entry);
  697. else
  698. path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path);
  699. rcu_assign_pointer(*ppcpu_path, path);
  700. return path;
  701. }
  702. /**
  703. * get_next_path_min_inflight() - Returns path with minimal inflight count.
  704. * @it: the path pointer
  705. *
  706. * Related to @MP_POLICY_MIN_INFLIGHT
  707. *
  708. * Locks:
  709. * rcu_read_lock() must be held.
  710. */
  711. static struct rtrs_clt_path *get_next_path_min_inflight(struct path_it *it)
  712. {
  713. struct rtrs_clt_path *min_path = NULL;
  714. struct rtrs_clt_sess *clt = it->clt;
  715. struct rtrs_clt_path *clt_path;
  716. int min_inflight = INT_MAX;
  717. int inflight;
  718. list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry) {
  719. if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
  720. continue;
  721. if (!list_empty(raw_cpu_ptr(clt_path->mp_skip_entry)))
  722. continue;
  723. inflight = atomic_read(&clt_path->stats->inflight);
  724. if (inflight < min_inflight) {
  725. min_inflight = inflight;
  726. min_path = clt_path;
  727. }
  728. }
  729. /*
  730. * add the path to the skip list, so that next time we can get
  731. * a different one
  732. */
  733. if (min_path)
  734. list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
  735. return min_path;
  736. }
  737. /**
  738. * get_next_path_min_latency() - Returns path with minimal latency.
  739. * @it: the path pointer
  740. *
  741. * Return: a path with the lowest latency or NULL if all paths are tried
  742. *
  743. * Locks:
  744. * rcu_read_lock() must be held.
  745. *
  746. * Related to @MP_POLICY_MIN_LATENCY
  747. *
  748. * This DOES skip already-tried paths.
  749. * A skip-list is used to skip a path that has been tried but failed.
  750. * It will try the minimum latency path, then the second minimum latency
  751. * path, and so on; finally it returns NULL once all paths have been tried.
  752. * Therefore the caller MUST check whether the returned
  753. * path is NULL and trigger the IO error in that case.
  754. */
  755. static struct rtrs_clt_path *get_next_path_min_latency(struct path_it *it)
  756. {
  757. struct rtrs_clt_path *min_path = NULL;
  758. struct rtrs_clt_sess *clt = it->clt;
  759. struct rtrs_clt_path *clt_path;
  760. ktime_t min_latency = KTIME_MAX;
  761. ktime_t latency;
  762. list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry) {
  763. if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
  764. continue;
  765. if (!list_empty(raw_cpu_ptr(clt_path->mp_skip_entry)))
  766. continue;
  767. latency = clt_path->s.hb_cur_latency;
  768. if (latency < min_latency) {
  769. min_latency = latency;
  770. min_path = clt_path;
  771. }
  772. }
  773. /*
  774. * add the path to the skip list, so that next time we can get
  775. * a different one
  776. */
  777. if (min_path)
  778. list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
  779. return min_path;
  780. }
  781. static inline void path_it_init(struct path_it *it, struct rtrs_clt_sess *clt)
  782. {
  783. INIT_LIST_HEAD(&it->skip_list);
  784. it->clt = clt;
  785. it->i = 0;
  786. if (clt->mp_policy == MP_POLICY_RR)
  787. it->next_path = get_next_path_rr;
  788. else if (clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
  789. it->next_path = get_next_path_min_inflight;
  790. else
  791. it->next_path = get_next_path_min_latency;
  792. }
  793. static inline void path_it_deinit(struct path_it *it)
  794. {
  795. struct list_head *skip, *tmp;
  796. /*
  797. * The skip_list is used only for the MIN_INFLIGHT and MIN_LATENCY policies.
  798. * We need to remove paths from it, so that the next IO can insert
  799. * paths (->mp_skip_entry) into the skip_list again.
  800. */
  801. list_for_each_safe(skip, tmp, &it->skip_list)
  802. list_del_init(skip);
  803. }
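/*
 * Condensed usage shape of the path iterator, mirroring
 * rtrs_clt_failover_req() further below: the caller holds
 * rcu_read_lock() around init/next/deinit.  demo_try_path() and
 * demo_walk_paths() are hypothetical, not part of the driver.
 */
static int demo_try_path(struct rtrs_clt_path *clt_path); /* hypothetical */

static int demo_walk_paths(struct rtrs_clt_sess *clt)
{
	struct rtrs_clt_path *clt_path;
	int err = -ECONNABORTED;
	struct path_it it;

	rcu_read_lock();
	for (path_it_init(&it, clt);
	     (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num;
	     it.i++) {
		if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
			continue;
		err = demo_try_path(clt_path);
		if (!err)
			break;	/* stop at the first path that accepts the IO */
	}
	path_it_deinit(&it);
	rcu_read_unlock();

	return err;
}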
  804. /**
  805. * rtrs_clt_init_req() - Initialize an rtrs_clt_io_req holding information
  806. * about an inflight IO.
  807. * The user buffer holding user control message (not data) is copied into
  808. * the corresponding buffer of rtrs_iu (req->iu->buf), which later on will
  809. * also hold the control message of rtrs.
  810. * @req: an io request holding information about IO.
  811. * @clt_path: client path
  812. * @conf: confirmation callback function to notify the upper layer.
  813. * @permit: permit for allocation of RDMA remote buffer
  814. * @priv: private pointer
  815. * @vec: kernel vector containing control message
  816. * @usr_len: length of the user message
  817. * @sg: scatter list for IO data
  818. * @sg_cnt: number of scatter list entries
  819. * @data_len: length of the IO data
  820. * @dir: direction of the IO.
  821. */
  822. static void rtrs_clt_init_req(struct rtrs_clt_io_req *req,
  823. struct rtrs_clt_path *clt_path,
  824. void (*conf)(void *priv, int errno),
  825. struct rtrs_permit *permit, void *priv,
  826. const struct kvec *vec, size_t usr_len,
  827. struct scatterlist *sg, size_t sg_cnt,
  828. size_t data_len, int dir)
  829. {
  830. struct iov_iter iter;
  831. size_t len;
  832. req->permit = permit;
  833. req->in_use = true;
  834. req->usr_len = usr_len;
  835. req->data_len = data_len;
  836. req->sglist = sg;
  837. req->sg_cnt = sg_cnt;
  838. req->priv = priv;
  839. req->dir = dir;
  840. req->con = rtrs_permit_to_clt_con(clt_path, permit);
  841. req->conf = conf;
  842. req->need_inv = false;
  843. req->need_inv_comp = false;
  844. req->inv_errno = 0;
  845. refcount_set(&req->ref, 1);
  846. req->mp_policy = clt_path->clt->mp_policy;
  847. iov_iter_kvec(&iter, ITER_SOURCE, vec, 1, usr_len);
  848. len = _copy_from_iter(req->iu->buf, usr_len, &iter);
  849. WARN_ON(len != usr_len);
  850. reinit_completion(&req->inv_comp);
  851. }
  852. static struct rtrs_clt_io_req *
  853. rtrs_clt_get_req(struct rtrs_clt_path *clt_path,
  854. void (*conf)(void *priv, int errno),
  855. struct rtrs_permit *permit, void *priv,
  856. const struct kvec *vec, size_t usr_len,
  857. struct scatterlist *sg, size_t sg_cnt,
  858. size_t data_len, int dir)
  859. {
  860. struct rtrs_clt_io_req *req;
  861. req = &clt_path->reqs[permit->mem_id];
  862. rtrs_clt_init_req(req, clt_path, conf, permit, priv, vec, usr_len,
  863. sg, sg_cnt, data_len, dir);
  864. return req;
  865. }
  866. static struct rtrs_clt_io_req *
  867. rtrs_clt_get_copy_req(struct rtrs_clt_path *alive_path,
  868. struct rtrs_clt_io_req *fail_req)
  869. {
  870. struct rtrs_clt_io_req *req;
  871. struct kvec vec = {
  872. .iov_base = fail_req->iu->buf,
  873. .iov_len = fail_req->usr_len
  874. };
  875. req = &alive_path->reqs[fail_req->permit->mem_id];
  876. rtrs_clt_init_req(req, alive_path, fail_req->conf, fail_req->permit,
  877. fail_req->priv, &vec, fail_req->usr_len,
  878. fail_req->sglist, fail_req->sg_cnt,
  879. fail_req->data_len, fail_req->dir);
  880. return req;
  881. }
  882. static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con,
  883. struct rtrs_clt_io_req *req,
  884. struct rtrs_rbuf *rbuf, bool fr_en,
  885. u32 count, u32 size, u32 imm,
  886. struct ib_send_wr *wr,
  887. struct ib_send_wr *tail)
  888. {
  889. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  890. struct ib_sge *sge = req->sge;
  891. enum ib_send_flags flags;
  892. struct scatterlist *sg;
  893. size_t num_sge;
  894. int i;
  895. struct ib_send_wr *ptail = NULL;
  896. if (fr_en) {
  897. i = 0;
  898. sge[i].addr = req->mr->iova;
  899. sge[i].length = req->mr->length;
  900. sge[i].lkey = req->mr->lkey;
  901. i++;
  902. num_sge = 2;
  903. ptail = tail;
  904. } else {
  905. for_each_sg(req->sglist, sg, count, i) {
  906. sge[i].addr = sg_dma_address(sg);
  907. sge[i].length = sg_dma_len(sg);
  908. sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
  909. }
  910. num_sge = 1 + count;
  911. }
  912. sge[i].addr = req->iu->dma_addr;
  913. sge[i].length = size;
  914. sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
  915. /*
  916. * From time to time we have to post signalled sends,
  917. * or send queue will fill up and only QP reset can help.
  918. */
  919. flags = atomic_inc_return(&con->c.wr_cnt) % clt_path->s.signal_interval ?
  920. 0 : IB_SEND_SIGNALED;
  921. ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
  922. req->iu->dma_addr,
  923. size, DMA_TO_DEVICE);
  924. return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, sge, num_sge,
  925. rbuf->rkey, rbuf->addr, imm,
  926. flags, wr, ptail);
  927. }
  928. static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count)
  929. {
  930. int nr;
  931. /* Align the MR to a 4K page size to match the block virt boundary */
  932. nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K);
  933. if (nr < 0)
  934. return nr;
  935. if (nr < req->sg_cnt)
  936. return -EINVAL;
  937. ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
  938. return nr;
  939. }
  940. static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
  941. {
  942. struct rtrs_clt_con *con = req->con;
  943. struct rtrs_path *s = con->c.path;
  944. struct rtrs_clt_path *clt_path = to_clt_path(s);
  945. struct rtrs_msg_rdma_write *msg;
  946. struct rtrs_rbuf *rbuf;
  947. int ret, count = 0;
  948. u32 imm, buf_id;
  949. struct ib_reg_wr rwr;
  950. struct ib_send_wr inv_wr;
  951. struct ib_send_wr *wr = NULL;
  952. bool fr_en = false;
  953. const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
  954. if (tsize > clt_path->chunk_size) {
  955. rtrs_wrn(s, "Write request failed, size too big %zu > %d\n",
  956. tsize, clt_path->chunk_size);
  957. return -EMSGSIZE;
  958. }
  959. if (req->sg_cnt) {
  960. count = ib_dma_map_sg(clt_path->s.dev->ib_dev, req->sglist,
  961. req->sg_cnt, req->dir);
  962. if (!count) {
  963. rtrs_wrn(s, "Write request failed, map failed\n");
  964. return -EINVAL;
  965. }
  966. }
  967. /* put rtrs msg after sg and user message */
  968. msg = req->iu->buf + req->usr_len;
  969. msg->type = cpu_to_le16(RTRS_MSG_WRITE);
  970. msg->usr_len = cpu_to_le16(req->usr_len);
  971. /* rtrs message on server side will be after user data and message */
  972. imm = req->permit->mem_off + req->data_len + req->usr_len;
  973. imm = rtrs_to_io_req_imm(imm);
  974. buf_id = req->permit->mem_id;
  975. req->sg_size = tsize;
  976. rbuf = &clt_path->rbufs[buf_id];
  977. if (count) {
  978. ret = rtrs_map_sg_fr(req, count);
  979. if (ret < 0) {
  980. rtrs_err_rl(s,
  981. "Write request failed, failed to map fast reg. data, err: %d\n",
  982. ret);
  983. ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
  984. req->sg_cnt, req->dir);
  985. return ret;
  986. }
  987. inv_wr = (struct ib_send_wr) {
  988. .opcode = IB_WR_LOCAL_INV,
  989. .wr_cqe = &req->inv_cqe,
  990. .send_flags = IB_SEND_SIGNALED,
  991. .ex.invalidate_rkey = req->mr->rkey,
  992. };
  993. req->inv_cqe.done = rtrs_clt_inv_rkey_done;
  994. rwr = (struct ib_reg_wr) {
  995. .wr.opcode = IB_WR_REG_MR,
  996. .wr.wr_cqe = &fast_reg_cqe,
  997. .mr = req->mr,
  998. .key = req->mr->rkey,
  999. .access = (IB_ACCESS_LOCAL_WRITE),
  1000. };
  1001. wr = &rwr.wr;
  1002. fr_en = true;
  1003. refcount_inc(&req->ref);
  1004. }
  1005. /*
  1006. * Update stats now, after request is successfully sent it is not
  1007. * safe anymore to touch it.
  1008. */
  1009. rtrs_clt_update_all_stats(req, WRITE);
  1010. ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count,
  1011. req->usr_len + sizeof(*msg),
  1012. imm, wr, &inv_wr);
  1013. if (ret) {
  1014. rtrs_err_rl(s,
  1015. "Write request failed: error=%d path=%s [%s:%u]\n",
  1016. ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
  1017. clt_path->hca_port);
  1018. if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
  1019. atomic_dec(&clt_path->stats->inflight);
  1020. if (req->sg_cnt)
  1021. ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
  1022. req->sg_cnt, req->dir);
  1023. }
  1024. return ret;
  1025. }
  1026. static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
  1027. {
  1028. struct rtrs_clt_con *con = req->con;
  1029. struct rtrs_path *s = con->c.path;
  1030. struct rtrs_clt_path *clt_path = to_clt_path(s);
  1031. struct rtrs_msg_rdma_read *msg;
  1032. struct rtrs_ib_dev *dev = clt_path->s.dev;
  1033. struct ib_reg_wr rwr;
  1034. struct ib_send_wr *wr = NULL;
  1035. int ret, count = 0;
  1036. u32 imm, buf_id;
  1037. const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
  1038. if (tsize > clt_path->chunk_size) {
  1039. rtrs_wrn(s,
  1040. "Read request failed, message size is %zu, bigger than CHUNK_SIZE %d\n",
  1041. tsize, clt_path->chunk_size);
  1042. return -EMSGSIZE;
  1043. }
  1044. if (req->sg_cnt) {
  1045. count = ib_dma_map_sg(dev->ib_dev, req->sglist, req->sg_cnt,
  1046. req->dir);
  1047. if (!count) {
  1048. rtrs_wrn(s,
  1049. "Read request failed, dma map failed\n");
  1050. return -EINVAL;
  1051. }
  1052. }
  1053. /* put our message into req->iu->buf after the user message */
  1054. msg = req->iu->buf + req->usr_len;
  1055. msg->type = cpu_to_le16(RTRS_MSG_READ);
  1056. msg->usr_len = cpu_to_le16(req->usr_len);
  1057. if (count) {
  1058. ret = rtrs_map_sg_fr(req, count);
  1059. if (ret < 0) {
  1060. rtrs_err_rl(s,
  1061. "Read request failed, failed to map fast reg. data, err: %d\n",
  1062. ret);
  1063. ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt,
  1064. req->dir);
  1065. return ret;
  1066. }
  1067. rwr = (struct ib_reg_wr) {
  1068. .wr.opcode = IB_WR_REG_MR,
  1069. .wr.wr_cqe = &fast_reg_cqe,
  1070. .mr = req->mr,
  1071. .key = req->mr->rkey,
  1072. .access = (IB_ACCESS_LOCAL_WRITE |
  1073. IB_ACCESS_REMOTE_WRITE),
  1074. };
  1075. wr = &rwr.wr;
  1076. msg->sg_cnt = cpu_to_le16(1);
  1077. msg->flags = cpu_to_le16(RTRS_MSG_NEED_INVAL_F);
  1078. msg->desc[0].addr = cpu_to_le64(req->mr->iova);
  1079. msg->desc[0].key = cpu_to_le32(req->mr->rkey);
  1080. msg->desc[0].len = cpu_to_le32(req->mr->length);
  1081. /* Further invalidation is required */
  1082. req->need_inv = !!RTRS_MSG_NEED_INVAL_F;
  1083. } else {
  1084. msg->sg_cnt = 0;
  1085. msg->flags = 0;
  1086. }
  1087. /*
  1088. * rtrs message will be after the space reserved for disk data and
  1089. * user message
  1090. */
  1091. imm = req->permit->mem_off + req->data_len + req->usr_len;
  1092. imm = rtrs_to_io_req_imm(imm);
  1093. buf_id = req->permit->mem_id;
  1094. req->sg_size = sizeof(*msg);
  1095. req->sg_size += le16_to_cpu(msg->sg_cnt) * sizeof(struct rtrs_sg_desc);
  1096. req->sg_size += req->usr_len;
  1097. /*
  1098. * Update stats now, after request is successfully sent it is not
  1099. * safe anymore to touch it.
  1100. */
  1101. rtrs_clt_update_all_stats(req, READ);
  1102. ret = rtrs_post_send_rdma(req->con, req, &clt_path->rbufs[buf_id],
  1103. req->data_len, imm, wr);
  1104. if (ret) {
  1105. rtrs_err_rl(s,
  1106. "Read request failed: error=%d path=%s [%s:%u]\n",
  1107. ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
  1108. clt_path->hca_port);
  1109. if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
  1110. atomic_dec(&clt_path->stats->inflight);
  1111. req->need_inv = false;
  1112. if (req->sg_cnt)
  1113. ib_dma_unmap_sg(dev->ib_dev, req->sglist,
  1114. req->sg_cnt, req->dir);
  1115. }
  1116. return ret;
  1117. }
  1118. /**
  1119. * rtrs_clt_failover_req() - Try to find an active path for a failed request
  1120. * @clt: clt context
  1121. * @fail_req: a failed io request.
  1122. */
  1123. static int rtrs_clt_failover_req(struct rtrs_clt_sess *clt,
  1124. struct rtrs_clt_io_req *fail_req)
  1125. {
  1126. struct rtrs_clt_path *alive_path;
  1127. struct rtrs_clt_io_req *req;
  1128. int err = -ECONNABORTED;
  1129. struct path_it it;
  1130. rcu_read_lock();
  1131. for (path_it_init(&it, clt);
  1132. (alive_path = it.next_path(&it)) && it.i < it.clt->paths_num;
  1133. it.i++) {
  1134. if (READ_ONCE(alive_path->state) != RTRS_CLT_CONNECTED)
  1135. continue;
  1136. req = rtrs_clt_get_copy_req(alive_path, fail_req);
  1137. if (req->dir == DMA_TO_DEVICE)
  1138. err = rtrs_clt_write_req(req);
  1139. else
  1140. err = rtrs_clt_read_req(req);
  1141. if (err) {
  1142. req->in_use = false;
  1143. continue;
  1144. }
  1145. /* Success path */
  1146. rtrs_clt_inc_failover_cnt(alive_path->stats);
  1147. break;
  1148. }
  1149. path_it_deinit(&it);
  1150. rcu_read_unlock();
  1151. return err;
  1152. }
  1153. static void fail_all_outstanding_reqs(struct rtrs_clt_path *clt_path)
  1154. {
  1155. struct rtrs_clt_sess *clt = clt_path->clt;
  1156. struct rtrs_clt_io_req *req;
  1157. int i, err;
  1158. if (!clt_path->reqs)
  1159. return;
  1160. for (i = 0; i < clt_path->queue_depth; ++i) {
  1161. req = &clt_path->reqs[i];
  1162. if (!req->in_use)
  1163. continue;
  1164. /*
  1165. * Safely (without notification) complete the failed request.
  1166. * After completion this request is still usable and can
  1167. * be failed over to another path.
  1168. */
  1169. complete_rdma_req(req, -ECONNABORTED, false, true);
  1170. err = rtrs_clt_failover_req(clt, req);
  1171. if (err)
  1172. /* Failover failed, notify anyway */
  1173. req->conf(req->priv, err);
  1174. }
  1175. }
  1176. static void free_path_reqs(struct rtrs_clt_path *clt_path)
  1177. {
  1178. struct rtrs_clt_io_req *req;
  1179. int i;
  1180. if (!clt_path->reqs)
  1181. return;
  1182. for (i = 0; i < clt_path->queue_depth; ++i) {
  1183. req = &clt_path->reqs[i];
  1184. if (req->mr)
  1185. ib_dereg_mr(req->mr);
  1186. kfree(req->sge);
  1187. rtrs_iu_free(req->iu, clt_path->s.dev->ib_dev, 1);
  1188. }
  1189. kfree(clt_path->reqs);
  1190. clt_path->reqs = NULL;
  1191. }
  1192. static int alloc_path_reqs(struct rtrs_clt_path *clt_path)
  1193. {
  1194. struct rtrs_clt_io_req *req;
  1195. int i, err = -ENOMEM;
  1196. clt_path->reqs = kcalloc(clt_path->queue_depth,
  1197. sizeof(*clt_path->reqs),
  1198. GFP_KERNEL);
  1199. if (!clt_path->reqs)
  1200. return -ENOMEM;
  1201. for (i = 0; i < clt_path->queue_depth; ++i) {
  1202. req = &clt_path->reqs[i];
  1203. req->iu = rtrs_iu_alloc(1, clt_path->max_hdr_size, GFP_KERNEL,
  1204. clt_path->s.dev->ib_dev,
  1205. DMA_TO_DEVICE,
  1206. rtrs_clt_rdma_done);
  1207. if (!req->iu)
  1208. goto out;
  1209. req->sge = kcalloc(2, sizeof(*req->sge), GFP_KERNEL);
  1210. if (!req->sge)
  1211. goto out;
  1212. req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd,
  1213. IB_MR_TYPE_MEM_REG,
  1214. clt_path->max_pages_per_mr);
  1215. if (IS_ERR(req->mr)) {
  1216. err = PTR_ERR(req->mr);
  1217. req->mr = NULL;
  1218. pr_err("Failed to alloc clt_path->max_pages_per_mr %d\n",
  1219. clt_path->max_pages_per_mr);
  1220. goto out;
  1221. }
  1222. init_completion(&req->inv_comp);
  1223. }
  1224. return 0;
  1225. out:
  1226. free_path_reqs(clt_path);
  1227. return err;
  1228. }
  1229. static int alloc_permits(struct rtrs_clt_sess *clt)
  1230. {
  1231. unsigned int chunk_bits;
  1232. int err, i;
  1233. clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL);
  1234. if (!clt->permits_map) {
  1235. err = -ENOMEM;
  1236. goto out_err;
  1237. }
  1238. clt->permits = kcalloc(clt->queue_depth, permit_size(clt), GFP_KERNEL);
  1239. if (!clt->permits) {
  1240. err = -ENOMEM;
  1241. goto err_map;
  1242. }
  1243. chunk_bits = ilog2(clt->queue_depth - 1) + 1;
  1244. for (i = 0; i < clt->queue_depth; i++) {
  1245. struct rtrs_permit *permit;
  1246. permit = get_permit(clt, i);
  1247. permit->mem_id = i;
  1248. permit->mem_off = i << (MAX_IMM_PAYL_BITS - chunk_bits);
  1249. }
  1250. return 0;
  1251. err_map:
  1252. bitmap_free(clt->permits_map);
  1253. clt->permits_map = NULL;
  1254. out_err:
  1255. return err;
  1256. }
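/*
 * Worked example for the mem_off computation above (the queue depth is
 * illustrative only; MAX_IMM_PAYL_BITS is defined in the rtrs headers):
 * with queue_depth = 512, chunk_bits = ilog2(511) + 1 = 9, so the top
 * 9 bits of an immediate value carry the chunk index (mem_id) and the
 * remaining low bits carry a byte offset inside that chunk.  Permit i
 * therefore gets mem_off = i << (MAX_IMM_PAYL_BITS - 9), and the IO
 * paths seen earlier (rtrs_clt_write_req()/rtrs_clt_read_req()) add
 * data_len + usr_len to it before packing the sum into the immediate:
 * imm = req->permit->mem_off + req->data_len + req->usr_len.
 */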
  1257. static void free_permits(struct rtrs_clt_sess *clt)
  1258. {
  1259. if (clt->permits_map)
  1260. wait_event(clt->permits_wait,
  1261. bitmap_empty(clt->permits_map, clt->queue_depth));
  1262. bitmap_free(clt->permits_map);
  1263. clt->permits_map = NULL;
  1264. kfree(clt->permits);
  1265. clt->permits = NULL;
  1266. }
  1267. static void query_fast_reg_mode(struct rtrs_clt_path *clt_path)
  1268. {
  1269. struct ib_device *ib_dev;
  1270. u64 max_pages_per_mr;
  1271. int mr_page_shift;
  1272. ib_dev = clt_path->s.dev->ib_dev;
  1273. /*
  1274. * Use the smallest page size supported by the HCA, down to a
  1275. * minimum of 4096 bytes. We're unlikely to build large sglists
  1276. * out of smaller entries.
  1277. */
  1278. mr_page_shift = max(12, ffs(ib_dev->attrs.page_size_cap) - 1);
  1279. max_pages_per_mr = ib_dev->attrs.max_mr_size;
  1280. do_div(max_pages_per_mr, (1ull << mr_page_shift));
  1281. clt_path->max_pages_per_mr =
  1282. min3(clt_path->max_pages_per_mr, (u32)max_pages_per_mr,
  1283. ib_dev->attrs.max_fast_reg_page_list_len);
  1284. clt_path->clt->max_segments =
  1285. min(clt_path->max_pages_per_mr, clt_path->clt->max_segments);
  1286. }
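/*
 * Illustrative numbers for query_fast_reg_mode() above, assuming a
 * hypothetical HCA whose page_size_cap includes 4 KiB pages, with
 * max_mr_size = 2^32 and max_fast_reg_page_list_len = 256:
 * mr_page_shift = max(12, 12) = 12, max_pages_per_mr = 2^32 >> 12 = 2^20,
 * so clt_path->max_pages_per_mr = min3(128, 2^20, 256) = 128, i.e. the
 * RTRS_MAX_SEGMENTS default assigned in alloc_path() further below
 * stays in effect.
 */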
static bool rtrs_clt_change_state_get_old(struct rtrs_clt_path *clt_path,
					  enum rtrs_clt_state new_state,
					  enum rtrs_clt_state *old_state)
{
	bool changed;

	spin_lock_irq(&clt_path->state_wq.lock);
	if (old_state)
		*old_state = clt_path->state;
	changed = rtrs_clt_change_state(clt_path, new_state);
	spin_unlock_irq(&clt_path->state_wq.lock);

	return changed;
}

static void rtrs_clt_hb_err_handler(struct rtrs_con *c)
{
	struct rtrs_clt_con *con = container_of(c, typeof(*con), c);

	rtrs_rdma_error_recovery(con);
}

static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
{
	rtrs_init_hb(&clt_path->s, &io_comp_cqe,
		     RTRS_HB_INTERVAL_MS,
		     RTRS_HB_MISSED_MAX,
		     rtrs_clt_hb_err_handler,
		     rtrs_wq);
}

static void rtrs_clt_reconnect_work(struct work_struct *work);
static void rtrs_clt_close_work(struct work_struct *work);

static void rtrs_clt_err_recovery_work(struct work_struct *work)
{
	struct rtrs_clt_path *clt_path;
	struct rtrs_clt_sess *clt;
	int delay_ms;

	clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
	clt = clt_path->clt;
	delay_ms = clt->reconnect_delay_sec * 1000;
	rtrs_clt_stop_and_destroy_conns(clt_path);
	queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
			   msecs_to_jiffies(delay_ms +
					    prandom_u32_max(RTRS_RECONNECT_SEED)));
}
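/*
 * Note on the delay above: the reconnect is postponed by
 * reconnect_delay_sec (converted to milliseconds) plus a random jitter of
 * up to RTRS_RECONNECT_SEED ms, so that several paths which failed at the
 * same moment do not all hit the server with simultaneous reconnect
 * attempts.
 */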
  1327. static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
  1328. const struct rtrs_addr *path,
  1329. size_t con_num, u32 nr_poll_queues)
  1330. {
  1331. struct rtrs_clt_path *clt_path;
  1332. int err = -ENOMEM;
  1333. int cpu;
  1334. size_t total_con;
  1335. clt_path = kzalloc(sizeof(*clt_path), GFP_KERNEL);
  1336. if (!clt_path)
  1337. goto err;
	/*
	 * con_num connections in IRQ mode, nr_poll_queues connections in
	 * polling mode, +1: extra connection for user messages.
	 */
	total_con = con_num + nr_poll_queues + 1;
  1343. clt_path->s.con = kcalloc(total_con, sizeof(*clt_path->s.con),
  1344. GFP_KERNEL);
  1345. if (!clt_path->s.con)
  1346. goto err_free_path;
  1347. clt_path->s.con_num = total_con;
  1348. clt_path->s.irq_con_num = con_num + 1;
  1349. clt_path->stats = kzalloc(sizeof(*clt_path->stats), GFP_KERNEL);
  1350. if (!clt_path->stats)
  1351. goto err_free_con;
  1352. mutex_init(&clt_path->init_mutex);
  1353. uuid_gen(&clt_path->s.uuid);
  1354. memcpy(&clt_path->s.dst_addr, path->dst,
  1355. rdma_addr_size((struct sockaddr *)path->dst));
  1356. /*
  1357. * rdma_resolve_addr() passes src_addr to cma_bind_addr, which
  1358. * checks the sa_family to be non-zero. If user passed src_addr=NULL
  1359. * the sess->src_addr will contain only zeros, which is then fine.
  1360. */
  1361. if (path->src)
  1362. memcpy(&clt_path->s.src_addr, path->src,
  1363. rdma_addr_size((struct sockaddr *)path->src));
  1364. strscpy(clt_path->s.sessname, clt->sessname,
  1365. sizeof(clt_path->s.sessname));
  1366. clt_path->clt = clt;
  1367. clt_path->max_pages_per_mr = RTRS_MAX_SEGMENTS;
  1368. init_waitqueue_head(&clt_path->state_wq);
  1369. clt_path->state = RTRS_CLT_CONNECTING;
  1370. atomic_set(&clt_path->connected_cnt, 0);
  1371. INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
  1372. INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
  1373. INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
  1374. rtrs_clt_init_hb(clt_path);
  1375. clt_path->mp_skip_entry = alloc_percpu(typeof(*clt_path->mp_skip_entry));
  1376. if (!clt_path->mp_skip_entry)
  1377. goto err_free_stats;
  1378. for_each_possible_cpu(cpu)
  1379. INIT_LIST_HEAD(per_cpu_ptr(clt_path->mp_skip_entry, cpu));
  1380. err = rtrs_clt_init_stats(clt_path->stats);
  1381. if (err)
  1382. goto err_free_percpu;
  1383. return clt_path;
  1384. err_free_percpu:
  1385. free_percpu(clt_path->mp_skip_entry);
  1386. err_free_stats:
  1387. kfree(clt_path->stats);
  1388. err_free_con:
  1389. kfree(clt_path->s.con);
  1390. err_free_path:
  1391. kfree(clt_path);
  1392. err:
  1393. return ERR_PTR(err);
  1394. }
  1395. void free_path(struct rtrs_clt_path *clt_path)
  1396. {
  1397. free_percpu(clt_path->mp_skip_entry);
  1398. mutex_destroy(&clt_path->init_mutex);
  1399. kfree(clt_path->s.con);
  1400. kfree(clt_path->rbufs);
  1401. kfree(clt_path);
  1402. }
static int create_con(struct rtrs_clt_path *clt_path, unsigned int cid)
{
	struct rtrs_clt_con *con;

	con = kzalloc(sizeof(*con), GFP_KERNEL);
	if (!con)
		return -ENOMEM;

	/* Map first two connections to the first CPU */
	con->cpu = (cid ? cid - 1 : 0) % nr_cpu_ids;
	con->c.cid = cid;
	con->c.path = &clt_path->s;
	/* Align with srv, init as 1 */
	atomic_set(&con->c.wr_cnt, 1);
	mutex_init(&con->con_mutex);

	clt_path->s.con[cid] = &con->c;

	return 0;
}
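/*
 * Note on the CPU mapping above: cid 0 (the user connection) and cid 1 both
 * land on CPU 0, while cid N maps to CPU (N - 1) % nr_cpu_ids. The chosen
 * con->cpu is later used by create_con_cq_qp() to pick the completion
 * vector (con->cpu % num_comp_vectors), spreading IO connections across the
 * device's completion vectors.
 */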
  1419. static void destroy_con(struct rtrs_clt_con *con)
  1420. {
  1421. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  1422. clt_path->s.con[con->c.cid] = NULL;
  1423. mutex_destroy(&con->con_mutex);
  1424. kfree(con);
  1425. }
  1426. static int create_con_cq_qp(struct rtrs_clt_con *con)
  1427. {
  1428. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  1429. u32 max_send_wr, max_recv_wr, cq_num, max_send_sge, wr_limit;
  1430. int err, cq_vector;
  1431. struct rtrs_msg_rkey_rsp *rsp;
  1432. lockdep_assert_held(&con->con_mutex);
  1433. if (con->c.cid == 0) {
  1434. max_send_sge = 1;
  1435. /* We must be the first here */
  1436. if (WARN_ON(clt_path->s.dev))
  1437. return -EINVAL;
  1438. /*
  1439. * The whole session uses device from user connection.
  1440. * Be careful not to close user connection before ib dev
  1441. * is gracefully put.
  1442. */
  1443. clt_path->s.dev = rtrs_ib_dev_find_or_add(con->c.cm_id->device,
  1444. &dev_pd);
  1445. if (!clt_path->s.dev) {
  1446. rtrs_wrn(clt_path->clt,
  1447. "rtrs_ib_dev_find_get_or_add(): no memory\n");
  1448. return -ENOMEM;
  1449. }
  1450. clt_path->s.dev_ref = 1;
  1451. query_fast_reg_mode(clt_path);
  1452. wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
		/*
		 * Two (request + registration) completions for send,
		 * two for recv if always_invalidate is set on the server
		 * or one for recv otherwise,
		 * + 2 for drain and heartbeat
		 * in case the qp gets into the error state.
		 */
  1460. max_send_wr =
  1461. min_t(int, wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
  1462. max_recv_wr = max_send_wr;
  1463. } else {
  1464. /*
  1465. * Here we assume that session members are correctly set.
  1466. * This is always true if user connection (cid == 0) is
  1467. * established first.
  1468. */
  1469. if (WARN_ON(!clt_path->s.dev))
  1470. return -EINVAL;
  1471. if (WARN_ON(!clt_path->queue_depth))
  1472. return -EINVAL;
  1473. wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
  1474. /* Shared between connections */
  1475. clt_path->s.dev_ref++;
  1476. max_send_wr = min_t(int, wr_limit,
  1477. /* QD * (REQ + RSP + FR REGS or INVS) + drain */
  1478. clt_path->queue_depth * 4 + 1);
  1479. max_recv_wr = min_t(int, wr_limit,
  1480. clt_path->queue_depth * 3 + 1);
  1481. max_send_sge = 2;
  1482. }
  1483. atomic_set(&con->c.sq_wr_avail, max_send_wr);
  1484. cq_num = max_send_wr + max_recv_wr;
  1485. /* alloc iu to recv new rkey reply when server reports flags set */
  1486. if (clt_path->flags & RTRS_MSG_NEW_RKEY_F || con->c.cid == 0) {
  1487. con->rsp_ius = rtrs_iu_alloc(cq_num, sizeof(*rsp),
  1488. GFP_KERNEL,
  1489. clt_path->s.dev->ib_dev,
  1490. DMA_FROM_DEVICE,
  1491. rtrs_clt_rdma_done);
  1492. if (!con->rsp_ius)
  1493. return -ENOMEM;
  1494. con->queue_num = cq_num;
  1495. }
  1496. cq_num = max_send_wr + max_recv_wr;
  1497. cq_vector = con->cpu % clt_path->s.dev->ib_dev->num_comp_vectors;
  1498. if (con->c.cid >= clt_path->s.irq_con_num)
  1499. err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge,
  1500. cq_vector, cq_num, max_send_wr,
  1501. max_recv_wr, IB_POLL_DIRECT);
  1502. else
  1503. err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge,
  1504. cq_vector, cq_num, max_send_wr,
  1505. max_recv_wr, IB_POLL_SOFTIRQ);
  1506. /*
  1507. * In case of error we do not bother to clean previous allocations,
  1508. * since destroy_con_cq_qp() must be called.
  1509. */
  1510. return err;
  1511. }
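/*
 * A brief recap of the WR budget above: for an IO connection the code
 * budgets four send WRs per queue slot (request, response, and fast-reg
 * registration or invalidation, per the in-line comment) plus one for
 * drain, and three receive WRs per slot plus one, both clamped to the
 * device's max_qp_wr. The user connection (cid 0) only needs
 * SERVICE_CON_QUEUE_DEPTH * 2 + 2 work requests.
 */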
  1512. static void destroy_con_cq_qp(struct rtrs_clt_con *con)
  1513. {
  1514. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  1515. /*
  1516. * Be careful here: destroy_con_cq_qp() can be called even
  1517. * create_con_cq_qp() failed, see comments there.
  1518. */
  1519. lockdep_assert_held(&con->con_mutex);
  1520. rtrs_cq_qp_destroy(&con->c);
  1521. if (con->rsp_ius) {
  1522. rtrs_iu_free(con->rsp_ius, clt_path->s.dev->ib_dev,
  1523. con->queue_num);
  1524. con->rsp_ius = NULL;
  1525. con->queue_num = 0;
  1526. }
  1527. if (clt_path->s.dev_ref && !--clt_path->s.dev_ref) {
  1528. rtrs_ib_dev_put(clt_path->s.dev);
  1529. clt_path->s.dev = NULL;
  1530. }
  1531. }
  1532. static void stop_cm(struct rtrs_clt_con *con)
  1533. {
  1534. rdma_disconnect(con->c.cm_id);
  1535. if (con->c.qp)
  1536. ib_drain_qp(con->c.qp);
  1537. }
  1538. static void destroy_cm(struct rtrs_clt_con *con)
  1539. {
  1540. rdma_destroy_id(con->c.cm_id);
  1541. con->c.cm_id = NULL;
  1542. }
  1543. static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con)
  1544. {
  1545. struct rtrs_path *s = con->c.path;
  1546. int err;
  1547. mutex_lock(&con->con_mutex);
  1548. err = create_con_cq_qp(con);
  1549. mutex_unlock(&con->con_mutex);
  1550. if (err) {
  1551. rtrs_err(s, "create_con_cq_qp(), err: %d\n", err);
  1552. return err;
  1553. }
  1554. err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS);
  1555. if (err)
  1556. rtrs_err(s, "Resolving route failed, err: %d\n", err);
  1557. return err;
  1558. }
  1559. static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con)
  1560. {
  1561. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  1562. struct rtrs_clt_sess *clt = clt_path->clt;
  1563. struct rtrs_msg_conn_req msg;
  1564. struct rdma_conn_param param;
  1565. int err;
  1566. param = (struct rdma_conn_param) {
  1567. .retry_count = 7,
  1568. .rnr_retry_count = 7,
  1569. .private_data = &msg,
  1570. .private_data_len = sizeof(msg),
  1571. };
  1572. msg = (struct rtrs_msg_conn_req) {
  1573. .magic = cpu_to_le16(RTRS_MAGIC),
  1574. .version = cpu_to_le16(RTRS_PROTO_VER),
  1575. .cid = cpu_to_le16(con->c.cid),
  1576. .cid_num = cpu_to_le16(clt_path->s.con_num),
  1577. .recon_cnt = cpu_to_le16(clt_path->s.recon_cnt),
  1578. };
  1579. msg.first_conn = clt_path->for_new_clt ? FIRST_CONN : 0;
  1580. uuid_copy(&msg.sess_uuid, &clt_path->s.uuid);
  1581. uuid_copy(&msg.paths_uuid, &clt->paths_uuid);
  1582. err = rdma_connect_locked(con->c.cm_id, &param);
  1583. if (err)
  1584. rtrs_err(clt, "rdma_connect_locked(): %d\n", err);
  1585. return err;
  1586. }
  1587. static int rtrs_rdma_conn_established(struct rtrs_clt_con *con,
  1588. struct rdma_cm_event *ev)
  1589. {
  1590. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  1591. struct rtrs_clt_sess *clt = clt_path->clt;
  1592. const struct rtrs_msg_conn_rsp *msg;
  1593. u16 version, queue_depth;
  1594. int errno;
  1595. u8 len;
  1596. msg = ev->param.conn.private_data;
  1597. len = ev->param.conn.private_data_len;
  1598. if (len < sizeof(*msg)) {
  1599. rtrs_err(clt, "Invalid RTRS connection response\n");
  1600. return -ECONNRESET;
  1601. }
  1602. if (le16_to_cpu(msg->magic) != RTRS_MAGIC) {
  1603. rtrs_err(clt, "Invalid RTRS magic\n");
  1604. return -ECONNRESET;
  1605. }
  1606. version = le16_to_cpu(msg->version);
  1607. if (version >> 8 != RTRS_PROTO_VER_MAJOR) {
  1608. rtrs_err(clt, "Unsupported major RTRS version: %d, expected %d\n",
  1609. version >> 8, RTRS_PROTO_VER_MAJOR);
  1610. return -ECONNRESET;
  1611. }
  1612. errno = le16_to_cpu(msg->errno);
  1613. if (errno) {
  1614. rtrs_err(clt, "Invalid RTRS message: errno %d\n",
  1615. errno);
  1616. return -ECONNRESET;
  1617. }
  1618. if (con->c.cid == 0) {
  1619. queue_depth = le16_to_cpu(msg->queue_depth);
  1620. if (clt_path->queue_depth > 0 && queue_depth != clt_path->queue_depth) {
  1621. rtrs_err(clt, "Error: queue depth changed\n");
  1622. /*
  1623. * Stop any more reconnection attempts
  1624. */
  1625. clt_path->reconnect_attempts = -1;
  1626. rtrs_err(clt,
  1627. "Disabling auto-reconnect. Trigger a manual reconnect after issue is resolved\n");
  1628. return -ECONNRESET;
  1629. }
  1630. if (!clt_path->rbufs) {
  1631. clt_path->rbufs = kcalloc(queue_depth,
  1632. sizeof(*clt_path->rbufs),
  1633. GFP_KERNEL);
  1634. if (!clt_path->rbufs)
  1635. return -ENOMEM;
  1636. }
  1637. clt_path->queue_depth = queue_depth;
  1638. clt_path->s.signal_interval = min_not_zero(queue_depth,
  1639. (unsigned short) SERVICE_CON_QUEUE_DEPTH);
  1640. clt_path->max_hdr_size = le32_to_cpu(msg->max_hdr_size);
  1641. clt_path->max_io_size = le32_to_cpu(msg->max_io_size);
  1642. clt_path->flags = le32_to_cpu(msg->flags);
  1643. clt_path->chunk_size = clt_path->max_io_size + clt_path->max_hdr_size;
		/*
		 * Global IO size is always a minimum.
		 * If during a reconnection the server sends us a slightly
		 * higher value, the client does not care and keeps using the
		 * cached minimum.
		 *
		 * Since several sessions (paths) can be re-establishing
		 * connections in parallel, take the lock.
		 */
  1652. mutex_lock(&clt->paths_mutex);
  1653. clt->queue_depth = clt_path->queue_depth;
  1654. clt->max_io_size = min_not_zero(clt_path->max_io_size,
  1655. clt->max_io_size);
  1656. mutex_unlock(&clt->paths_mutex);
  1657. /*
  1658. * Cache the hca_port and hca_name for sysfs
  1659. */
  1660. clt_path->hca_port = con->c.cm_id->port_num;
  1661. scnprintf(clt_path->hca_name, sizeof(clt_path->hca_name),
  1662. clt_path->s.dev->ib_dev->name);
  1663. clt_path->s.src_addr = con->c.cm_id->route.addr.src_addr;
  1664. /* set for_new_clt, to allow future reconnect on any path */
  1665. clt_path->for_new_clt = 1;
  1666. }
  1667. return 0;
  1668. }
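/*
 * To recap the handshake checks above: the private_data carried by the
 * ESTABLISHED event must contain a response at least sizeof(*msg) bytes
 * long, with the RTRS magic, a matching major protocol version (the top
 * byte of 'version', e.g. 0x0105 -> major 1) and a zero errno. Only the
 * user connection (cid 0) captures the session geometry: queue_depth,
 * max_io_size, max_hdr_size and therefore
 * chunk_size = max_io_size + max_hdr_size.
 */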
  1669. static inline void flag_success_on_conn(struct rtrs_clt_con *con)
  1670. {
  1671. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  1672. atomic_inc(&clt_path->connected_cnt);
  1673. con->cm_err = 1;
  1674. }
  1675. static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con,
  1676. struct rdma_cm_event *ev)
  1677. {
  1678. struct rtrs_path *s = con->c.path;
  1679. const struct rtrs_msg_conn_rsp *msg;
  1680. const char *rej_msg;
  1681. int status, errno;
  1682. u8 data_len;
  1683. status = ev->status;
  1684. rej_msg = rdma_reject_msg(con->c.cm_id, status);
  1685. msg = rdma_consumer_reject_data(con->c.cm_id, ev, &data_len);
  1686. if (msg && data_len >= sizeof(*msg)) {
  1687. errno = (int16_t)le16_to_cpu(msg->errno);
		if (errno == -EBUSY)
			rtrs_err(s,
				 "Previous session still exists on the server, please reconnect later\n");
  1691. else
  1692. rtrs_err(s,
  1693. "Connect rejected: status %d (%s), rtrs errno %d\n",
  1694. status, rej_msg, errno);
  1695. } else {
  1696. rtrs_err(s,
  1697. "Connect rejected but with malformed message: status %d (%s)\n",
  1698. status, rej_msg);
  1699. }
  1700. return -ECONNRESET;
  1701. }
  1702. void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait)
  1703. {
  1704. trace_rtrs_clt_close_conns(clt_path);
  1705. if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSING, NULL))
  1706. queue_work(rtrs_wq, &clt_path->close_work);
  1707. if (wait)
  1708. flush_work(&clt_path->close_work);
  1709. }
  1710. static inline void flag_error_on_conn(struct rtrs_clt_con *con, int cm_err)
  1711. {
  1712. if (con->cm_err == 1) {
  1713. struct rtrs_clt_path *clt_path;
  1714. clt_path = to_clt_path(con->c.path);
  1715. if (atomic_dec_and_test(&clt_path->connected_cnt))
  1716. wake_up(&clt_path->state_wq);
  1717. }
  1718. con->cm_err = cm_err;
  1719. }
  1720. static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
  1721. struct rdma_cm_event *ev)
  1722. {
  1723. struct rtrs_clt_con *con = cm_id->context;
  1724. struct rtrs_path *s = con->c.path;
  1725. struct rtrs_clt_path *clt_path = to_clt_path(s);
  1726. int cm_err = 0;
  1727. switch (ev->event) {
  1728. case RDMA_CM_EVENT_ADDR_RESOLVED:
  1729. cm_err = rtrs_rdma_addr_resolved(con);
  1730. break;
  1731. case RDMA_CM_EVENT_ROUTE_RESOLVED:
  1732. cm_err = rtrs_rdma_route_resolved(con);
  1733. break;
  1734. case RDMA_CM_EVENT_ESTABLISHED:
  1735. cm_err = rtrs_rdma_conn_established(con, ev);
  1736. if (!cm_err) {
  1737. /*
  1738. * Report success and wake up. Here we abuse state_wq,
  1739. * i.e. wake up without state change, but we set cm_err.
  1740. */
  1741. flag_success_on_conn(con);
  1742. wake_up(&clt_path->state_wq);
  1743. return 0;
  1744. }
  1745. break;
  1746. case RDMA_CM_EVENT_REJECTED:
  1747. cm_err = rtrs_rdma_conn_rejected(con, ev);
  1748. break;
  1749. case RDMA_CM_EVENT_DISCONNECTED:
  1750. /* No message for disconnecting */
  1751. cm_err = -ECONNRESET;
  1752. break;
  1753. case RDMA_CM_EVENT_CONNECT_ERROR:
  1754. case RDMA_CM_EVENT_UNREACHABLE:
  1755. case RDMA_CM_EVENT_ADDR_CHANGE:
  1756. case RDMA_CM_EVENT_TIMEWAIT_EXIT:
  1757. rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
  1758. rdma_event_msg(ev->event), ev->status);
  1759. cm_err = -ECONNRESET;
  1760. break;
  1761. case RDMA_CM_EVENT_ADDR_ERROR:
  1762. case RDMA_CM_EVENT_ROUTE_ERROR:
  1763. rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
  1764. rdma_event_msg(ev->event), ev->status);
  1765. cm_err = -EHOSTUNREACH;
  1766. break;
  1767. case RDMA_CM_EVENT_DEVICE_REMOVAL:
  1768. /*
  1769. * Device removal is a special case. Queue close and return 0.
  1770. */
  1771. rtrs_clt_close_conns(clt_path, false);
  1772. return 0;
  1773. default:
  1774. rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n",
  1775. rdma_event_msg(ev->event), ev->status);
  1776. cm_err = -ECONNRESET;
  1777. break;
  1778. }
  1779. if (cm_err) {
		/*
		 * A CM error makes sense only while the connection is being
		 * established; in other cases we rely on the normal
		 * reconnect procedure.
		 */
  1784. flag_error_on_conn(con, cm_err);
  1785. rtrs_rdma_error_recovery(con);
  1786. }
  1787. return 0;
  1788. }
  1789. /* The caller should do the cleanup in case of error */
  1790. static int create_cm(struct rtrs_clt_con *con)
  1791. {
  1792. struct rtrs_path *s = con->c.path;
  1793. struct rtrs_clt_path *clt_path = to_clt_path(s);
  1794. struct rdma_cm_id *cm_id;
  1795. int err;
  1796. cm_id = rdma_create_id(&init_net, rtrs_clt_rdma_cm_handler, con,
  1797. clt_path->s.dst_addr.ss_family == AF_IB ?
  1798. RDMA_PS_IB : RDMA_PS_TCP, IB_QPT_RC);
  1799. if (IS_ERR(cm_id)) {
  1800. err = PTR_ERR(cm_id);
  1801. rtrs_err(s, "Failed to create CM ID, err: %d\n", err);
  1802. return err;
  1803. }
  1804. con->c.cm_id = cm_id;
  1805. con->cm_err = 0;
  1806. /* allow the port to be reused */
  1807. err = rdma_set_reuseaddr(cm_id, 1);
  1808. if (err != 0) {
  1809. rtrs_err(s, "Set address reuse failed, err: %d\n", err);
  1810. return err;
  1811. }
  1812. err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr,
  1813. (struct sockaddr *)&clt_path->s.dst_addr,
  1814. RTRS_CONNECT_TIMEOUT_MS);
  1815. if (err) {
  1816. rtrs_err(s, "Failed to resolve address, err: %d\n", err);
  1817. return err;
  1818. }
	/*
	 * Combine connection status and session events. This is needed
	 * to wait for two possible cases: cm_err has something meaningful
	 * or the session state was really changed to error by device removal.
	 */
  1824. err = wait_event_interruptible_timeout(
  1825. clt_path->state_wq,
  1826. con->cm_err || clt_path->state != RTRS_CLT_CONNECTING,
  1827. msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
  1828. if (err == 0 || err == -ERESTARTSYS) {
  1829. if (err == 0)
  1830. err = -ETIMEDOUT;
  1831. /* Timedout or interrupted */
  1832. return err;
  1833. }
  1834. if (con->cm_err < 0)
  1835. return con->cm_err;
  1836. if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING)
  1837. /* Device removal */
  1838. return -ECONNABORTED;
  1839. return 0;
  1840. }
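/*
 * Note on the wait above: wait_event_interruptible_timeout() returns 0 on
 * timeout (mapped to -ETIMEDOUT here) and -ERESTARTSYS when interrupted;
 * any other return means the waker either recorded a CM verdict in
 * con->cm_err or moved the path out of RTRS_CLT_CONNECTING (device
 * removal), which is why both conditions are re-checked after the wait.
 */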
  1841. static void rtrs_clt_path_up(struct rtrs_clt_path *clt_path)
  1842. {
  1843. struct rtrs_clt_sess *clt = clt_path->clt;
  1844. int up;
  1845. /*
  1846. * We can fire RECONNECTED event only when all paths were
  1847. * connected on rtrs_clt_open(), then each was disconnected
  1848. * and the first one connected again. That's why this nasty
  1849. * game with counter value.
  1850. */
  1851. mutex_lock(&clt->paths_ev_mutex);
  1852. up = ++clt->paths_up;
  1853. /*
  1854. * Here it is safe to access paths num directly since up counter
  1855. * is greater than MAX_PATHS_NUM only while rtrs_clt_open() is
  1856. * in progress, thus paths removals are impossible.
  1857. */
  1858. if (up > MAX_PATHS_NUM && up == MAX_PATHS_NUM + clt->paths_num)
  1859. clt->paths_up = clt->paths_num;
  1860. else if (up == 1)
  1861. clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_RECONNECTED);
  1862. mutex_unlock(&clt->paths_ev_mutex);
  1863. /* Mark session as established */
  1864. clt_path->established = true;
  1865. clt_path->reconnect_attempts = 0;
  1866. clt_path->stats->reconnects.successful_cnt++;
  1867. }
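/*
 * For illustration of the counter game above: alloc_clt() starts paths_up
 * at MAX_PATHS_NUM. With, say, two initial paths the counter goes to
 * MAX_PATHS_NUM + 1, then MAX_PATHS_NUM + 2 == MAX_PATHS_NUM + paths_num,
 * at which point it is clamped down to paths_num. From then on
 * RTRS_CLT_LINK_EV_RECONNECTED is only delivered when the counter climbs
 * from 0 back to 1, i.e. after a full disconnect of all paths.
 */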
  1868. static void rtrs_clt_path_down(struct rtrs_clt_path *clt_path)
  1869. {
  1870. struct rtrs_clt_sess *clt = clt_path->clt;
  1871. if (!clt_path->established)
  1872. return;
  1873. clt_path->established = false;
  1874. mutex_lock(&clt->paths_ev_mutex);
  1875. WARN_ON(!clt->paths_up);
  1876. if (--clt->paths_up == 0)
  1877. clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_DISCONNECTED);
  1878. mutex_unlock(&clt->paths_ev_mutex);
  1879. }
  1880. static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path)
  1881. {
  1882. struct rtrs_clt_con *con;
  1883. unsigned int cid;
  1884. WARN_ON(READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTED);
  1885. /*
  1886. * Possible race with rtrs_clt_open(), when DEVICE_REMOVAL comes
  1887. * exactly in between. Start destroying after it finishes.
  1888. */
  1889. mutex_lock(&clt_path->init_mutex);
  1890. mutex_unlock(&clt_path->init_mutex);
  1891. /*
  1892. * All IO paths must observe !CONNECTED state before we
  1893. * free everything.
  1894. */
  1895. synchronize_rcu();
  1896. rtrs_stop_hb(&clt_path->s);
	/*
	 * The order is utterly crucial: first disconnect and complete all
	 * rdma requests with error (thus setting in_use=false for requests),
	 * then fail outstanding requests checking in_use for each, and
	 * eventually notify the upper layer about session disconnection.
	 */
  1903. for (cid = 0; cid < clt_path->s.con_num; cid++) {
  1904. if (!clt_path->s.con[cid])
  1905. break;
  1906. con = to_clt_con(clt_path->s.con[cid]);
  1907. stop_cm(con);
  1908. }
  1909. fail_all_outstanding_reqs(clt_path);
  1910. free_path_reqs(clt_path);
  1911. rtrs_clt_path_down(clt_path);
	/*
	 * Wait for graceful shutdown, namely when peer side invokes
	 * rdma_disconnect(). 'connected_cnt' is decremented only on
	 * CM events, thus if the other side has crashed and hb has detected
	 * that something is wrong, here we will be stuck for exactly the
	 * timeout, since CM does not fire anything. That is fine, we are
	 * not in a hurry.
	 */
  1920. wait_event_timeout(clt_path->state_wq,
  1921. !atomic_read(&clt_path->connected_cnt),
  1922. msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
  1923. for (cid = 0; cid < clt_path->s.con_num; cid++) {
  1924. if (!clt_path->s.con[cid])
  1925. break;
  1926. con = to_clt_con(clt_path->s.con[cid]);
  1927. mutex_lock(&con->con_mutex);
  1928. destroy_con_cq_qp(con);
  1929. mutex_unlock(&con->con_mutex);
  1930. destroy_cm(con);
  1931. destroy_con(con);
  1932. }
  1933. }
  1934. static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_path *clt_path)
  1935. {
  1936. struct rtrs_clt_sess *clt = clt_path->clt;
  1937. struct rtrs_clt_path *next;
  1938. bool wait_for_grace = false;
  1939. int cpu;
  1940. mutex_lock(&clt->paths_mutex);
  1941. list_del_rcu(&clt_path->s.entry);
  1942. /* Make sure everybody observes path removal. */
  1943. synchronize_rcu();
  1944. /*
  1945. * At this point nobody sees @sess in the list, but still we have
  1946. * dangling pointer @pcpu_path which _can_ point to @sess. Since
  1947. * nobody can observe @sess in the list, we guarantee that IO path
  1948. * will not assign @sess to @pcpu_path, i.e. @pcpu_path can be equal
  1949. * to @sess, but can never again become @sess.
  1950. */
	/*
	 * Decrement the paths number only after the grace period, because
	 * the caller of do_each_path() must first observe the list without
	 * the path and only then the decremented paths number.
	 *
	 * Otherwise there can be the following situation:
	 *    o Two paths exist and IO is coming.
	 *    o One path is removed:
	 *      CPU#0                          CPU#1
	 *      do_each_path():                rtrs_clt_remove_path_from_arr():
	 *          path = get_next_path()
	 *          ^^^                            list_del_rcu(path)
	 *          [!CONNECTED path]              clt->paths_num--
	 *                                              ^^^^^^^^^
	 *          load clt->paths_num from 2 to 1
	 *                    ^^^^^^^^^
	 *                    sees 1
	 *
	 *      path is observed as !CONNECTED, but the do_each_path() loop
	 *      ends, because the expression i < clt->paths_num is false.
	 */
  1972. clt->paths_num--;
  1973. /*
  1974. * Get @next connection from current @sess which is going to be
  1975. * removed. If @sess is the last element, then @next is NULL.
  1976. */
  1977. rcu_read_lock();
  1978. next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path);
  1979. rcu_read_unlock();
  1980. /*
  1981. * @pcpu paths can still point to the path which is going to be
  1982. * removed, so change the pointer manually.
  1983. */
  1984. for_each_possible_cpu(cpu) {
  1985. struct rtrs_clt_path __rcu **ppcpu_path;
  1986. ppcpu_path = per_cpu_ptr(clt->pcpu_path, cpu);
  1987. if (rcu_dereference_protected(*ppcpu_path,
  1988. lockdep_is_held(&clt->paths_mutex)) != clt_path)
  1989. /*
  1990. * synchronize_rcu() was called just after deleting
  1991. * entry from the list, thus IO code path cannot
  1992. * change pointer back to the pointer which is going
  1993. * to be removed, we are safe here.
  1994. */
  1995. continue;
  1996. /*
  1997. * We race with IO code path, which also changes pointer,
  1998. * thus we have to be careful not to overwrite it.
  1999. */
  2000. if (try_cmpxchg((struct rtrs_clt_path **)ppcpu_path, &clt_path,
  2001. next))
  2002. /*
  2003. * @ppcpu_path was successfully replaced with @next,
  2004. * that means that someone could also pick up the
  2005. * @sess and dereferencing it right now, so wait for
  2006. * a grace period is required.
  2007. */
  2008. wait_for_grace = true;
  2009. }
  2010. if (wait_for_grace)
  2011. synchronize_rcu();
  2012. mutex_unlock(&clt->paths_mutex);
  2013. }
  2014. static void rtrs_clt_add_path_to_arr(struct rtrs_clt_path *clt_path)
  2015. {
  2016. struct rtrs_clt_sess *clt = clt_path->clt;
  2017. mutex_lock(&clt->paths_mutex);
  2018. clt->paths_num++;
  2019. list_add_tail_rcu(&clt_path->s.entry, &clt->paths_list);
  2020. mutex_unlock(&clt->paths_mutex);
  2021. }
  2022. static void rtrs_clt_close_work(struct work_struct *work)
  2023. {
  2024. struct rtrs_clt_path *clt_path;
  2025. clt_path = container_of(work, struct rtrs_clt_path, close_work);
  2026. cancel_work_sync(&clt_path->err_recovery_work);
  2027. cancel_delayed_work_sync(&clt_path->reconnect_dwork);
  2028. rtrs_clt_stop_and_destroy_conns(clt_path);
  2029. rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
  2030. }
  2031. static int init_conns(struct rtrs_clt_path *clt_path)
  2032. {
  2033. unsigned int cid;
  2034. int err, i;
	/*
	 * On every new session connection increase the reconnect counter
	 * to avoid clashes with sessions that are not yet closed on the
	 * server side.
	 */
  2040. clt_path->s.recon_cnt++;
  2041. /* Establish all RDMA connections */
  2042. for (cid = 0; cid < clt_path->s.con_num; cid++) {
  2043. err = create_con(clt_path, cid);
  2044. if (err)
  2045. goto destroy;
  2046. err = create_cm(to_clt_con(clt_path->s.con[cid]));
  2047. if (err)
  2048. goto destroy;
  2049. }
  2050. err = alloc_path_reqs(clt_path);
  2051. if (err)
  2052. goto destroy;
  2053. return 0;
  2054. destroy:
  2055. /* Make sure we do the cleanup in the order they are created */
  2056. for (i = 0; i <= cid; i++) {
  2057. struct rtrs_clt_con *con;
  2058. if (!clt_path->s.con[i])
  2059. break;
  2060. con = to_clt_con(clt_path->s.con[i]);
  2061. if (con->c.cm_id) {
  2062. stop_cm(con);
  2063. mutex_lock(&con->con_mutex);
  2064. destroy_con_cq_qp(con);
  2065. mutex_unlock(&con->con_mutex);
  2066. destroy_cm(con);
  2067. }
  2068. destroy_con(con);
  2069. }
	/*
	 * If we've never taken the async path and got an error, say, while
	 * doing rdma_resolve_addr(), switch to the CONNECTING_ERR state
	 * manually to keep reconnecting.
	 */
  2075. rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING_ERR, NULL);
  2076. return err;
  2077. }
  2078. static void rtrs_clt_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
  2079. {
  2080. struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
  2081. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  2082. struct rtrs_iu *iu;
  2083. iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
  2084. rtrs_iu_free(iu, clt_path->s.dev->ib_dev, 1);
  2085. if (wc->status != IB_WC_SUCCESS) {
  2086. rtrs_err(clt_path->clt, "Path info request send failed: %s\n",
  2087. ib_wc_status_msg(wc->status));
  2088. rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING_ERR, NULL);
  2089. return;
  2090. }
  2091. rtrs_clt_update_wc_stats(con);
  2092. }
  2093. static int process_info_rsp(struct rtrs_clt_path *clt_path,
  2094. const struct rtrs_msg_info_rsp *msg)
  2095. {
  2096. unsigned int sg_cnt, total_len;
  2097. int i, sgi;
  2098. sg_cnt = le16_to_cpu(msg->sg_cnt);
  2099. if (!sg_cnt || (clt_path->queue_depth % sg_cnt)) {
  2100. rtrs_err(clt_path->clt,
  2101. "Incorrect sg_cnt %d, is not multiple\n",
  2102. sg_cnt);
  2103. return -EINVAL;
  2104. }
  2105. /*
  2106. * Check if IB immediate data size is enough to hold the mem_id and
  2107. * the offset inside the memory chunk.
  2108. */
  2109. if ((ilog2(sg_cnt - 1) + 1) + (ilog2(clt_path->chunk_size - 1) + 1) >
  2110. MAX_IMM_PAYL_BITS) {
  2111. rtrs_err(clt_path->clt,
  2112. "RDMA immediate size (%db) not enough to encode %d buffers of size %dB\n",
  2113. MAX_IMM_PAYL_BITS, sg_cnt, clt_path->chunk_size);
  2114. return -EINVAL;
  2115. }
  2116. total_len = 0;
  2117. for (sgi = 0, i = 0; sgi < sg_cnt && i < clt_path->queue_depth; sgi++) {
  2118. const struct rtrs_sg_desc *desc = &msg->desc[sgi];
  2119. u32 len, rkey;
  2120. u64 addr;
  2121. addr = le64_to_cpu(desc->addr);
  2122. rkey = le32_to_cpu(desc->key);
  2123. len = le32_to_cpu(desc->len);
  2124. total_len += len;
  2125. if (!len || (len % clt_path->chunk_size)) {
  2126. rtrs_err(clt_path->clt, "Incorrect [%d].len %d\n",
  2127. sgi,
  2128. len);
  2129. return -EINVAL;
  2130. }
  2131. for ( ; len && i < clt_path->queue_depth; i++) {
  2132. clt_path->rbufs[i].addr = addr;
  2133. clt_path->rbufs[i].rkey = rkey;
  2134. len -= clt_path->chunk_size;
  2135. addr += clt_path->chunk_size;
  2136. }
  2137. }
  2138. /* Sanity check */
  2139. if (sgi != sg_cnt || i != clt_path->queue_depth) {
  2140. rtrs_err(clt_path->clt,
  2141. "Incorrect sg vector, not fully mapped\n");
  2142. return -EINVAL;
  2143. }
  2144. if (total_len != clt_path->chunk_size * clt_path->queue_depth) {
  2145. rtrs_err(clt_path->clt, "Incorrect total_len %d\n", total_len);
  2146. return -EINVAL;
  2147. }
  2148. return 0;
  2149. }
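/*
 * For illustration of the immediate-size check above (hypothetical
 * numbers): with sg_cnt = 128 and chunk_size = 64 KiB the check needs
 * (ilog2(127) + 1) + (ilog2(65535) + 1) = 7 + 16 = 23 bits, which must not
 * exceed MAX_IMM_PAYL_BITS so that a chunk index and an offset inside the
 * chunk can both be packed into one RDMA immediate.
 */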
  2150. static void rtrs_clt_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
  2151. {
  2152. struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
  2153. struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
  2154. struct rtrs_msg_info_rsp *msg;
  2155. enum rtrs_clt_state state;
  2156. struct rtrs_iu *iu;
  2157. size_t rx_sz;
  2158. int err;
  2159. state = RTRS_CLT_CONNECTING_ERR;
  2160. WARN_ON(con->c.cid);
  2161. iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
  2162. if (wc->status != IB_WC_SUCCESS) {
  2163. rtrs_err(clt_path->clt, "Path info response recv failed: %s\n",
  2164. ib_wc_status_msg(wc->status));
  2165. goto out;
  2166. }
  2167. WARN_ON(wc->opcode != IB_WC_RECV);
  2168. if (wc->byte_len < sizeof(*msg)) {
  2169. rtrs_err(clt_path->clt, "Path info response is malformed: size %d\n",
  2170. wc->byte_len);
  2171. goto out;
  2172. }
  2173. ib_dma_sync_single_for_cpu(clt_path->s.dev->ib_dev, iu->dma_addr,
  2174. iu->size, DMA_FROM_DEVICE);
  2175. msg = iu->buf;
  2176. if (le16_to_cpu(msg->type) != RTRS_MSG_INFO_RSP) {
  2177. rtrs_err(clt_path->clt, "Path info response is malformed: type %d\n",
  2178. le16_to_cpu(msg->type));
  2179. goto out;
  2180. }
  2181. rx_sz = sizeof(*msg);
  2182. rx_sz += sizeof(msg->desc[0]) * le16_to_cpu(msg->sg_cnt);
  2183. if (wc->byte_len < rx_sz) {
  2184. rtrs_err(clt_path->clt, "Path info response is malformed: size %d\n",
  2185. wc->byte_len);
  2186. goto out;
  2187. }
  2188. err = process_info_rsp(clt_path, msg);
  2189. if (err)
  2190. goto out;
  2191. err = post_recv_path(clt_path);
  2192. if (err)
  2193. goto out;
  2194. state = RTRS_CLT_CONNECTED;
  2195. out:
  2196. rtrs_clt_update_wc_stats(con);
  2197. rtrs_iu_free(iu, clt_path->s.dev->ib_dev, 1);
  2198. rtrs_clt_change_state_get_old(clt_path, state, NULL);
  2199. }
  2200. static int rtrs_send_path_info(struct rtrs_clt_path *clt_path)
  2201. {
  2202. struct rtrs_clt_con *usr_con = to_clt_con(clt_path->s.con[0]);
  2203. struct rtrs_msg_info_req *msg;
  2204. struct rtrs_iu *tx_iu, *rx_iu;
  2205. size_t rx_sz;
  2206. int err;
  2207. rx_sz = sizeof(struct rtrs_msg_info_rsp);
  2208. rx_sz += sizeof(struct rtrs_sg_desc) * clt_path->queue_depth;
  2209. tx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req), GFP_KERNEL,
  2210. clt_path->s.dev->ib_dev, DMA_TO_DEVICE,
  2211. rtrs_clt_info_req_done);
  2212. rx_iu = rtrs_iu_alloc(1, rx_sz, GFP_KERNEL, clt_path->s.dev->ib_dev,
  2213. DMA_FROM_DEVICE, rtrs_clt_info_rsp_done);
  2214. if (!tx_iu || !rx_iu) {
  2215. err = -ENOMEM;
  2216. goto out;
  2217. }
  2218. /* Prepare for getting info response */
  2219. err = rtrs_iu_post_recv(&usr_con->c, rx_iu);
  2220. if (err) {
  2221. rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err);
  2222. goto out;
  2223. }
  2224. rx_iu = NULL;
  2225. msg = tx_iu->buf;
  2226. msg->type = cpu_to_le16(RTRS_MSG_INFO_REQ);
  2227. memcpy(msg->pathname, clt_path->s.sessname, sizeof(msg->pathname));
  2228. ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
  2229. tx_iu->dma_addr,
  2230. tx_iu->size, DMA_TO_DEVICE);
  2231. /* Send info request */
  2232. err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL);
  2233. if (err) {
  2234. rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err);
  2235. goto out;
  2236. }
  2237. tx_iu = NULL;
  2238. /* Wait for state change */
  2239. wait_event_interruptible_timeout(clt_path->state_wq,
  2240. clt_path->state != RTRS_CLT_CONNECTING,
  2241. msecs_to_jiffies(
  2242. RTRS_CONNECT_TIMEOUT_MS));
  2243. if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED) {
  2244. if (READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTING_ERR)
  2245. err = -ECONNRESET;
  2246. else
  2247. err = -ETIMEDOUT;
  2248. }
  2249. out:
  2250. if (tx_iu)
  2251. rtrs_iu_free(tx_iu, clt_path->s.dev->ib_dev, 1);
  2252. if (rx_iu)
  2253. rtrs_iu_free(rx_iu, clt_path->s.dev->ib_dev, 1);
  2254. if (err)
  2255. /* If we've never taken async path because of malloc problems */
  2256. rtrs_clt_change_state_get_old(clt_path,
  2257. RTRS_CLT_CONNECTING_ERR, NULL);
  2258. return err;
  2259. }
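/*
 * Note on the ordering above: the receive IU is posted before the info
 * request is sent so the response cannot be missed, and once posted (or
 * sent) the IUs are owned by the completion handlers, which is why
 * rx_iu/tx_iu are set to NULL and only freed here on the early-error
 * paths. The handshake then waits on state_wq until
 * rtrs_clt_info_rsp_done() moves the path to CONNECTED or CONNECTING_ERR,
 * or the timeout expires.
 */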
/**
 * init_path() - establishes all path connections and does the handshake
 * @clt_path: client path.
 *
 * In case of error the full close or reconnect procedure should be taken,
 * because the reconnect or close async work may already have been started.
 */
  2266. static int init_path(struct rtrs_clt_path *clt_path)
  2267. {
  2268. int err;
  2269. char str[NAME_MAX];
  2270. struct rtrs_addr path = {
  2271. .src = &clt_path->s.src_addr,
  2272. .dst = &clt_path->s.dst_addr,
  2273. };
  2274. rtrs_addr_to_str(&path, str, sizeof(str));
  2275. mutex_lock(&clt_path->init_mutex);
  2276. err = init_conns(clt_path);
  2277. if (err) {
  2278. rtrs_err(clt_path->clt,
  2279. "init_conns() failed: err=%d path=%s [%s:%u]\n", err,
  2280. str, clt_path->hca_name, clt_path->hca_port);
  2281. goto out;
  2282. }
  2283. err = rtrs_send_path_info(clt_path);
  2284. if (err) {
  2285. rtrs_err(clt_path->clt,
  2286. "rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n",
  2287. err, str, clt_path->hca_name, clt_path->hca_port);
  2288. goto out;
  2289. }
  2290. rtrs_clt_path_up(clt_path);
  2291. rtrs_start_hb(&clt_path->s);
  2292. out:
  2293. mutex_unlock(&clt_path->init_mutex);
  2294. return err;
  2295. }
  2296. static void rtrs_clt_reconnect_work(struct work_struct *work)
  2297. {
  2298. struct rtrs_clt_path *clt_path;
  2299. struct rtrs_clt_sess *clt;
  2300. int err;
  2301. clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
  2302. reconnect_dwork);
  2303. clt = clt_path->clt;
  2304. trace_rtrs_clt_reconnect_work(clt_path);
  2305. if (READ_ONCE(clt_path->state) != RTRS_CLT_RECONNECTING)
  2306. return;
  2307. if (clt_path->reconnect_attempts >= clt->max_reconnect_attempts) {
  2308. /* Close a path completely if max attempts is reached */
  2309. rtrs_clt_close_conns(clt_path, false);
  2310. return;
  2311. }
  2312. clt_path->reconnect_attempts++;
  2313. msleep(RTRS_RECONNECT_BACKOFF);
  2314. if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
  2315. err = init_path(clt_path);
  2316. if (err)
  2317. goto reconnect_again;
  2318. }
  2319. return;
  2320. reconnect_again:
  2321. if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
  2322. clt_path->stats->reconnects.fail_cnt++;
  2323. queue_work(rtrs_wq, &clt_path->err_recovery_work);
  2324. }
  2325. }
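/*
 * Note on the retry accounting above: reconnect_attempts is compared
 * against max_reconnect_attempts, which is stored as an unsigned int; the
 * documented value of -1 therefore becomes a huge limit and in practice
 * means "retry forever", while 0 closes the path after the first failure.
 * Each attempt also sleeps RTRS_RECONNECT_BACKOFF before trying to move
 * the path back to CONNECTING.
 */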
  2326. static void rtrs_clt_dev_release(struct device *dev)
  2327. {
  2328. struct rtrs_clt_sess *clt = container_of(dev, struct rtrs_clt_sess,
  2329. dev);
  2330. mutex_destroy(&clt->paths_ev_mutex);
  2331. mutex_destroy(&clt->paths_mutex);
  2332. kfree(clt);
  2333. }
  2334. static struct rtrs_clt_sess *alloc_clt(const char *sessname, size_t paths_num,
  2335. u16 port, size_t pdu_sz, void *priv,
  2336. void (*link_ev)(void *priv,
  2337. enum rtrs_clt_link_ev ev),
  2338. unsigned int reconnect_delay_sec,
  2339. unsigned int max_reconnect_attempts)
  2340. {
  2341. struct rtrs_clt_sess *clt;
  2342. int err;
  2343. if (!paths_num || paths_num > MAX_PATHS_NUM)
  2344. return ERR_PTR(-EINVAL);
  2345. if (strlen(sessname) >= sizeof(clt->sessname))
  2346. return ERR_PTR(-EINVAL);
  2347. clt = kzalloc(sizeof(*clt), GFP_KERNEL);
  2348. if (!clt)
  2349. return ERR_PTR(-ENOMEM);
  2350. clt->pcpu_path = alloc_percpu(typeof(*clt->pcpu_path));
  2351. if (!clt->pcpu_path) {
  2352. kfree(clt);
  2353. return ERR_PTR(-ENOMEM);
  2354. }
  2355. clt->dev.class = rtrs_clt_dev_class;
  2356. clt->dev.release = rtrs_clt_dev_release;
  2357. uuid_gen(&clt->paths_uuid);
  2358. INIT_LIST_HEAD_RCU(&clt->paths_list);
  2359. clt->paths_num = paths_num;
  2360. clt->paths_up = MAX_PATHS_NUM;
  2361. clt->port = port;
  2362. clt->pdu_sz = pdu_sz;
  2363. clt->max_segments = RTRS_MAX_SEGMENTS;
  2364. clt->reconnect_delay_sec = reconnect_delay_sec;
  2365. clt->max_reconnect_attempts = max_reconnect_attempts;
  2366. clt->priv = priv;
  2367. clt->link_ev = link_ev;
  2368. clt->mp_policy = MP_POLICY_MIN_INFLIGHT;
  2369. strscpy(clt->sessname, sessname, sizeof(clt->sessname));
  2370. init_waitqueue_head(&clt->permits_wait);
  2371. mutex_init(&clt->paths_ev_mutex);
  2372. mutex_init(&clt->paths_mutex);
  2373. device_initialize(&clt->dev);
  2374. err = dev_set_name(&clt->dev, "%s", sessname);
  2375. if (err)
  2376. goto err_put;
  2377. /*
  2378. * Suppress user space notification until
  2379. * sysfs files are created
  2380. */
  2381. dev_set_uevent_suppress(&clt->dev, true);
  2382. err = device_add(&clt->dev);
  2383. if (err)
  2384. goto err_put;
  2385. clt->kobj_paths = kobject_create_and_add("paths", &clt->dev.kobj);
  2386. if (!clt->kobj_paths) {
  2387. err = -ENOMEM;
  2388. goto err_del;
  2389. }
  2390. err = rtrs_clt_create_sysfs_root_files(clt);
  2391. if (err) {
  2392. kobject_del(clt->kobj_paths);
  2393. kobject_put(clt->kobj_paths);
  2394. goto err_del;
  2395. }
  2396. dev_set_uevent_suppress(&clt->dev, false);
  2397. kobject_uevent(&clt->dev.kobj, KOBJ_ADD);
  2398. return clt;
  2399. err_del:
  2400. device_del(&clt->dev);
  2401. err_put:
  2402. free_percpu(clt->pcpu_path);
  2403. put_device(&clt->dev);
  2404. return ERR_PTR(err);
  2405. }
  2406. static void free_clt(struct rtrs_clt_sess *clt)
  2407. {
  2408. free_percpu(clt->pcpu_path);
  2409. /*
  2410. * release callback will free clt and destroy mutexes in last put
  2411. */
  2412. device_unregister(&clt->dev);
  2413. }
  2414. /**
  2415. * rtrs_clt_open() - Open a path to an RTRS server
  2416. * @ops: holds the link event callback and the private pointer.
  2417. * @pathname: name of the path to an RTRS server
  2418. * @paths: Paths to be established defined by their src and dst addresses
  2419. * @paths_num: Number of elements in the @paths array
  2420. * @port: port to be used by the RTRS session
  2421. * @pdu_sz: Size of extra payload which can be accessed after permit allocation.
  2422. * @reconnect_delay_sec: time between reconnect tries
 * @max_reconnect_attempts: Number of times to reconnect on error before giving
 *                          up, 0 for disabled, -1 for forever
  2425. * @nr_poll_queues: number of polling mode connection using IB_POLL_DIRECT flag
  2426. *
  2427. * Starts session establishment with the rtrs_server. The function can block
  2428. * up to ~2000ms before it returns.
  2429. *
  2430. * Return a valid pointer on success otherwise PTR_ERR.
  2431. */
  2432. struct rtrs_clt_sess *rtrs_clt_open(struct rtrs_clt_ops *ops,
  2433. const char *pathname,
  2434. const struct rtrs_addr *paths,
  2435. size_t paths_num, u16 port,
  2436. size_t pdu_sz, u8 reconnect_delay_sec,
  2437. s16 max_reconnect_attempts, u32 nr_poll_queues)
  2438. {
  2439. struct rtrs_clt_path *clt_path, *tmp;
  2440. struct rtrs_clt_sess *clt;
  2441. int err, i;
  2442. if (strchr(pathname, '/') || strchr(pathname, '.')) {
  2443. pr_err("pathname cannot contain / and .\n");
  2444. err = -EINVAL;
  2445. goto out;
  2446. }
  2447. clt = alloc_clt(pathname, paths_num, port, pdu_sz, ops->priv,
  2448. ops->link_ev,
  2449. reconnect_delay_sec,
  2450. max_reconnect_attempts);
  2451. if (IS_ERR(clt)) {
  2452. err = PTR_ERR(clt);
  2453. goto out;
  2454. }
  2455. for (i = 0; i < paths_num; i++) {
  2456. struct rtrs_clt_path *clt_path;
  2457. clt_path = alloc_path(clt, &paths[i], nr_cpu_ids,
  2458. nr_poll_queues);
  2459. if (IS_ERR(clt_path)) {
  2460. err = PTR_ERR(clt_path);
  2461. goto close_all_path;
  2462. }
  2463. if (!i)
  2464. clt_path->for_new_clt = 1;
  2465. list_add_tail_rcu(&clt_path->s.entry, &clt->paths_list);
  2466. err = init_path(clt_path);
  2467. if (err) {
  2468. list_del_rcu(&clt_path->s.entry);
  2469. rtrs_clt_close_conns(clt_path, true);
  2470. free_percpu(clt_path->stats->pcpu_stats);
  2471. kfree(clt_path->stats);
  2472. free_path(clt_path);
  2473. goto close_all_path;
  2474. }
  2475. err = rtrs_clt_create_path_files(clt_path);
  2476. if (err) {
  2477. list_del_rcu(&clt_path->s.entry);
  2478. rtrs_clt_close_conns(clt_path, true);
  2479. free_percpu(clt_path->stats->pcpu_stats);
  2480. kfree(clt_path->stats);
  2481. free_path(clt_path);
  2482. goto close_all_path;
  2483. }
  2484. }
  2485. err = alloc_permits(clt);
  2486. if (err)
  2487. goto close_all_path;
  2488. return clt;
  2489. close_all_path:
  2490. list_for_each_entry_safe(clt_path, tmp, &clt->paths_list, s.entry) {
  2491. rtrs_clt_destroy_path_files(clt_path, NULL);
  2492. rtrs_clt_close_conns(clt_path, true);
  2493. kobject_put(&clt_path->kobj);
  2494. }
  2495. rtrs_clt_destroy_sysfs_root(clt);
  2496. free_clt(clt);
  2497. out:
  2498. return ERR_PTR(err);
  2499. }
  2500. EXPORT_SYMBOL(rtrs_clt_open);
  2501. /**
  2502. * rtrs_clt_close() - Close a path
  2503. * @clt: Session handle. Session is freed upon return.
  2504. */
  2505. void rtrs_clt_close(struct rtrs_clt_sess *clt)
  2506. {
  2507. struct rtrs_clt_path *clt_path, *tmp;
  2508. /* Firstly forbid sysfs access */
  2509. rtrs_clt_destroy_sysfs_root(clt);
  2510. /* Now it is safe to iterate over all paths without locks */
  2511. list_for_each_entry_safe(clt_path, tmp, &clt->paths_list, s.entry) {
  2512. rtrs_clt_close_conns(clt_path, true);
  2513. rtrs_clt_destroy_path_files(clt_path, NULL);
  2514. kobject_put(&clt_path->kobj);
  2515. }
  2516. free_permits(clt);
  2517. free_clt(clt);
  2518. }
  2519. EXPORT_SYMBOL(rtrs_clt_close);
  2520. int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
  2521. {
  2522. enum rtrs_clt_state old_state;
  2523. int err = -EBUSY;
  2524. bool changed;
  2525. changed = rtrs_clt_change_state_get_old(clt_path,
  2526. RTRS_CLT_RECONNECTING,
  2527. &old_state);
  2528. if (changed) {
  2529. clt_path->reconnect_attempts = 0;
  2530. rtrs_clt_stop_and_destroy_conns(clt_path);
  2531. queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
  2532. }
  2533. if (changed || old_state == RTRS_CLT_RECONNECTING) {
  2534. /*
  2535. * flush_delayed_work() queues pending work for immediate
  2536. * execution, so do the flush if we have queued something
  2537. * right now or work is pending.
  2538. */
  2539. flush_delayed_work(&clt_path->reconnect_dwork);
  2540. err = (READ_ONCE(clt_path->state) ==
  2541. RTRS_CLT_CONNECTED ? 0 : -ENOTCONN);
  2542. }
  2543. return err;
  2544. }
  2545. int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *clt_path,
  2546. const struct attribute *sysfs_self)
  2547. {
  2548. enum rtrs_clt_state old_state;
  2549. bool changed;
  2550. /*
  2551. * Continue stopping path till state was changed to DEAD or
  2552. * state was observed as DEAD:
  2553. * 1. State was changed to DEAD - we were fast and nobody
  2554. * invoked rtrs_clt_reconnect(), which can again start
  2555. * reconnecting.
  2556. * 2. State was observed as DEAD - we have someone in parallel
  2557. * removing the path.
  2558. */
  2559. do {
  2560. rtrs_clt_close_conns(clt_path, true);
  2561. changed = rtrs_clt_change_state_get_old(clt_path,
  2562. RTRS_CLT_DEAD,
  2563. &old_state);
  2564. } while (!changed && old_state != RTRS_CLT_DEAD);
  2565. if (changed) {
  2566. rtrs_clt_remove_path_from_arr(clt_path);
  2567. rtrs_clt_destroy_path_files(clt_path, sysfs_self);
  2568. kobject_put(&clt_path->kobj);
  2569. }
  2570. return 0;
  2571. }
  2572. void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value)
  2573. {
  2574. clt->max_reconnect_attempts = (unsigned int)value;
  2575. }
  2576. int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt)
  2577. {
  2578. return (int)clt->max_reconnect_attempts;
  2579. }
  2580. /**
  2581. * rtrs_clt_request() - Request data transfer to/from server via RDMA.
  2582. *
  2583. * @dir: READ/WRITE
  2584. * @ops: callback function to be called as confirmation, and the pointer.
  2585. * @clt: Session
  2586. * @permit: Preallocated permit
  2587. * @vec: Message that is sent to server together with the request.
  2588. * Sum of len of all @vec elements limited to <= IO_MSG_SIZE.
  2589. * Since the msg is copied internally it can be allocated on stack.
  2590. * @nr: Number of elements in @vec.
  2591. * @data_len: length of data sent to/from server
  2592. * @sg: Pages to be sent/received to/from server.
  2593. * @sg_cnt: Number of elements in the @sg
  2594. *
  2595. * Return:
  2596. * 0: Success
  2597. * <0: Error
  2598. *
  2599. * On dir=READ rtrs client will request a data transfer from Server to client.
  2600. * The data that the server will respond with will be stored in @sg when
  2601. * the user receives an %RTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL event.
  2602. * On dir=WRITE rtrs client will rdma write data in sg to server side.
  2603. */
  2604. int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops,
  2605. struct rtrs_clt_sess *clt, struct rtrs_permit *permit,
  2606. const struct kvec *vec, size_t nr, size_t data_len,
  2607. struct scatterlist *sg, unsigned int sg_cnt)
  2608. {
  2609. struct rtrs_clt_io_req *req;
  2610. struct rtrs_clt_path *clt_path;
  2611. enum dma_data_direction dma_dir;
  2612. int err = -ECONNABORTED, i;
  2613. size_t usr_len, hdr_len;
  2614. struct path_it it;
  2615. /* Get kvec length */
  2616. for (i = 0, usr_len = 0; i < nr; i++)
  2617. usr_len += vec[i].iov_len;
  2618. if (dir == READ) {
  2619. hdr_len = sizeof(struct rtrs_msg_rdma_read) +
  2620. sg_cnt * sizeof(struct rtrs_sg_desc);
  2621. dma_dir = DMA_FROM_DEVICE;
  2622. } else {
  2623. hdr_len = sizeof(struct rtrs_msg_rdma_write);
  2624. dma_dir = DMA_TO_DEVICE;
  2625. }
  2626. rcu_read_lock();
  2627. for (path_it_init(&it, clt);
  2628. (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) {
  2629. if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
  2630. continue;
  2631. if (usr_len + hdr_len > clt_path->max_hdr_size) {
  2632. rtrs_wrn_rl(clt_path->clt,
  2633. "%s request failed, user message size is %zu and header length %zu, but max size is %u\n",
  2634. dir == READ ? "Read" : "Write",
  2635. usr_len, hdr_len, clt_path->max_hdr_size);
  2636. err = -EMSGSIZE;
  2637. break;
  2638. }
  2639. req = rtrs_clt_get_req(clt_path, ops->conf_fn, permit, ops->priv,
  2640. vec, usr_len, sg, sg_cnt, data_len,
  2641. dma_dir);
  2642. if (dir == READ)
  2643. err = rtrs_clt_read_req(req);
  2644. else
  2645. err = rtrs_clt_write_req(req);
  2646. if (err) {
  2647. req->in_use = false;
  2648. continue;
  2649. }
  2650. /* Success path */
  2651. break;
  2652. }
  2653. path_it_deinit(&it);
  2654. rcu_read_unlock();
  2655. return err;
  2656. }
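/*
 * Note on path selection above: paths are iterated under RCU via
 * path_it_init()/next_path() according to the session's multipath policy
 * (MP_POLICY_MIN_INFLIGHT by default, see alloc_clt()), skipping paths
 * that are not CONNECTED. A request that fails on one path releases the
 * req (in_use = false) and is retried on the next path; only when every
 * path has been tried does the error reach the caller.
 */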
  2657. EXPORT_SYMBOL(rtrs_clt_request);
  2658. int rtrs_clt_rdma_cq_direct(struct rtrs_clt_sess *clt, unsigned int index)
  2659. {
  2660. /* If no path, return -1 for block layer not to try again */
  2661. int cnt = -1;
  2662. struct rtrs_con *con;
  2663. struct rtrs_clt_path *clt_path;
  2664. struct path_it it;
  2665. rcu_read_lock();
  2666. for (path_it_init(&it, clt);
  2667. (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) {
  2668. if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
  2669. continue;
  2670. con = clt_path->s.con[index + 1];
  2671. cnt = ib_process_cq_direct(con->cq, -1);
  2672. if (cnt)
  2673. break;
  2674. }
  2675. path_it_deinit(&it);
  2676. rcu_read_unlock();
  2677. return cnt;
  2678. }
  2679. EXPORT_SYMBOL(rtrs_clt_rdma_cq_direct);
  2680. /**
  2681. * rtrs_clt_query() - queries RTRS session attributes
 * @clt: session pointer
 * @attr: query results for session attributes.
  2684. * Returns:
  2685. * 0 on success
  2686. * -ECOMM no connection to the server
  2687. */
  2688. int rtrs_clt_query(struct rtrs_clt_sess *clt, struct rtrs_attrs *attr)
  2689. {
  2690. if (!rtrs_clt_is_connected(clt))
  2691. return -ECOMM;
  2692. attr->queue_depth = clt->queue_depth;
  2693. attr->max_segments = clt->max_segments;
  2694. /* Cap max_io_size to min of remote buffer size and the fr pages */
  2695. attr->max_io_size = min_t(int, clt->max_io_size,
  2696. clt->max_segments * SZ_4K);
  2697. return 0;
  2698. }
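/*
 * For illustration (hypothetical values): if the server advertised
 * max_io_size = 128 KiB but max_segments was capped to 16 by
 * query_fast_reg_mode(), the attribute reported here is
 * min(128 KiB, 16 * SZ_4K) = 64 KiB, i.e. the remote buffer size capped by
 * what can be covered with 4 KiB fast-registration pages.
 */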
  2699. EXPORT_SYMBOL(rtrs_clt_query);
  2700. int rtrs_clt_create_path_from_sysfs(struct rtrs_clt_sess *clt,
  2701. struct rtrs_addr *addr)
  2702. {
  2703. struct rtrs_clt_path *clt_path;
  2704. int err;
  2705. clt_path = alloc_path(clt, addr, nr_cpu_ids, 0);
  2706. if (IS_ERR(clt_path))
  2707. return PTR_ERR(clt_path);
  2708. mutex_lock(&clt->paths_mutex);
  2709. if (clt->paths_num == 0) {
  2710. /*
  2711. * When all the paths are removed for a session,
  2712. * the addition of the first path is like a new session for
  2713. * the storage server
  2714. */
  2715. clt_path->for_new_clt = 1;
  2716. }
  2717. mutex_unlock(&clt->paths_mutex);
  2718. /*
  2719. * It is totally safe to add path in CONNECTING state: coming
  2720. * IO will never grab it. Also it is very important to add
  2721. * path before init, since init fires LINK_CONNECTED event.
  2722. */
  2723. rtrs_clt_add_path_to_arr(clt_path);
  2724. err = init_path(clt_path);
  2725. if (err)
  2726. goto close_path;
  2727. err = rtrs_clt_create_path_files(clt_path);
  2728. if (err)
  2729. goto close_path;
  2730. return 0;
  2731. close_path:
  2732. rtrs_clt_remove_path_from_arr(clt_path);
  2733. rtrs_clt_close_conns(clt_path, true);
  2734. free_percpu(clt_path->stats->pcpu_stats);
  2735. kfree(clt_path->stats);
  2736. free_path(clt_path);
  2737. return err;
  2738. }
static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev)
{
	if (!(dev->ib_dev->attrs.device_cap_flags &
	      IB_DEVICE_MEM_MGT_EXTENSIONS)) {
		pr_err("Memory registrations not supported.\n");
		return -ENOTSUPP;
	}

	return 0;
}

static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = {
	.init = rtrs_clt_ib_dev_init
};

static int __init rtrs_client_init(void)
{
	rtrs_rdma_dev_pd_init(0, &dev_pd);

	rtrs_clt_dev_class = class_create(THIS_MODULE, "rtrs-client");
	if (IS_ERR(rtrs_clt_dev_class)) {
		pr_err("Failed to create rtrs-client dev class\n");
		return PTR_ERR(rtrs_clt_dev_class);
	}
	rtrs_wq = alloc_workqueue("rtrs_client_wq", 0, 0);
	if (!rtrs_wq) {
		class_destroy(rtrs_clt_dev_class);
		return -ENOMEM;
	}

	return 0;
}

static void __exit rtrs_client_exit(void)
{
	destroy_workqueue(rtrs_wq);
	class_destroy(rtrs_clt_dev_class);
	rtrs_rdma_dev_pd_deinit(&dev_pd);
}

module_init(rtrs_client_init);
module_exit(rtrs_client_exit);