  1. // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
  2. /*
  3. * Copyright(c) 2020 - Cornelis Networks, Inc.
  4. * Copyright(c) 2015 - 2018 Intel Corporation.
  5. */
  6. #include <linux/mm.h>
  7. #include <linux/types.h>
  8. #include <linux/device.h>
  9. #include <linux/dmapool.h>
  10. #include <linux/slab.h>
  11. #include <linux/list.h>
  12. #include <linux/highmem.h>
  13. #include <linux/io.h>
  14. #include <linux/uio.h>
  15. #include <linux/rbtree.h>
  16. #include <linux/spinlock.h>
  17. #include <linux/delay.h>
  18. #include <linux/kthread.h>
  19. #include <linux/mmu_context.h>
  20. #include <linux/module.h>
  21. #include <linux/vmalloc.h>
  22. #include <linux/string.h>
  23. #include "hfi.h"
  24. #include "sdma.h"
  25. #include "user_sdma.h"
  26. #include "verbs.h" /* for the headers */
  27. #include "common.h" /* for struct hfi1_tid_info */
  28. #include "trace.h"
  29. static uint hfi1_sdma_comp_ring_size = 128;
  30. module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  31. MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  32. static unsigned initial_pkt_count = 8;
  33. static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  34. static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  35. static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  36. static void user_sdma_free_request(struct user_sdma_request *req);
  37. static int check_header_template(struct user_sdma_request *req,
  38. struct hfi1_pkt_header *hdr, u32 lrhlen,
  39. u32 datalen);
  40. static int set_txreq_header(struct user_sdma_request *req,
  41. struct user_sdma_txreq *tx, u32 datalen);
  42. static int set_txreq_header_ahg(struct user_sdma_request *req,
  43. struct user_sdma_txreq *tx, u32 len);
  44. static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  45. struct hfi1_user_sdma_comp_q *cq,
  46. u16 idx, enum hfi1_sdma_comp_state state,
  47. int ret);
  48. static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  49. static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  50. static int defer_packet_queue(
  51. struct sdma_engine *sde,
  52. struct iowait_work *wait,
  53. struct sdma_txreq *txreq,
  54. uint seq,
  55. bool pkts_sent);
  56. static void activate_packet_queue(struct iowait *wait, int reason);
  57. static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
  58. unsigned long len);
  59. static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
  60. void *arg2, bool *stop);
  61. static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
  62. static struct mmu_rb_ops sdma_rb_ops = {
  63. .filter = sdma_rb_filter,
  64. .evict = sdma_rb_evict,
  65. .remove = sdma_rb_remove,
  66. };
  67. static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
  68. struct user_sdma_txreq *tx,
  69. struct user_sdma_iovec *iovec,
  70. u32 *pkt_remaining);
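/*
 * iowait 'sleep' callback, invoked when a submission cannot make progress
 * because the SDMA descriptor ring is full. If the ring has advanced since
 * the failed attempt, return -EAGAIN so the caller simply retries.
 * Otherwise mark the packet queue deferred, queue pq->busy on the engine's
 * dmawait list (unless it is already queued) and return -EBUSY so the
 * submitter waits for the engine to drain.
 */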
  71. static int defer_packet_queue(
  72. struct sdma_engine *sde,
  73. struct iowait_work *wait,
  74. struct sdma_txreq *txreq,
  75. uint seq,
  76. bool pkts_sent)
  77. {
  78. struct hfi1_user_sdma_pkt_q *pq =
  79. container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  80. write_seqlock(&sde->waitlock);
  81. trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  82. if (sdma_progress(sde, seq, txreq))
  83. goto eagain;
  84. /*
  85. * We are assuming that if the list is enqueued somewhere, it
  86. * is to the dmawait list since that is the only place where
  87. * it is supposed to be enqueued.
  88. */
  89. xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  90. if (list_empty(&pq->busy.list)) {
  91. pq->busy.lock = &sde->waitlock;
  92. iowait_get_priority(&pq->busy);
  93. iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  94. }
  95. write_sequnlock(&sde->waitlock);
  96. return -EBUSY;
  97. eagain:
  98. write_sequnlock(&sde->waitlock);
  99. return -EAGAIN;
  100. }
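/*
 * iowait 'wakeup' callback: the engine can accept work again, so mark the
 * packet queue active and wake any submitter sleeping on pq->busy.wait_dma
 * in hfi1_user_sdma_process_request().
 */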
  101. static void activate_packet_queue(struct iowait *wait, int reason)
  102. {
  103. struct hfi1_user_sdma_pkt_q *pq =
  104. container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
  105. trace_hfi1_usdma_activate(pq, wait, reason);
  106. xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
  107. wake_up(&wait->wait_dma);
  108. }
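/*
 * Allocate the per-context/subcontext user SDMA state: the packet queue
 * (request array, in-use bitmap, txreq slab cache) and the user-mappable
 * completion ring, and register the pinned-page cache with the MMU rb-tree
 * handler. On success the queues are published via fd->pq and fd->cq.
 */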
  109. int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
  110. struct hfi1_filedata *fd)
  111. {
  112. int ret = -ENOMEM;
  113. char buf[64];
  114. struct hfi1_devdata *dd;
  115. struct hfi1_user_sdma_comp_q *cq;
  116. struct hfi1_user_sdma_pkt_q *pq;
  117. if (!uctxt || !fd)
  118. return -EBADF;
  119. if (!hfi1_sdma_comp_ring_size)
  120. return -EINVAL;
  121. dd = uctxt->dd;
  122. pq = kzalloc(sizeof(*pq), GFP_KERNEL);
  123. if (!pq)
  124. return -ENOMEM;
  125. pq->dd = dd;
  126. pq->ctxt = uctxt->ctxt;
  127. pq->subctxt = fd->subctxt;
  128. pq->n_max_reqs = hfi1_sdma_comp_ring_size;
  129. atomic_set(&pq->n_reqs, 0);
  130. init_waitqueue_head(&pq->wait);
  131. atomic_set(&pq->n_locked, 0);
  132. iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
  133. activate_packet_queue, NULL, NULL);
  134. pq->reqidx = 0;
  135. pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
  136. sizeof(*pq->reqs),
  137. GFP_KERNEL);
  138. if (!pq->reqs)
  139. goto pq_reqs_nomem;
  140. pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
  141. if (!pq->req_in_use)
  142. goto pq_reqs_no_in_use;
  143. snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
  144. fd->subctxt);
  145. pq->txreq_cache = kmem_cache_create(buf,
  146. sizeof(struct user_sdma_txreq),
  147. L1_CACHE_BYTES,
  148. SLAB_HWCACHE_ALIGN,
  149. NULL);
  150. if (!pq->txreq_cache) {
  151. dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
  152. uctxt->ctxt);
  153. goto pq_txreq_nomem;
  154. }
  155. cq = kzalloc(sizeof(*cq), GFP_KERNEL);
  156. if (!cq)
  157. goto cq_nomem;
  158. cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
  159. * hfi1_sdma_comp_ring_size));
  160. if (!cq->comps)
  161. goto cq_comps_nomem;
  162. cq->nentries = hfi1_sdma_comp_ring_size;
  163. ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
  164. &pq->handler);
  165. if (ret) {
  166. dd_dev_err(dd, "Failed to register with MMU %d", ret);
  167. goto pq_mmu_fail;
  168. }
  169. rcu_assign_pointer(fd->pq, pq);
  170. fd->cq = cq;
  171. return 0;
  172. pq_mmu_fail:
  173. vfree(cq->comps);
  174. cq_comps_nomem:
  175. kfree(cq);
  176. cq_nomem:
  177. kmem_cache_destroy(pq->txreq_cache);
  178. pq_txreq_nomem:
  179. bitmap_free(pq->req_in_use);
  180. pq_reqs_no_in_use:
  181. kfree(pq->reqs);
  182. pq_reqs_nomem:
  183. kfree(pq);
  184. return ret;
  185. }
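/*
 * Remove pq->busy from the SDMA engine's dmawait list, if it is queued
 * there, so a packet queue that stopped waiting is not left on the list.
 * Used when a wait times out and on queue teardown.
 */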
  186. static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
  187. {
  188. unsigned long flags;
  189. seqlock_t *lock = pq->busy.lock;
  190. if (!lock)
  191. return;
  192. write_seqlock_irqsave(lock, flags);
  193. if (!list_empty(&pq->busy.list)) {
  194. list_del_init(&pq->busy.list);
  195. pq->busy.lock = NULL;
  196. }
  197. write_sequnlock_irqrestore(lock, flags);
  198. }
  199. int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
  200. struct hfi1_ctxtdata *uctxt)
  201. {
  202. struct hfi1_user_sdma_pkt_q *pq;
  203. trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
  204. spin_lock(&fd->pq_rcu_lock);
  205. pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
  206. lockdep_is_held(&fd->pq_rcu_lock));
  207. if (pq) {
  208. rcu_assign_pointer(fd->pq, NULL);
  209. spin_unlock(&fd->pq_rcu_lock);
  210. synchronize_srcu(&fd->pq_srcu);
  211. /* at this point there can be no more new requests */
  212. iowait_sdma_drain(&pq->busy);
  213. /* Wait until all requests have been freed. */
  214. wait_event_interruptible(
  215. pq->wait,
  216. !atomic_read(&pq->n_reqs));
  217. kfree(pq->reqs);
  218. if (pq->handler)
  219. hfi1_mmu_rb_unregister(pq->handler);
  220. bitmap_free(pq->req_in_use);
  221. kmem_cache_destroy(pq->txreq_cache);
  222. flush_pq_iowait(pq);
  223. kfree(pq);
  224. } else {
  225. spin_unlock(&fd->pq_rcu_lock);
  226. }
  227. if (fd->cq) {
  228. vfree(fd->cq->comps);
  229. kfree(fd->cq);
  230. fd->cq = NULL;
  231. }
  232. return 0;
  233. }
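/*
 * Hash the DLID down to a small selector so traffic to the same destination
 * tends to be steered to the same SDMA engine. Selectors are handed out
 * lazily, one per hash bucket, from a static table.
 */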
  234. static u8 dlid_to_selector(u16 dlid)
  235. {
  236. static u8 mapping[256];
  237. static int initialized;
  238. static u8 next;
  239. int hash;
  240. if (!initialized) {
  241. memset(mapping, 0xFF, 256);
  242. initialized = 1;
  243. }
  244. hash = ((dlid >> 8) ^ dlid) & 0xFF;
  245. if (mapping[hash] == 0xFF) {
  246. mapping[hash] = next;
  247. next = (next + 1) & 0x7F;
  248. }
  249. return mapping[hash];
  250. }
  251. /**
  252. * hfi1_user_sdma_process_request() - Process and start a user sdma request
  253. * @fd: valid file descriptor
  254. * @iovec: array of io vectors to process
  255. * @dim: overall iovec array size
  256. * @count: number of io vector array entries processed
  257. */
  258. int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
  259. struct iovec *iovec, unsigned long dim,
  260. unsigned long *count)
  261. {
  262. int ret = 0, i;
  263. struct hfi1_ctxtdata *uctxt = fd->uctxt;
  264. struct hfi1_user_sdma_pkt_q *pq =
  265. srcu_dereference(fd->pq, &fd->pq_srcu);
  266. struct hfi1_user_sdma_comp_q *cq = fd->cq;
  267. struct hfi1_devdata *dd = pq->dd;
  268. unsigned long idx = 0;
  269. u8 pcount = initial_pkt_count;
  270. struct sdma_req_info info;
  271. struct user_sdma_request *req;
  272. u8 opcode, sc, vl;
  273. u16 pkey;
  274. u32 slid;
  275. u16 dlid;
  276. u32 selector;
  277. if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
  278. hfi1_cdbg(
  279. SDMA,
  280. "[%u:%u:%u] First vector not big enough for header %lu/%lu",
  281. dd->unit, uctxt->ctxt, fd->subctxt,
  282. iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
  283. return -EINVAL;
  284. }
  285. ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
  286. if (ret) {
  287. hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
  288. dd->unit, uctxt->ctxt, fd->subctxt, ret);
  289. return -EFAULT;
  290. }
  291. trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
  292. (u16 *)&info);
  293. if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
  294. hfi1_cdbg(SDMA,
  295. "[%u:%u:%u:%u] Invalid comp index",
  296. dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
  297. return -EINVAL;
  298. }
  299. /*
  300. * Sanity check the header io vector count: it must be at least 1
  301. * (the header) and cannot be larger than the actual io vector count.
  302. */
  303. if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
  304. hfi1_cdbg(SDMA,
  305. "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
  306. dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
  307. req_iovcnt(info.ctrl), dim);
  308. return -EINVAL;
  309. }
  310. if (!info.fragsize) {
  311. hfi1_cdbg(SDMA,
  312. "[%u:%u:%u:%u] Request does not specify fragsize",
  313. dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
  314. return -EINVAL;
  315. }
  316. /* Try to claim the request. */
  317. if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
  318. hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
  319. dd->unit, uctxt->ctxt, fd->subctxt,
  320. info.comp_idx);
  321. return -EBADSLT;
  322. }
  323. /*
  324. * All safety checks have been done and this request has been claimed.
  325. */
  326. trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
  327. info.comp_idx);
  328. req = pq->reqs + info.comp_idx;
  329. req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
  330. req->data_len = 0;
  331. req->pq = pq;
  332. req->cq = cq;
  333. req->ahg_idx = -1;
  334. req->iov_idx = 0;
  335. req->sent = 0;
  336. req->seqnum = 0;
  337. req->seqcomp = 0;
  338. req->seqsubmitted = 0;
  339. req->tids = NULL;
  340. req->has_error = 0;
  341. INIT_LIST_HEAD(&req->txps);
  342. memcpy(&req->info, &info, sizeof(info));
  343. /* The request is initialized, count it */
  344. atomic_inc(&pq->n_reqs);
  345. if (req_opcode(info.ctrl) == EXPECTED) {
  346. /* expected must have a TID info and at least one data vector */
  347. if (req->data_iovs < 2) {
  348. SDMA_DBG(req,
  349. "Not enough vectors for expected request");
  350. ret = -EINVAL;
  351. goto free_req;
  352. }
  353. req->data_iovs--;
  354. }
  355. if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
  356. SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
  357. MAX_VECTORS_PER_REQ);
  358. ret = -EINVAL;
  359. goto free_req;
  360. }
  361. /* Copy the header from the user buffer */
  362. ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
  363. sizeof(req->hdr));
  364. if (ret) {
  365. SDMA_DBG(req, "Failed to copy header template (%d)", ret);
  366. ret = -EFAULT;
  367. goto free_req;
  368. }
  369. /* If Static rate control is not enabled, sanitize the header. */
  370. if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
  371. req->hdr.pbc[2] = 0;
  372. /* Validate the opcode. Do not trust packets from user space blindly. */
  373. opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
  374. if ((opcode & USER_OPCODE_CHECK_MASK) !=
  375. USER_OPCODE_CHECK_VAL) {
  376. SDMA_DBG(req, "Invalid opcode (%d)", opcode);
  377. ret = -EINVAL;
  378. goto free_req;
  379. }
  380. /*
  381. * Validate the vl. Do not trust packets from user space blindly.
  382. * VL comes from PBC, SC comes from LRH, and the VL needs to
  383. * match the SC look up.
  384. */
  385. vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
  386. sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
  387. (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
  388. if (vl >= dd->pport->vls_operational ||
  389. vl != sc_to_vlt(dd, sc)) {
  390. SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
  391. ret = -EINVAL;
  392. goto free_req;
  393. }
  394. /* Checking P_KEY for requests from user-space */
  395. pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
  396. slid = be16_to_cpu(req->hdr.lrh[3]);
  397. if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
  398. ret = -EINVAL;
  399. goto free_req;
  400. }
  401. /*
  402. * Also should check the BTH.lnh. If it says the next header is GRH then
  403. * the RXE parsing will be off and will land in the middle of the KDETH
  404. * or miss it entirely.
  405. */
  406. if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
  407. SDMA_DBG(req, "User tried to pass in a GRH");
  408. ret = -EINVAL;
  409. goto free_req;
  410. }
  411. req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
  412. /*
  413. * Calculate the initial TID offset based on the values of
  414. * KDETH.OFFSET and KDETH.OM that are passed in.
  415. */
  416. req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
  417. (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
  418. KDETH_OM_LARGE : KDETH_OM_SMALL);
  419. trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
  420. info.comp_idx, req->tidoffset);
  421. idx++;
  422. /* Save all the IO vector structures */
  423. for (i = 0; i < req->data_iovs; i++) {
  424. req->iovs[i].offset = 0;
  425. INIT_LIST_HEAD(&req->iovs[i].list);
  426. memcpy(&req->iovs[i].iov,
  427. iovec + idx++,
  428. sizeof(req->iovs[i].iov));
  429. if (req->iovs[i].iov.iov_len == 0) {
  430. ret = -EINVAL;
  431. goto free_req;
  432. }
  433. req->data_len += req->iovs[i].iov.iov_len;
  434. }
  435. trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
  436. info.comp_idx, req->data_len);
  437. if (pcount > req->info.npkts)
  438. pcount = req->info.npkts;
  439. /*
  440. * Copy any TID info
  441. * User space will provide the TID info only when the
  442. * request type is EXPECTED. This is true even if there is
  443. * only one packet in the request and the header is already
  444. * set up. The reason for the singular TID case is that the
  445. * driver needs to perform safety checks.
  446. */
  447. if (req_opcode(req->info.ctrl) == EXPECTED) {
  448. u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
  449. u32 *tmp;
  450. if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
  451. ret = -EINVAL;
  452. goto free_req;
  453. }
  454. /*
  455. * We have to copy all of the tids because they may vary
  456. * in size and, therefore, the TID count might not be
  457. * equal to the pkt count. However, there is no way to
  458. * tell at this point.
  459. */
  460. tmp = memdup_user(iovec[idx].iov_base,
  461. ntids * sizeof(*req->tids));
  462. if (IS_ERR(tmp)) {
  463. ret = PTR_ERR(tmp);
  464. SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
  465. ntids, ret);
  466. goto free_req;
  467. }
  468. req->tids = tmp;
  469. req->n_tids = ntids;
  470. req->tididx = 0;
  471. idx++;
  472. }
  473. dlid = be16_to_cpu(req->hdr.lrh[1]);
  474. selector = dlid_to_selector(dlid);
  475. selector += uctxt->ctxt + fd->subctxt;
  476. req->sde = sdma_select_user_engine(dd, selector, vl);
  477. if (!req->sde || !sdma_running(req->sde)) {
  478. ret = -ECOMM;
  479. goto free_req;
  480. }
  481. /* We don't need an AHG entry if the request contains only one packet */
  482. if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
  483. req->ahg_idx = sdma_ahg_alloc(req->sde);
  484. set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
  485. pq->state = SDMA_PKT_Q_ACTIVE;
  486. /*
  487. * This is a somewhat blocking send implementation.
  488. * The driver will block the caller until all packets of the
  489. * request have been submitted to the SDMA engine. However, it
  490. * will not wait for send completions.
  491. */
  492. while (req->seqsubmitted != req->info.npkts) {
  493. ret = user_sdma_send_pkts(req, pcount);
  494. if (ret < 0) {
  495. int we_ret;
  496. if (ret != -EBUSY)
  497. goto free_req;
  498. we_ret = wait_event_interruptible_timeout(
  499. pq->busy.wait_dma,
  500. pq->state == SDMA_PKT_Q_ACTIVE,
  501. msecs_to_jiffies(
  502. SDMA_IOWAIT_TIMEOUT));
  503. trace_hfi1_usdma_we(pq, we_ret);
  504. if (we_ret <= 0)
  505. flush_pq_iowait(pq);
  506. }
  507. }
  508. *count += idx;
  509. return 0;
  510. free_req:
  511. /*
  512. * If seqsubmitted == npkts, the completion routine controls the
  513. * final state. If seqsubmitted < npkts, wait for any outstanding
  514. * packets to finish before cleaning up.
  515. */
  516. if (req->seqsubmitted < req->info.npkts) {
  517. if (req->seqsubmitted)
  518. wait_event(pq->busy.wait_dma,
  519. (req->seqcomp == req->seqsubmitted - 1));
  520. user_sdma_free_request(req);
  521. pq_update(pq);
  522. set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
  523. }
  524. return ret;
  525. }
  526. static inline u32 compute_data_length(struct user_sdma_request *req,
  527. struct user_sdma_txreq *tx)
  528. {
  529. /*
  530. * Determine the proper size of the packet data.
  531. * The size of the data of the first packet is in the header
  532. * template. However, it includes the header and ICRC, which need
  533. * to be subtracted.
  534. * The minimum representable packet data length in a header is 4 bytes,
  535. * therefore, when the data length request is less than 4 bytes, there's
  536. * only one packet, and the packet data length is equal to that of the
  537. * request data length.
  538. * The size of the remaining packets is the minimum of the frag
  539. * size (MTU) or remaining data in the request.
  540. */
  541. u32 len;
  542. if (!req->seqnum) {
  543. if (req->data_len < sizeof(u32))
  544. len = req->data_len;
  545. else
  546. len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
  547. (sizeof(tx->hdr) - 4));
  548. } else if (req_opcode(req->info.ctrl) == EXPECTED) {
  549. u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
  550. PAGE_SIZE;
  551. /*
  552. * Get the data length based on the remaining space in the
  553. * TID pair.
  554. */
  555. len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
  556. /* If we've filled up the TID pair, move to the next one. */
  557. if (unlikely(!len) && ++req->tididx < req->n_tids &&
  558. req->tids[req->tididx]) {
  559. tidlen = EXP_TID_GET(req->tids[req->tididx],
  560. LEN) * PAGE_SIZE;
  561. req->tidoffset = 0;
  562. len = min_t(u32, tidlen, req->info.fragsize);
  563. }
  564. /*
  565. * Since the TID pairs map entire pages, make sure that we
  566. * are not going to try to send more data than we have
  567. * remaining.
  568. */
  569. len = min(len, req->data_len - req->sent);
  570. } else {
  571. len = min(req->data_len - req->sent, (u32)req->info.fragsize);
  572. }
  573. trace_hfi1_sdma_user_compute_length(req->pq->dd,
  574. req->pq->ctxt,
  575. req->pq->subctxt,
  576. req->info.comp_idx,
  577. len);
  578. return len;
  579. }
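/* Round a payload length up to the next 4-byte (dword) boundary, e.g. 7 -> 8. */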
  580. static inline u32 pad_len(u32 len)
  581. {
  582. if (len & (sizeof(u32) - 1))
  583. len += sizeof(u32) - (len & (sizeof(u32) - 1));
  584. return len;
  585. }
  586. static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
  587. {
  588. /* (Size of complete header - size of PBC) + 4B ICRC + data length */
  589. return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
  590. }
  591. static int user_sdma_txadd_ahg(struct user_sdma_request *req,
  592. struct user_sdma_txreq *tx,
  593. u32 datalen)
  594. {
  595. int ret;
  596. u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
  597. u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
  598. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  599. /*
  600. * Copy the request header into the tx header
  601. * because the HW needs a cacheline-aligned
  602. * address.
  603. * This copy could be optimized out if the hdr
  604. * member of user_sdma_request were also
  605. * cacheline aligned.
  606. */
  607. memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
  608. if (PBC2LRH(pbclen) != lrhlen) {
  609. pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
  610. tx->hdr.pbc[0] = cpu_to_le16(pbclen);
  611. }
  612. ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
  613. if (ret)
  614. return ret;
  615. ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
  616. sizeof(tx->hdr) + datalen, req->ahg_idx,
  617. 0, NULL, 0, user_sdma_txreq_cb);
  618. if (ret)
  619. return ret;
  620. ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
  621. if (ret)
  622. sdma_txclean(pq->dd, &tx->txreq);
  623. return ret;
  624. }
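/*
 * Build and queue up to maxpkts packets for this request: for each packet,
 * allocate a txreq, compute the payload length, set up the header (either
 * an AHG delta or a full per-packet copy), attach the pinned user pages,
 * then hand the accumulated list to sdma_send_txlist(). Returns 0 or the
 * submission error (e.g. -EBUSY when the engine ring is full).
 */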
  625. static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
  626. {
  627. int ret = 0;
  628. u16 count;
  629. unsigned npkts = 0;
  630. struct user_sdma_txreq *tx = NULL;
  631. struct hfi1_user_sdma_pkt_q *pq = NULL;
  632. struct user_sdma_iovec *iovec = NULL;
  633. if (!req->pq)
  634. return -EINVAL;
  635. pq = req->pq;
  636. /* If tx completion has reported an error, we are done. */
  637. if (READ_ONCE(req->has_error))
  638. return -EFAULT;
  639. /*
  640. * Check if we might have sent the entire request already
  641. */
  642. if (unlikely(req->seqnum == req->info.npkts)) {
  643. if (!list_empty(&req->txps))
  644. goto dosend;
  645. return ret;
  646. }
  647. if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
  648. maxpkts = req->info.npkts - req->seqnum;
  649. while (npkts < maxpkts) {
  650. u32 datalen = 0;
  651. /*
  652. * Check whether any of the completions have come back
  653. * with errors. If so, we are not going to process any
  654. * more packets from this request.
  655. */
  656. if (READ_ONCE(req->has_error))
  657. return -EFAULT;
  658. tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
  659. if (!tx)
  660. return -ENOMEM;
  661. tx->flags = 0;
  662. tx->req = req;
  663. INIT_LIST_HEAD(&tx->list);
  664. /*
  665. * For the last packet set the ACK request
  666. * and disable header suppression.
  667. */
  668. if (req->seqnum == req->info.npkts - 1)
  669. tx->flags |= (TXREQ_FLAGS_REQ_ACK |
  670. TXREQ_FLAGS_REQ_DISABLE_SH);
  671. /*
  672. * Calculate the payload size - this is min of the fragment
  673. * (MTU) size or the remaining bytes in the request but only
  674. * if we have payload data.
  675. */
  676. if (req->data_len) {
  677. iovec = &req->iovs[req->iov_idx];
  678. if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
  679. if (++req->iov_idx == req->data_iovs) {
  680. ret = -EFAULT;
  681. goto free_tx;
  682. }
  683. iovec = &req->iovs[req->iov_idx];
  684. WARN_ON(iovec->offset);
  685. }
  686. datalen = compute_data_length(req, tx);
  687. /*
  688. * Disable header suppression for the payload <= 8DWS.
  689. * If there is an uncorrectable error in the receive
  690. * data FIFO when the received payload size is less than
  691. * or equal to 8DWS then the RxDmaDataFifoRdUncErr is
  692. * not reported; RHF.EccErr is set instead if the header
  693. * is not suppressed.
  694. */
  695. if (!datalen) {
  696. SDMA_DBG(req,
  697. "Request has data but pkt len is 0");
  698. ret = -EFAULT;
  699. goto free_tx;
  700. } else if (datalen <= 32) {
  701. tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
  702. }
  703. }
  704. if (req->ahg_idx >= 0) {
  705. if (!req->seqnum) {
  706. ret = user_sdma_txadd_ahg(req, tx, datalen);
  707. if (ret)
  708. goto free_tx;
  709. } else {
  710. int changes;
  711. changes = set_txreq_header_ahg(req, tx,
  712. datalen);
  713. if (changes < 0) {
  714. ret = changes;
  715. goto free_tx;
  716. }
  717. }
  718. } else {
  719. ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
  720. datalen, user_sdma_txreq_cb);
  721. if (ret)
  722. goto free_tx;
  723. /*
  724. * Modify the header for this packet. This only needs
  725. * to be done if we are not going to use AHG. Otherwise,
  726. * the HW will do it based on the changes we gave it
  727. * during sdma_txinit_ahg().
  728. */
  729. ret = set_txreq_header(req, tx, datalen);
  730. if (ret)
  731. goto free_txreq;
  732. }
  733. req->koffset += datalen;
  734. if (req_opcode(req->info.ctrl) == EXPECTED)
  735. req->tidoffset += datalen;
  736. req->sent += datalen;
  737. while (datalen) {
  738. ret = add_system_pages_to_sdma_packet(req, tx, iovec,
  739. &datalen);
  740. if (ret)
  741. goto free_txreq;
  742. iovec = &req->iovs[req->iov_idx];
  743. }
  744. list_add_tail(&tx->txreq.list, &req->txps);
  745. /*
  746. * It is important to increment this here as it is used to
  747. * generate the BTH.PSN and, therefore, can't be bulk-updated
  748. * outside of the loop.
  749. */
  750. tx->seqnum = req->seqnum++;
  751. npkts++;
  752. }
  753. dosend:
  754. ret = sdma_send_txlist(req->sde,
  755. iowait_get_ib_work(&pq->busy),
  756. &req->txps, &count);
  757. req->seqsubmitted += count;
  758. if (req->seqsubmitted == req->info.npkts) {
  759. /*
  760. * The txreq has already been submitted to the HW queue
  761. * so we can free the AHG entry now. Corruption will not
  762. * happen due to the sequential manner in which
  763. * descriptors are processed.
  764. */
  765. if (req->ahg_idx >= 0)
  766. sdma_ahg_free(req->sde, req->ahg_idx);
  767. }
  768. return ret;
  769. free_txreq:
  770. sdma_txclean(pq->dd, &tx->txreq);
  771. free_tx:
  772. kmem_cache_free(pq->txreq_cache, tx);
  773. return ret;
  774. }
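/*
 * Ask the MMU rb-tree handler to evict cached pinnings until roughly
 * npages pages have been released; returns how many were actually cleared.
 */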
  775. static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
  776. {
  777. struct evict_data evict_data;
  778. struct mmu_rb_handler *handler = pq->handler;
  779. evict_data.cleared = 0;
  780. evict_data.target = npages;
  781. hfi1_mmu_rb_evict(handler, &evict_data);
  782. return evict_data.cleared;
  783. }
  784. static int check_header_template(struct user_sdma_request *req,
  785. struct hfi1_pkt_header *hdr, u32 lrhlen,
  786. u32 datalen)
  787. {
  788. /*
  789. * Perform safety checks for any type of packet:
  790. * - transfer size is a multiple of 64 bytes
  791. * - packet length is multiple of 4 bytes
  792. * - packet length is not larger than MTU size
  793. *
  794. * These checks are only done for the first packet of the
  795. * transfer since the header is "given" to us by user space.
  796. * For the remainder of the packets we compute the values.
  797. */
  798. if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
  799. lrhlen > get_lrh_len(*hdr, req->info.fragsize))
  800. return -EINVAL;
  801. if (req_opcode(req->info.ctrl) == EXPECTED) {
  802. /*
  803. * The header is checked only on the first packet. Furthermore,
  804. * we ensure that at least one TID entry is copied when the
  805. * request is submitted. Therefore, we don't have to verify that
  806. * tididx points to something sane.
  807. */
  808. u32 tidval = req->tids[req->tididx],
  809. tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
  810. tididx = EXP_TID_GET(tidval, IDX),
  811. tidctrl = EXP_TID_GET(tidval, CTRL),
  812. tidoff;
  813. __le32 kval = hdr->kdeth.ver_tid_offset;
  814. tidoff = KDETH_GET(kval, OFFSET) *
  815. (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
  816. KDETH_OM_LARGE : KDETH_OM_SMALL);
  817. /*
  818. * Expected receive packets have the following
  819. * additional checks:
  820. * - offset is not larger than the TID size
  821. * - TIDCtrl values match between header and TID array
  822. * - TID indexes match between header and TID array
  823. */
  824. if ((tidoff + datalen > tidlen) ||
  825. KDETH_GET(kval, TIDCTRL) != tidctrl ||
  826. KDETH_GET(kval, TID) != tididx)
  827. return -EINVAL;
  828. }
  829. return 0;
  830. }
  831. /*
  832. * Correctly set the BTH.PSN field based on type of
  833. * transfer - eager packets can just increment the PSN but
  834. * expected packets encode generation and sequence in the
  835. * BTH.PSN field so just incrementing will result in errors.
  836. */
  837. static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
  838. {
  839. u32 val = be32_to_cpu(bthpsn),
  840. mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
  841. 0xffffffull),
  842. psn = val & mask;
  843. if (expct)
  844. psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
  845. ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
  846. else
  847. psn = psn + frags;
  848. return psn & mask;
  849. }
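/*
 * Non-AHG path: build the complete header for this packet from the request
 * template (fix up the PBC/LRH lengths, advance BTH.PSN, refresh the KDETH
 * offset and, for expected receives, the TID fields) and add it to the
 * txreq as a kvaddr descriptor.
 */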
  850. static int set_txreq_header(struct user_sdma_request *req,
  851. struct user_sdma_txreq *tx, u32 datalen)
  852. {
  853. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  854. struct hfi1_pkt_header *hdr = &tx->hdr;
  855. u8 omfactor; /* KDETH.OM */
  856. u16 pbclen;
  857. int ret;
  858. u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
  859. /* Copy the header template to the request before modification */
  860. memcpy(hdr, &req->hdr, sizeof(*hdr));
  861. /*
  862. * Check if the PBC and LRH length are mismatched. If so
  863. * adjust both in the header.
  864. */
  865. pbclen = le16_to_cpu(hdr->pbc[0]);
  866. if (PBC2LRH(pbclen) != lrhlen) {
  867. pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
  868. hdr->pbc[0] = cpu_to_le16(pbclen);
  869. hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
  870. /*
  871. * Third packet
  872. * This is the first packet in the sequence that has
  873. * a "static" size that can be used for the rest of
  874. * the packets (besides the last one).
  875. */
  876. if (unlikely(req->seqnum == 2)) {
  877. /*
  878. * From this point on the lengths in both the
  879. * PBC and LRH are the same until the last
  880. * packet.
  881. * Adjust the template so we don't have to update
  882. * every packet
  883. */
  884. req->hdr.pbc[0] = hdr->pbc[0];
  885. req->hdr.lrh[2] = hdr->lrh[2];
  886. }
  887. }
  888. /*
  889. * We only have to modify the header if this is not the
  890. * first packet in the request. Otherwise, we use the
  891. * header given to us.
  892. */
  893. if (unlikely(!req->seqnum)) {
  894. ret = check_header_template(req, hdr, lrhlen, datalen);
  895. if (ret)
  896. return ret;
  897. goto done;
  898. }
  899. hdr->bth[2] = cpu_to_be32(
  900. set_pkt_bth_psn(hdr->bth[2],
  901. (req_opcode(req->info.ctrl) == EXPECTED),
  902. req->seqnum));
  903. /* Set ACK request on last packet */
  904. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
  905. hdr->bth[2] |= cpu_to_be32(1UL << 31);
  906. /* Set the new offset */
  907. hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
  908. /* Expected packets have to fill in the new TID information */
  909. if (req_opcode(req->info.ctrl) == EXPECTED) {
  910. tidval = req->tids[req->tididx];
  911. /*
  912. * If the offset puts us at the end of the current TID,
  913. * advance everything.
  914. */
  915. if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
  916. PAGE_SIZE)) {
  917. req->tidoffset = 0;
  918. /*
  919. * Since we don't copy all the TIDs at once,
  920. * we have to check again.
  921. */
  922. if (++req->tididx > req->n_tids - 1 ||
  923. !req->tids[req->tididx]) {
  924. return -EINVAL;
  925. }
  926. tidval = req->tids[req->tididx];
  927. }
  928. omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
  929. KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
  930. KDETH_OM_SMALL_SHIFT;
  931. /* Set KDETH.TIDCtrl based on value for this TID. */
  932. KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
  933. EXP_TID_GET(tidval, CTRL));
  934. /* Set KDETH.TID based on value for this TID */
  935. KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
  936. EXP_TID_GET(tidval, IDX));
  937. /* Clear KDETH.SH when DISABLE_SH flag is set */
  938. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
  939. KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
  940. /*
  941. * Set the KDETH.OFFSET and KDETH.OM based on size of
  942. * transfer.
  943. */
  944. trace_hfi1_sdma_user_tid_info(
  945. pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
  946. req->tidoffset, req->tidoffset >> omfactor,
  947. omfactor != KDETH_OM_SMALL_SHIFT);
  948. KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
  949. req->tidoffset >> omfactor);
  950. KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
  951. omfactor != KDETH_OM_SMALL_SHIFT);
  952. }
  953. done:
  954. trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
  955. req->info.comp_idx, hdr, tidval);
  956. return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
  957. }
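/*
 * AHG path: instead of copying a full header per packet, build a list of
 * header deltas (PBC/LRH length, BTH.PSN and ACK bit, KDETH offset, and the
 * TID fields for expected receives) that the hardware applies to the base
 * header. Returns the number of AHG entries used or a negative errno.
 */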
  958. static int set_txreq_header_ahg(struct user_sdma_request *req,
  959. struct user_sdma_txreq *tx, u32 datalen)
  960. {
  961. u32 ahg[AHG_KDETH_ARRAY_SIZE];
  962. int idx = 0;
  963. u8 omfactor; /* KDETH.OM */
  964. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  965. struct hfi1_pkt_header *hdr = &req->hdr;
  966. u16 pbclen = le16_to_cpu(hdr->pbc[0]);
  967. u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
  968. size_t array_size = ARRAY_SIZE(ahg);
  969. if (PBC2LRH(pbclen) != lrhlen) {
  970. /* PBC.PbcLengthDWs */
  971. idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
  972. (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
  973. if (idx < 0)
  974. return idx;
  975. /* LRH.PktLen (we need the full 16 bits due to byte swap) */
  976. idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
  977. (__force u16)cpu_to_be16(lrhlen >> 2));
  978. if (idx < 0)
  979. return idx;
  980. }
  981. /*
  982. * Do the common updates
  983. */
  984. /* BTH.PSN and BTH.A */
  985. val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
  986. (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
  987. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
  988. val32 |= 1UL << 31;
  989. idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
  990. (__force u16)cpu_to_be16(val32 >> 16));
  991. if (idx < 0)
  992. return idx;
  993. idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
  994. (__force u16)cpu_to_be16(val32 & 0xffff));
  995. if (idx < 0)
  996. return idx;
  997. /* KDETH.Offset */
  998. idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
  999. (__force u16)cpu_to_le16(req->koffset & 0xffff));
  1000. if (idx < 0)
  1001. return idx;
  1002. idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
  1003. (__force u16)cpu_to_le16(req->koffset >> 16));
  1004. if (idx < 0)
  1005. return idx;
  1006. if (req_opcode(req->info.ctrl) == EXPECTED) {
  1007. __le16 val;
  1008. tidval = req->tids[req->tididx];
  1009. /*
  1010. * If the offset puts us at the end of the current TID,
  1011. * advance everything.
  1012. */
  1013. if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
  1014. PAGE_SIZE)) {
  1015. req->tidoffset = 0;
  1016. /*
  1017. * Since we don't copy all the TIDs at once,
  1018. * we have to check again.
  1019. */
  1020. if (++req->tididx > req->n_tids - 1 ||
  1021. !req->tids[req->tididx])
  1022. return -EINVAL;
  1023. tidval = req->tids[req->tididx];
  1024. }
  1025. omfactor = ((EXP_TID_GET(tidval, LEN) *
  1026. PAGE_SIZE) >=
  1027. KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
  1028. KDETH_OM_SMALL_SHIFT;
  1029. /* KDETH.OM and KDETH.OFFSET (TID) */
  1030. idx = ahg_header_set(
  1031. ahg, idx, array_size, 7, 0, 16,
  1032. ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
  1033. ((req->tidoffset >> omfactor)
  1034. & 0x7fff)));
  1035. if (idx < 0)
  1036. return idx;
  1037. /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
  1038. val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
  1039. (EXP_TID_GET(tidval, IDX) & 0x3ff));
  1040. if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
  1041. val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
  1042. INTR) <<
  1043. AHG_KDETH_INTR_SHIFT));
  1044. } else {
  1045. val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
  1046. cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
  1047. cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
  1048. INTR) <<
  1049. AHG_KDETH_INTR_SHIFT));
  1050. }
  1051. idx = ahg_header_set(ahg, idx, array_size,
  1052. 7, 16, 14, (__force u16)val);
  1053. if (idx < 0)
  1054. return idx;
  1055. }
  1056. trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
  1057. req->info.comp_idx, req->sde->this_idx,
  1058. req->ahg_idx, ahg, idx, tidval);
  1059. sdma_txinit_ahg(&tx->txreq,
  1060. SDMA_TXREQ_F_USE_AHG,
  1061. datalen, req->ahg_idx, idx,
  1062. ahg, sizeof(req->hdr),
  1063. user_sdma_txreq_cb);
  1064. return idx;
  1065. }
  1066. /**
  1067. * user_sdma_txreq_cb() - SDMA tx request completion callback.
  1068. * @txreq: valid sdma tx request
  1069. * @status: success/failure of request
  1070. *
  1071. * Called when the SDMA progress state machine gets notification that
  1072. * the SDMA descriptors for this tx request have been processed by the
  1073. * DMA engine. Called in interrupt context.
  1074. * Only do work on completed sequences.
  1075. */
  1076. static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
  1077. {
  1078. struct user_sdma_txreq *tx =
  1079. container_of(txreq, struct user_sdma_txreq, txreq);
  1080. struct user_sdma_request *req;
  1081. struct hfi1_user_sdma_pkt_q *pq;
  1082. struct hfi1_user_sdma_comp_q *cq;
  1083. enum hfi1_sdma_comp_state state = COMPLETE;
  1084. if (!tx->req)
  1085. return;
  1086. req = tx->req;
  1087. pq = req->pq;
  1088. cq = req->cq;
  1089. if (status != SDMA_TXREQ_S_OK) {
  1090. SDMA_DBG(req, "SDMA completion with error %d",
  1091. status);
  1092. WRITE_ONCE(req->has_error, 1);
  1093. state = ERROR;
  1094. }
  1095. req->seqcomp = tx->seqnum;
  1096. kmem_cache_free(pq->txreq_cache, tx);
  1097. /* not the last packet of the request yet? Nothing more to do here */
  1098. if (req->seqcomp != req->info.npkts - 1)
  1099. return;
  1100. user_sdma_free_request(req);
  1101. set_comp_state(pq, cq, req->info.comp_idx, state, status);
  1102. pq_update(pq);
  1103. }
  1104. static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
  1105. {
  1106. if (atomic_dec_and_test(&pq->n_reqs))
  1107. wake_up(&pq->wait);
  1108. }
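/*
 * Release everything still attached to a request: clean and free any txreqs
 * left on req->txps, free the TID array, and clear the request's slot in the
 * in-use bitmap so the completion index can be reused.
 */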
  1109. static void user_sdma_free_request(struct user_sdma_request *req)
  1110. {
  1111. if (!list_empty(&req->txps)) {
  1112. struct sdma_txreq *t, *p;
  1113. list_for_each_entry_safe(t, p, &req->txps, list) {
  1114. struct user_sdma_txreq *tx =
  1115. container_of(t, struct user_sdma_txreq, txreq);
  1116. list_del_init(&t->list);
  1117. sdma_txclean(req->pq->dd, t);
  1118. kmem_cache_free(req->pq->txreq_cache, tx);
  1119. }
  1120. }
  1121. kfree(req->tids);
  1122. clear_bit(req->info.comp_idx, req->pq->req_in_use);
  1123. }
  1124. static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  1125. struct hfi1_user_sdma_comp_q *cq,
  1126. u16 idx, enum hfi1_sdma_comp_state state,
  1127. int ret)
  1128. {
  1129. if (state == ERROR)
  1130. cq->comps[idx].errcode = -ret;
  1131. smp_wmb(); /* make sure errcode is visible first */
  1132. cq->comps[idx].status = state;
  1133. trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
  1134. idx, state, ret);
  1135. }
  1136. static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  1137. unsigned int start, unsigned int npages)
  1138. {
  1139. hfi1_release_user_pages(mm, pages + start, npages, false);
  1140. kfree(pages);
  1141. }
  1142. static void free_system_node(struct sdma_mmu_node *node)
  1143. {
  1144. if (node->npages) {
  1145. unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
  1146. node->npages);
  1147. atomic_sub(node->npages, &node->pq->n_locked);
  1148. }
  1149. kfree(node);
  1150. }
  1151. /*
  1152. * Takes an additional kref on the returned rb_node to prevent the rb_node
  1153. * from being released until after rb_node is assigned to an SDMA descriptor
  1154. * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
  1155. * virtual address range for rb_node is invalidated between now and then.
  1156. */
  1157. static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
  1158. unsigned long start,
  1159. unsigned long end)
  1160. {
  1161. struct mmu_rb_node *rb_node;
  1162. unsigned long flags;
  1163. spin_lock_irqsave(&handler->lock, flags);
  1164. rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
  1165. if (!rb_node) {
  1166. spin_unlock_irqrestore(&handler->lock, flags);
  1167. return NULL;
  1168. }
  1169. /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
  1170. kref_get(&rb_node->refcount);
  1171. spin_unlock_irqrestore(&handler->lock, flags);
  1172. return container_of(rb_node, struct sdma_mmu_node, rb);
  1173. }
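/*
 * Pin npages of user memory starting at start_address and record the pinning
 * in @node. If the per-queue pinned-page limit would be exceeded, try to
 * evict cached entries first.
 */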
  1174. static int pin_system_pages(struct user_sdma_request *req,
  1175. uintptr_t start_address, size_t length,
  1176. struct sdma_mmu_node *node, int npages)
  1177. {
  1178. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  1179. int pinned, cleared;
  1180. struct page **pages;
  1181. pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
  1182. if (!pages)
  1183. return -ENOMEM;
  1184. retry:
  1185. if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
  1186. npages)) {
  1187. SDMA_DBG(req, "Evicting: nlocked %u npages %u",
  1188. atomic_read(&pq->n_locked), npages);
  1189. cleared = sdma_cache_evict(pq, npages);
  1190. if (cleared >= npages)
  1191. goto retry;
  1192. }
  1193. SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
  1194. start_address, node->npages, npages);
  1195. pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
  1196. pages);
  1197. if (pinned < 0) {
  1198. kfree(pages);
  1199. SDMA_DBG(req, "pinned %d", pinned);
  1200. return pinned;
  1201. }
  1202. if (pinned != npages) {
  1203. unpin_vector_pages(current->mm, pages, node->npages, pinned);
  1204. SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
  1205. return -EFAULT;
  1206. }
  1207. node->rb.addr = start_address;
  1208. node->rb.len = length;
  1209. node->pages = pages;
  1210. node->npages = npages;
  1211. atomic_add(pinned, &pq->n_locked);
  1212. SDMA_DBG(req, "done. pinned %d", pinned);
  1213. return 0;
  1214. }
  1215. /*
  1216. * kref refcount on *node_p will be 2 on successful addition: one kref from
  1217. * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
  1218. * released until after *node_p is assigned to an SDMA descriptor (struct
  1219. * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
  1220. * address range for *node_p is invalidated between now and then.
  1221. */
  1222. static int add_system_pinning(struct user_sdma_request *req,
  1223. struct sdma_mmu_node **node_p,
  1224. unsigned long start, unsigned long len)
  1225. {
  1226. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  1227. struct sdma_mmu_node *node;
  1228. int ret;
  1229. node = kzalloc(sizeof(*node), GFP_KERNEL);
  1230. if (!node)
  1231. return -ENOMEM;
  1232. /* First kref "moves" to mmu_rb_handler */
  1233. kref_init(&node->rb.refcount);
  1234. /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
  1235. kref_get(&node->rb.refcount);
  1236. node->pq = pq;
  1237. ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
  1238. if (ret == 0) {
  1239. ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
  1240. if (ret)
  1241. free_system_node(node);
  1242. else
  1243. *node_p = node;
  1244. return ret;
  1245. }
  1246. kfree(node);
  1247. return ret;
  1248. }
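/*
 * Find or create a pinned-page cache node covering the start of
 * [req_start, req_start + req_len): reuse an existing node whose range begins
 * at or before the start, otherwise pin a new node for the whole range or for
 * the gap in front of the first existing node. Retries when a racing insert
 * returns -EEXIST. The returned node carries an extra kref; see the comment
 * above find_system_node().
 */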
  1249. static int get_system_cache_entry(struct user_sdma_request *req,
  1250. struct sdma_mmu_node **node_p,
  1251. size_t req_start, size_t req_len)
  1252. {
  1253. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  1254. u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
  1255. u64 end = PFN_ALIGN(req_start + req_len);
  1256. struct mmu_rb_handler *handler = pq->handler;
  1257. int ret;
  1258. if ((end - start) == 0) {
  1259. SDMA_DBG(req,
  1260. "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
  1261. req_start, req_len, start, end);
  1262. return -EINVAL;
  1263. }
  1264. SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);
  1265. while (1) {
  1266. struct sdma_mmu_node *node =
  1267. find_system_node(handler, start, end);
  1268. u64 prepend_len = 0;
  1269. SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
  1270. if (!node) {
  1271. ret = add_system_pinning(req, node_p, start,
  1272. end - start);
  1273. if (ret == -EEXIST) {
  1274. /*
  1275. * Another execution context has inserted a
  1276. * conflicting entry first.
  1277. */
  1278. continue;
  1279. }
  1280. return ret;
  1281. }
  1282. if (node->rb.addr <= start) {
  1283. /*
  1284. * This entry covers at least part of the region. If it doesn't extend
  1285. * to the end, then this will be called again for the next segment.
  1286. */
  1287. *node_p = node;
  1288. return 0;
  1289. }
  1290. SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
  1291. node->rb.addr, kref_read(&node->rb.refcount));
  1292. prepend_len = node->rb.addr - start;
  1293. /*
  1294. * This node will not be returned; a new node will be
  1295. * prepended instead, so release this node's reference.
  1296. */
  1297. kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
  1298. /* Prepend a node to cover the beginning of the allocation */
  1299. ret = add_system_pinning(req, node_p, start, prepend_len);
  1300. if (ret == -EEXIST) {
  1301. /* Another execution context has inserted a conflicting entry first. */
  1302. continue;
  1303. }
  1304. return ret;
  1305. }
  1306. }
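/*
 * get/put callbacks handed to sdma_txadd_page(): they take and release a kref
 * on the cache node tied to the lifetime of the SDMA descriptor referencing
 * its pages, so the pinning stays valid while the DMA can still touch it.
 */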
  1307. static void sdma_mmu_rb_node_get(void *ctx)
  1308. {
  1309. struct mmu_rb_node *node = ctx;
  1310. kref_get(&node->refcount);
  1311. }
  1312. static void sdma_mmu_rb_node_put(void *ctx)
  1313. {
  1314. struct sdma_mmu_node *node = ctx;
  1315. kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
  1316. }
  1317. static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
  1318. struct user_sdma_txreq *tx,
  1319. struct sdma_mmu_node *cache_entry,
  1320. size_t start,
  1321. size_t from_this_cache_entry)
  1322. {
  1323. struct hfi1_user_sdma_pkt_q *pq = req->pq;
  1324. unsigned int page_offset;
  1325. unsigned int from_this_page;
  1326. size_t page_index;
  1327. void *ctx;
  1328. int ret;
  1329. /*
  1330. * Because the cache may be more fragmented than the memory that is being accessed,
  1331. * it's not strictly necessary to have a descriptor per cache entry.
  1332. */
  1333. while (from_this_cache_entry) {
  1334. page_index = PFN_DOWN(start - cache_entry->rb.addr);
  1335. if (page_index >= cache_entry->npages) {
  1336. SDMA_DBG(req,
  1337. "Request for page_index %zu >= cache_entry->npages %u",
  1338. page_index, cache_entry->npages);
  1339. return -EINVAL;
  1340. }
  1341. page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
  1342. from_this_page = PAGE_SIZE - page_offset;
  1343. if (from_this_page < from_this_cache_entry) {
  1344. ctx = NULL;
  1345. } else {
  1346. /*
  1347. * In the case they are equal, the next line has no practical effect,
  1348. * but it's better to do a register-to-register copy than a conditional
  1349. * branch.
  1350. */
  1351. from_this_page = from_this_cache_entry;
  1352. ctx = cache_entry;
  1353. }
  1354. ret = sdma_txadd_page(pq->dd, &tx->txreq,
  1355. cache_entry->pages[page_index],
  1356. page_offset, from_this_page,
  1357. ctx,
  1358. sdma_mmu_rb_node_get,
  1359. sdma_mmu_rb_node_put);
  1360. if (ret) {
  1361. /*
  1362. * When there's a failure, the entire request is freed by
  1363. * user_sdma_send_pkts().
  1364. */
  1365. SDMA_DBG(req,
  1366. "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
  1367. ret, page_index, page_offset, from_this_page);
  1368. return ret;
  1369. }
  1370. start += from_this_page;
  1371. from_this_cache_entry -= from_this_page;
  1372. }
  1373. return 0;
  1374. }
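/*
 * Map from_this_iovec bytes of this iovec into the packet: look up (or
 * create) the pinned-page cache entry covering the current offset, add page
 * descriptors for the overlapping bytes, and repeat until the requested
 * length has been consumed.
 */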
  1375. static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
  1376. struct user_sdma_txreq *tx,
  1377. struct user_sdma_iovec *iovec,
  1378. size_t from_this_iovec)
  1379. {
  1380. while (from_this_iovec > 0) {
  1381. struct sdma_mmu_node *cache_entry;
  1382. size_t from_this_cache_entry;
  1383. size_t start;
  1384. int ret;
  1385. start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
  1386. ret = get_system_cache_entry(req, &cache_entry, start,
  1387. from_this_iovec);
  1388. if (ret) {
  1389. SDMA_DBG(req, "pin system segment failed %d", ret);
  1390. return ret;
  1391. }
  1392. from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
  1393. if (from_this_cache_entry > from_this_iovec)
  1394. from_this_cache_entry = from_this_iovec;
  1395. ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
  1396. from_this_cache_entry);
  1397. /*
  1398. * Done adding cache_entry to zero or more sdma_desc. Can
  1399. * kref_put() the "safety" kref taken under
  1400. * get_system_cache_entry().
  1401. */
  1402. kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);
  1403. if (ret) {
  1404. SDMA_DBG(req, "add system segment failed %d", ret);
  1405. return ret;
  1406. }
  1407. iovec->offset += from_this_cache_entry;
  1408. from_this_iovec -= from_this_cache_entry;
  1409. }
  1410. return 0;
  1411. }
  1412. static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
  1413. struct user_sdma_txreq *tx,
  1414. struct user_sdma_iovec *iovec,
  1415. u32 *pkt_data_remaining)
  1416. {
  1417. size_t remaining_to_add = *pkt_data_remaining;
  1418. /*
  1419. * Walk through iovec entries, ensure the associated pages
  1420. * are pinned and mapped, add data to the packet until no more
  1421. * data remains to be added.
  1422. */
  1423. while (remaining_to_add > 0) {
  1424. struct user_sdma_iovec *cur_iovec;
  1425. size_t from_this_iovec;
  1426. int ret;
  1427. cur_iovec = iovec;
  1428. from_this_iovec = iovec->iov.iov_len - iovec->offset;
  1429. if (from_this_iovec > remaining_to_add) {
  1430. from_this_iovec = remaining_to_add;
  1431. } else {
  1432. /* The current iovec entry will be consumed by this pass. */
  1433. req->iov_idx++;
  1434. iovec++;
  1435. }
  1436. ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
  1437. from_this_iovec);
  1438. if (ret)
  1439. return ret;
  1440. remaining_to_add -= from_this_iovec;
  1441. }
  1442. *pkt_data_remaining = remaining_to_add;
  1443. return 0;
  1444. }
  1445. static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
  1446. unsigned long len)
  1447. {
  1448. return (bool)(node->addr == addr);
  1449. }
  1450. /*
  1451. * Return 1 to remove the node from the rb tree and call the remove op.
  1452. *
  1453. * Called with the rb tree lock held.
  1454. */
  1455. static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
  1456. void *evict_arg, bool *stop)
  1457. {
  1458. struct sdma_mmu_node *node =
  1459. container_of(mnode, struct sdma_mmu_node, rb);
  1460. struct evict_data *evict_data = evict_arg;
  1461. /* this node will be evicted, add its pages to our count */
  1462. evict_data->cleared += node->npages;
  1463. /* have enough pages been cleared? */
  1464. if (evict_data->cleared >= evict_data->target)
  1465. *stop = true;
  1466. return 1; /* remove this node */
  1467. }
  1468. static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
  1469. {
  1470. struct sdma_mmu_node *node =
  1471. container_of(mnode, struct sdma_mmu_node, rb);
  1472. free_system_node(node);
  1473. }