ib_rdma.c

/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/llist.h>

#include "rds_single_path.h"
#include "ib_mr.h"
#include "rds.h"

struct workqueue_struct *rds_ib_mr_wq;

struct rds_ib_dereg_odp_mr {
        struct work_struct work;
        struct ib_mr *mr;
};

static void rds_ib_odp_mr_worker(struct work_struct *work);

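/* Find the rds_ib_device that has @ipaddr bound to it and take a reference
 * on it.  The global device list is walked under RCU; on success the caller
 * must drop the reference with rds_ib_dev_put().
 */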
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_ipaddr *i_ipaddr;

        rcu_read_lock();
        list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
                list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                        if (i_ipaddr->ipaddr == ipaddr) {
                                refcount_inc(&rds_ibdev->refcount);
                                rcu_read_unlock();
                                return rds_ibdev;
                        }
                }
        }
        rcu_read_unlock();

        return NULL;
}

static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
        struct rds_ib_ipaddr *i_ipaddr;

        i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
        if (!i_ipaddr)
                return -ENOMEM;

        i_ipaddr->ipaddr = ipaddr;

        spin_lock_irq(&rds_ibdev->spinlock);
        list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
        spin_unlock_irq(&rds_ibdev->spinlock);

        return 0;
}

static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
        struct rds_ib_ipaddr *i_ipaddr;
        struct rds_ib_ipaddr *to_free = NULL;

        spin_lock_irq(&rds_ibdev->spinlock);
        list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                if (i_ipaddr->ipaddr == ipaddr) {
                        list_del_rcu(&i_ipaddr->list);
                        to_free = i_ipaddr;
                        break;
                }
        }
        spin_unlock_irq(&rds_ibdev->spinlock);

        if (to_free)
                kfree_rcu(to_free, rcu);
}

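/* Bind an IP address to @rds_ibdev, removing it from whichever device it was
 * previously bound to.  Only the low 32 bits of @ipaddr are used, since the
 * address-to-device mapping is keyed by the IPv4 (or IPv4-mapped) address.
 */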
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
                         struct in6_addr *ipaddr)
{
        struct rds_ib_device *rds_ibdev_old;

        rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
        if (!rds_ibdev_old)
                return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);

        if (rds_ibdev_old != rds_ibdev) {
                rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
                rds_ib_dev_put(rds_ibdev_old);
                return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
        }
        rds_ib_dev_put(rds_ibdev_old);

        return 0;
}

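/* Move a connection from the global ib_nodev_conns list onto this device's
 * conn_list and take a device reference on its behalf.
 */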
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
        struct rds_ib_connection *ic = conn->c_transport_data;

        /* conn was previously on the nodev_conns_list */
        spin_lock_irq(&ib_nodev_conns_lock);
        BUG_ON(list_empty(&ib_nodev_conns));
        BUG_ON(list_empty(&ic->ib_node));
        list_del(&ic->ib_node);

        spin_lock(&rds_ibdev->spinlock);
        list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
        spin_unlock(&rds_ibdev->spinlock);
        spin_unlock_irq(&ib_nodev_conns_lock);

        ic->rds_ibdev = rds_ibdev;
        refcount_inc(&rds_ibdev->refcount);
}

void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
        struct rds_ib_connection *ic = conn->c_transport_data;

        /* place conn on nodev_conns_list */
        spin_lock(&ib_nodev_conns_lock);

        spin_lock_irq(&rds_ibdev->spinlock);
        BUG_ON(list_empty(&ic->ib_node));
        list_del(&ic->ib_node);
        spin_unlock_irq(&rds_ibdev->spinlock);

        list_add_tail(&ic->ib_node, &ib_nodev_conns);

        spin_unlock(&ib_nodev_conns_lock);

        ic->rds_ibdev = NULL;
        rds_ib_dev_put(rds_ibdev);
}

void rds_ib_destroy_nodev_conns(void)
{
        struct rds_ib_connection *ic, *_ic;
        LIST_HEAD(tmp_list);

        /* avoid calling conn_destroy with irqs off */
        spin_lock_irq(&ib_nodev_conns_lock);
        list_splice(&ib_nodev_conns, &tmp_list);
        spin_unlock_irq(&ib_nodev_conns_lock);

        list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
                rds_conn_destroy(ic->conn);
}

void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
        struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;

        iinfo->rdma_mr_max = pool_1m->max_items;
        iinfo->rdma_mr_size = pool_1m->max_pages;
}

#if IS_ENABLED(CONFIG_IPV6)
void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
                         struct rds6_info_rdma_connection *iinfo6)
{
        struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;

        iinfo6->rdma_mr_max = pool_1m->max_items;
        iinfo6->rdma_mr_size = pool_1m->max_pages;
}
#endif

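/* Pop one MR off the pool's clean_list, if any are available, and account
 * for the reuse in the per-pool statistics.
 */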
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
        struct rds_ib_mr *ibmr = NULL;
        struct llist_node *ret;
        unsigned long flags;

        spin_lock_irqsave(&pool->clean_lock, flags);
        ret = llist_del_first(&pool->clean_list);
        spin_unlock_irqrestore(&pool->clean_lock, flags);
        if (ret) {
                ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
                if (pool->pool_type == RDS_IB_MR_8K_POOL)
                        rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
                else
                        rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
        }

        return ibmr;
}

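/* DMA-sync the MR's scatterlist for the CPU or the device, depending on the
 * requested direction.  ODP MRs are skipped; they have no DMA-mapped
 * scatterlist to sync.
 */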
void rds_ib_sync_mr(void *trans_private, int direction)
{
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;

        if (ibmr->odp)
                return;

        switch (direction) {
        case DMA_FROM_DEVICE:
                ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
                                       ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
                break;
        case DMA_TO_DEVICE:
                ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
                                          ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
                break;
        }
}

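/* Unmap the MR's scatterlist and release the pinned pages.  Pages are
 * dirtied before release because we cannot tell a read-only MR from a
 * read-write one (see the FIXME below).
 */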
void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
        struct rds_ib_device *rds_ibdev = ibmr->device;

        if (ibmr->sg_dma_len) {
                ib_dma_unmap_sg(rds_ibdev->dev,
                                ibmr->sg, ibmr->sg_len,
                                DMA_BIDIRECTIONAL);
                ibmr->sg_dma_len = 0;
        }

        /* Release the s/g list */
        if (ibmr->sg_len) {
                unsigned int i;

                for (i = 0; i < ibmr->sg_len; ++i) {
                        struct page *page = sg_page(&ibmr->sg[i]);

                        /* FIXME we need a way to tell a r/w MR
                         * from a r/o MR */
                        WARN_ON(!page->mapping && irqs_disabled());
                        set_page_dirty(page);
                        put_page(page);
                }
                kfree(ibmr->sg);

                ibmr->sg = NULL;
                ibmr->sg_len = 0;
        }
}

void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
        unsigned int pinned = ibmr->sg_len;

        __rds_ib_teardown_mr(ibmr);
        if (pinned) {
                struct rds_ib_mr_pool *pool = ibmr->pool;

                atomic_sub(pinned, &pool->free_pinned);
        }
}

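/* A full flush (free_all) must reclaim every MR currently in the pool, so
 * the goal is the whole item count; a normal flush has no freeing goal.
 */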
static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
{
        unsigned int item_count;

        item_count = atomic_read(&pool->item_count);
        if (free_all)
                return item_count;

        return 0;
}

/*
 * given an llist of mrs, put them all into the list_head for more processing
 */
static unsigned int llist_append_to_list(struct llist_head *llist,
                                         struct list_head *list)
{
        struct rds_ib_mr *ibmr;
        struct llist_node *node;
        struct llist_node *next;
        unsigned int count = 0;

        node = llist_del_all(llist);
        while (node) {
                next = node->next;
                ibmr = llist_entry(node, struct rds_ib_mr, llnode);
                list_add_tail(&ibmr->unmap_list, list);
                node = next;
                count++;
        }
        return count;
}

/*
 * this takes a list head of mrs and chains them into a single run of llist
 * nodes, returning the head and tail so the whole run can be spliced onto a
 * pool llist with one llist_add_batch() call.
 */
static void list_to_llist_nodes(struct list_head *list,
                                struct llist_node **nodes_head,
                                struct llist_node **nodes_tail)
{
        struct rds_ib_mr *ibmr;
        struct llist_node *cur = NULL;
        struct llist_node **next = nodes_head;

        list_for_each_entry(ibmr, list, unmap_list) {
                cur = &ibmr->llnode;
                *next = cur;
                next = &cur->next;
        }
        *next = NULL;
        *nodes_tail = cur;
}

/*
 * Flush our pool of MRs.
 * At a minimum, all currently unused MRs are unmapped.
 * If the number of MRs allocated exceeds the limit, we also try
 * to free as many MRs as needed to get back to this limit.
 */
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                         int free_all, struct rds_ib_mr **ibmr_ret)
{
        struct rds_ib_mr *ibmr;
        struct llist_node *clean_nodes;
        struct llist_node *clean_tail;
        LIST_HEAD(unmap_list);
        unsigned long unpinned = 0;
        unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;

        if (pool->pool_type == RDS_IB_MR_8K_POOL)
                rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
        else
                rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);

        if (ibmr_ret) {
                DEFINE_WAIT(wait);

                while (!mutex_trylock(&pool->flush_lock)) {
                        ibmr = rds_ib_reuse_mr(pool);
                        if (ibmr) {
                                *ibmr_ret = ibmr;
                                finish_wait(&pool->flush_wait, &wait);
                                goto out_nolock;
                        }

                        prepare_to_wait(&pool->flush_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (llist_empty(&pool->clean_list))
                                schedule();

                        ibmr = rds_ib_reuse_mr(pool);
                        if (ibmr) {
                                *ibmr_ret = ibmr;
                                finish_wait(&pool->flush_wait, &wait);
                                goto out_nolock;
                        }
                }
                finish_wait(&pool->flush_wait, &wait);
        } else
                mutex_lock(&pool->flush_lock);

        if (ibmr_ret) {
                ibmr = rds_ib_reuse_mr(pool);
                if (ibmr) {
                        *ibmr_ret = ibmr;
                        goto out;
                }
        }

        /* Get the list of all MRs to be dropped. Ordering matters -
         * we want to put drop_list ahead of free_list.
         */
        dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
        dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
        if (free_all) {
                unsigned long flags;

                spin_lock_irqsave(&pool->clean_lock, flags);
                llist_append_to_list(&pool->clean_list, &unmap_list);
                spin_unlock_irqrestore(&pool->clean_lock, flags);
        }

        free_goal = rds_ib_flush_goal(pool, free_all);

        if (list_empty(&unmap_list))
                goto out;

        rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);

        if (!list_empty(&unmap_list)) {
                unsigned long flags;

                list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
                if (ibmr_ret) {
                        *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
                        clean_nodes = clean_nodes->next;
                }
                /* more than one entry in llist nodes */
                if (clean_nodes) {
                        spin_lock_irqsave(&pool->clean_lock, flags);
                        llist_add_batch(clean_nodes, clean_tail,
                                        &pool->clean_list);
                        spin_unlock_irqrestore(&pool->clean_lock, flags);
                }
        }

        atomic_sub(unpinned, &pool->free_pinned);
        atomic_sub(dirty_to_clean, &pool->dirty_count);
        atomic_sub(nfreed, &pool->item_count);

out:
        mutex_unlock(&pool->flush_lock);
        if (waitqueue_active(&pool->flush_wait))
                wake_up(&pool->flush_wait);
out_nolock:
        return 0;
}

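/* Get an MR for a new mapping: reuse a clean one if possible, otherwise
 * claim a slot in the pool.  If the pool is full, flush it and retry a
 * couple of times before reporting depletion.
 */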
struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
{
        struct rds_ib_mr *ibmr = NULL;
        int iter = 0;

        while (1) {
                ibmr = rds_ib_reuse_mr(pool);
                if (ibmr)
                        return ibmr;

                if (atomic_inc_return(&pool->item_count) <= pool->max_items)
                        break;

                atomic_dec(&pool->item_count);

                if (++iter > 2) {
                        if (pool->pool_type == RDS_IB_MR_8K_POOL)
                                rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
                        else
                                rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
                        break;
                }

                /* We do have some empty MRs. Flush them out. */
                if (pool->pool_type == RDS_IB_MR_8K_POOL)
                        rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
                else
                        rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);

                rds_ib_flush_mr_pool(pool, 0, &ibmr);
                if (ibmr)
                        return ibmr;
        }

        return NULL;
}

static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
        struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);

        rds_ib_flush_mr_pool(pool, 0, NULL);
}

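/* Release an MR handed out by rds_ib_get_mr().  ODP MRs are deregistered
 * from the MR workqueue; pool MRs go back onto the pool's free list, and a
 * pool flush is scheduled once too many pinned pages or dirty MRs have
 * accumulated.
 */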
void rds_ib_free_mr(void *trans_private, int invalidate)
{
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_mr_pool *pool = ibmr->pool;
        struct rds_ib_device *rds_ibdev = ibmr->device;

        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);

        if (ibmr->odp) {
                /* An MR created and marked as use_once.  We use delayed work,
                 * because there is a chance that we are in interrupt context
                 * and cannot call ib_dereg_mr() directly.
                 */
                INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
                queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
                return;
        }

        /* Return it to the pool's free list */
        rds_ib_free_frmr_list(ibmr);

        atomic_add(ibmr->sg_len, &pool->free_pinned);
        atomic_inc(&pool->dirty_count);

        /* If we've pinned too many pages, request a flush */
        if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
            atomic_read(&pool->dirty_count) >= pool->max_items / 5)
                queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);

        if (invalidate) {
                if (likely(!in_interrupt())) {
                        rds_ib_flush_mr_pool(pool, 0, NULL);
                } else {
                        /* We get here if the user created an MR marked
                         * as both use_once and invalidate.
                         */
                        queue_delayed_work(rds_ib_mr_wq,
                                           &pool->flush_worker, 10);
                }
        }

        rds_ib_dev_put(rds_ibdev);
}

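/* Flush the 8K and 1M MR pools of every IB device known to RDS. */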
void rds_ib_flush_mrs(void)
{
        struct rds_ib_device *rds_ibdev;

        down_read(&rds_ib_devices_lock);
        list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
                if (rds_ibdev->mr_8k_pool)
                        rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);

                if (rds_ibdev->mr_1m_pool)
                        rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
        }
        up_read(&rds_ib_devices_lock);
}

u32 rds_ib_get_lkey(void *trans_private)
{
        struct rds_ib_mr *ibmr = trans_private;

        return ibmr->u.mr->lkey;
}

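/* Register a memory region for RDMA.  ODP requests are registered directly
 * with ib_reg_user_mr() and prefetched via ib_advise_mr(); everything else
 * is mapped through a fast-registration MR taken from the device's pools by
 * rds_ib_reg_frmr().  Returns the MR handle or an ERR_PTR() on failure.
 */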
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                    struct rds_sock *rs, u32 *key_ret,
                    struct rds_connection *conn,
                    u64 start, u64 length, int need_odp)
{
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_mr *ibmr = NULL;
        struct rds_ib_connection *ic = NULL;
        int ret;

        rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
        if (!rds_ibdev) {
                ret = -ENODEV;
                goto out;
        }

        if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
                u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
                int access_flags =
                        (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
                         IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
                         IB_ACCESS_ON_DEMAND);
                struct ib_sge sge = {};
                struct ib_mr *ib_mr;

                if (!rds_ibdev->odp_capable) {
                        ret = -EOPNOTSUPP;
                        goto out;
                }

                ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
                                       access_flags);

                if (IS_ERR(ib_mr)) {
                        ret = PTR_ERR(ib_mr);
                        rdsdebug("ib_reg_user_mr returned %d\n", ret);
                        goto out;
                }
                if (key_ret)
                        *key_ret = ib_mr->rkey;

                ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
                if (!ibmr) {
                        ib_dereg_mr(ib_mr);
                        ret = -ENOMEM;
                        goto out;
                }
                ibmr->u.mr = ib_mr;
                ibmr->odp = 1;

                sge.addr = virt_addr;
                sge.length = length;
                sge.lkey = ib_mr->lkey;

                ib_advise_mr(rds_ibdev->pd,
                             IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
                             IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
                return ibmr;
        }

        if (conn)
                ic = conn->c_transport_data;

        if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
                ret = -ENODEV;
                goto out;
        }

        ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
        if (IS_ERR(ibmr)) {
                ret = PTR_ERR(ibmr);
                pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
        } else {
                return ibmr;
        }

out:
        if (rds_ibdev)
                rds_ib_dev_put(rds_ibdev);

        return ERR_PTR(ret);
}

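/* Tear down an MR pool: cancel any pending flush work, force a full flush,
 * and free the pool.  Both the item and pinned-page counters should be zero
 * by then.
 */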
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
        cancel_delayed_work_sync(&pool->flush_worker);
        rds_ib_flush_mr_pool(pool, 1, NULL);
        WARN_ON(atomic_read(&pool->item_count));
        WARN_ON(atomic_read(&pool->free_pinned));
        kfree(pool);
}

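/* Allocate and initialize an 8K or 1M MR pool for @rds_ibdev.  The pool
 * limits come from the device's max_8k_mrs/max_1m_mrs counts and the
 * per-pool message size.
 */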
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
                                             int pool_type)
{
        struct rds_ib_mr_pool *pool;

        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        pool->pool_type = pool_type;
        init_llist_head(&pool->free_list);
        init_llist_head(&pool->drop_list);
        init_llist_head(&pool->clean_list);
        spin_lock_init(&pool->clean_lock);
        mutex_init(&pool->flush_lock);
        init_waitqueue_head(&pool->flush_wait);
        INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);

        if (pool_type == RDS_IB_MR_1M_POOL) {
                /* +1 allows for unaligned MRs */
                pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
                pool->max_items = rds_ibdev->max_1m_mrs;
        } else {
                /* pool_type == RDS_IB_MR_8K_POOL */
                pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
                pool->max_items = rds_ibdev->max_8k_mrs;
        }

        pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
        pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;

        return pool;
}

int rds_ib_mr_init(void)
{
        rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0);
        if (!rds_ib_mr_wq)
                return -ENOMEM;
        return 0;
}

/* By the time this is called all the IB devices should have been torn down and
 * had their pools freed.  As each pool is freed its work struct is waited on,
 * so the pool flushing work queue should be idle by the time we get here.
 */
void rds_ib_mr_exit(void)
{
        destroy_workqueue(rds_ib_mr_wq);
}

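/* Deferred deregistration of a use_once ODP MR, run from the MR workqueue so
 * that ib_dereg_mr() is never called from interrupt context.
 */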
static void rds_ib_odp_mr_worker(struct work_struct *work)
{
        struct rds_ib_mr *ibmr;

        ibmr = container_of(work, struct rds_ib_mr, work.work);
        ib_dereg_mr(ibmr->u.mr);
        kfree(ibmr);
}