siw_mem.c 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
  2. /* Authors: Bernard Metzler <[email protected]> */
  3. /* Copyright (c) 2008-2019, IBM Corporation */
  4. #include <linux/gfp.h>
  5. #include <rdma/ib_verbs.h>
  6. #include <linux/dma-mapping.h>
  7. #include <linux/slab.h>
  8. #include <linux/sched/mm.h>
  9. #include <linux/resource.h>
  10. #include "siw.h"
  11. #include "siw_mem.h"
  12. /*
  13. * Stag lookup is based on its index part only (24 bits).
  14. * The code avoids special Stag of zero and tries to randomize
  15. * STag values between 1 and SIW_STAG_MAX_INDEX.
  16. */
  17. int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
  18. {
  19. struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
  20. u32 id, next;
  21. get_random_bytes(&next, 4);
  22. next &= 0x00ffffff;
  23. if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
  24. GFP_KERNEL) < 0)
  25. return -ENOMEM;
  26. /* Set the STag index part */
  27. m->stag = id << 8;
  28. siw_dbg_mem(m, "new MEM object\n");
  29. return 0;
  30. }
  31. /*
  32. * siw_mem_id2obj()
  33. *
  34. * resolves memory from stag given by id. might be called from:
  35. * o process context before sending out of sgl, or
  36. * o in softirq when resolving target memory
  37. */
  38. struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
  39. {
  40. struct siw_mem *mem;
  41. rcu_read_lock();
  42. mem = xa_load(&sdev->mem_xa, stag_index);
  43. if (likely(mem && kref_get_unless_zero(&mem->ref))) {
  44. rcu_read_unlock();
  45. return mem;
  46. }
  47. rcu_read_unlock();
  48. return NULL;
  49. }
  50. static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
  51. bool dirty)
  52. {
  53. unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
  54. }
  55. void siw_umem_release(struct siw_umem *umem, bool dirty)
  56. {
  57. struct mm_struct *mm_s = umem->owning_mm;
  58. int i, num_pages = umem->num_pages;
  59. for (i = 0; num_pages; i++) {
  60. int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
  61. siw_free_plist(&umem->page_chunk[i], to_free,
  62. umem->writable && dirty);
  63. kfree(umem->page_chunk[i].plist);
  64. num_pages -= to_free;
  65. }
  66. atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
  67. mmdrop(mm_s);
  68. kfree(umem->page_chunk);
  69. kfree(umem);
  70. }
  71. int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
  72. u64 start, u64 len, int rights)
  73. {
  74. struct siw_device *sdev = to_siw_dev(pd->device);
  75. struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
  76. struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
  77. u32 id, next;
  78. if (!mem)
  79. return -ENOMEM;
  80. mem->mem_obj = mem_obj;
  81. mem->stag_valid = 0;
  82. mem->sdev = sdev;
  83. mem->va = start;
  84. mem->len = len;
  85. mem->pd = pd;
  86. mem->perms = rights & IWARP_ACCESS_MASK;
  87. kref_init(&mem->ref);
  88. get_random_bytes(&next, 4);
  89. next &= 0x00ffffff;
  90. if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
  91. GFP_KERNEL) < 0) {
  92. kfree(mem);
  93. return -ENOMEM;
  94. }
  95. mr->mem = mem;
  96. /* Set the STag index part */
  97. mem->stag = id << 8;
  98. mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
  99. return 0;
  100. }
  101. void siw_mr_drop_mem(struct siw_mr *mr)
  102. {
  103. struct siw_mem *mem = mr->mem, *found;
  104. mem->stag_valid = 0;
  105. /* make STag invalid visible asap */
  106. smp_mb();
  107. found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
  108. WARN_ON(found != mem);
  109. siw_mem_put(mem);
  110. }
  111. void siw_free_mem(struct kref *ref)
  112. {
  113. struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
  114. siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
  115. if (!mem->is_mw && mem->mem_obj) {
  116. if (mem->is_pbl == 0)
  117. siw_umem_release(mem->umem, true);
  118. else
  119. kfree(mem->pbl);
  120. }
  121. kfree(mem);
  122. }
  123. /*
  124. * siw_check_mem()
  125. *
  126. * Check protection domain, STAG state, access permissions and
  127. * address range for memory object.
  128. *
  129. * @pd: Protection Domain memory should belong to
  130. * @mem: memory to be checked
  131. * @addr: starting addr of mem
  132. * @perms: requested access permissions
  133. * @len: len of memory interval to be checked
  134. *
  135. */
  136. int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
  137. enum ib_access_flags perms, int len)
  138. {
  139. if (!mem->stag_valid) {
  140. siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
  141. return -E_STAG_INVALID;
  142. }
  143. if (mem->pd != pd) {
  144. siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
  145. return -E_PD_MISMATCH;
  146. }
  147. /*
  148. * check access permissions
  149. */
  150. if ((mem->perms & perms) < perms) {
  151. siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
  152. mem->perms, perms);
  153. return -E_ACCESS_PERM;
  154. }
  155. /*
  156. * Check if access falls into valid memory interval.
  157. */
  158. if (addr < mem->va || addr + len > mem->va + mem->len) {
  159. siw_dbg_pd(pd, "MEM interval len %d\n", len);
  160. siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
  161. (void *)(uintptr_t)addr,
  162. (void *)(uintptr_t)(addr + len));
  163. siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
  164. (void *)(uintptr_t)mem->va,
  165. (void *)(uintptr_t)(mem->va + mem->len),
  166. mem->stag);
  167. return -E_BASE_BOUNDS;
  168. }
  169. return E_ACCESS_OK;
  170. }
  171. /*
  172. * siw_check_sge()
  173. *
  174. * Check SGE for access rights in given interval
  175. *
  176. * @pd: Protection Domain memory should belong to
  177. * @sge: SGE to be checked
  178. * @mem: location of memory reference within array
  179. * @perms: requested access permissions
  180. * @off: starting offset in SGE
  181. * @len: len of memory interval to be checked
  182. *
  183. * NOTE: Function references SGE's memory object (mem->obj)
  184. * if not yet done. New reference is kept if check went ok and
  185. * released if check failed. If mem->obj is already valid, no new
  186. * lookup is being done and mem is not released it check fails.
  187. */
  188. int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
  189. enum ib_access_flags perms, u32 off, int len)
  190. {
  191. struct siw_device *sdev = to_siw_dev(pd->device);
  192. struct siw_mem *new = NULL;
  193. int rv = E_ACCESS_OK;
  194. if (len + off > sge->length) {
  195. rv = -E_BASE_BOUNDS;
  196. goto fail;
  197. }
  198. if (*mem == NULL) {
  199. new = siw_mem_id2obj(sdev, sge->lkey >> 8);
  200. if (unlikely(!new)) {
  201. siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
  202. rv = -E_STAG_INVALID;
  203. goto fail;
  204. }
  205. *mem = new;
  206. }
  207. /* Check if user re-registered with different STag key */
  208. if (unlikely((*mem)->stag != sge->lkey)) {
  209. siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
  210. rv = -E_STAG_INVALID;
  211. goto fail;
  212. }
  213. rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
  214. if (unlikely(rv))
  215. goto fail;
  216. return 0;
  217. fail:
  218. if (new) {
  219. *mem = NULL;
  220. siw_mem_put(new);
  221. }
  222. return rv;
  223. }
  224. void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
  225. {
  226. switch (op) {
  227. case SIW_OP_SEND:
  228. case SIW_OP_WRITE:
  229. case SIW_OP_SEND_WITH_IMM:
  230. case SIW_OP_SEND_REMOTE_INV:
  231. case SIW_OP_READ:
  232. case SIW_OP_READ_LOCAL_INV:
  233. if (!(wqe->sqe.flags & SIW_WQE_INLINE))
  234. siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
  235. break;
  236. case SIW_OP_RECEIVE:
  237. siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
  238. break;
  239. case SIW_OP_READ_RESPONSE:
  240. siw_unref_mem_sgl(wqe->mem, 1);
  241. break;
  242. default:
  243. /*
  244. * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
  245. * do not hold memory references
  246. */
  247. break;
  248. }
  249. }
  250. int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
  251. {
  252. struct siw_device *sdev = to_siw_dev(pd->device);
  253. struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
  254. int rv = 0;
  255. if (unlikely(!mem)) {
  256. siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
  257. return -EINVAL;
  258. }
  259. if (unlikely(mem->pd != pd)) {
  260. siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
  261. rv = -EACCES;
  262. goto out;
  263. }
  264. /*
  265. * Per RDMA verbs definition, an STag may already be in invalid
  266. * state if invalidation is requested. So no state check here.
  267. */
  268. mem->stag_valid = 0;
  269. siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
  270. out:
  271. siw_mem_put(mem);
  272. return rv;
  273. }
  274. /*
  275. * Gets physical address backed by PBL element. Address is referenced
  276. * by linear byte offset into list of variably sized PB elements.
  277. * Optionally, provides remaining len within current element, and
  278. * current PBL index for later resume at same element.
  279. */
  280. dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
  281. {
  282. int i = idx ? *idx : 0;
  283. while (i < pbl->num_buf) {
  284. struct siw_pble *pble = &pbl->pbe[i];
  285. if (pble->pbl_off + pble->size > off) {
  286. u64 pble_off = off - pble->pbl_off;
  287. if (len)
  288. *len = pble->size - pble_off;
  289. if (idx)
  290. *idx = i;
  291. return pble->addr + pble_off;
  292. }
  293. i++;
  294. }
  295. if (len)
  296. *len = 0;
  297. return 0;
  298. }
  299. struct siw_pbl *siw_pbl_alloc(u32 num_buf)
  300. {
  301. struct siw_pbl *pbl;
  302. if (num_buf == 0)
  303. return ERR_PTR(-EINVAL);
  304. pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL);
  305. if (!pbl)
  306. return ERR_PTR(-ENOMEM);
  307. pbl->max_buf = num_buf;
  308. return pbl;
  309. }
  310. struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
  311. {
  312. struct siw_umem *umem;
  313. struct mm_struct *mm_s;
  314. u64 first_page_va;
  315. unsigned long mlock_limit;
  316. unsigned int foll_flags = FOLL_WRITE;
  317. int num_pages, num_chunks, i, rv = 0;
  318. if (!can_do_mlock())
  319. return ERR_PTR(-EPERM);
  320. if (!len)
  321. return ERR_PTR(-EINVAL);
  322. first_page_va = start & PAGE_MASK;
  323. num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
  324. num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
  325. umem = kzalloc(sizeof(*umem), GFP_KERNEL);
  326. if (!umem)
  327. return ERR_PTR(-ENOMEM);
  328. mm_s = current->mm;
  329. umem->owning_mm = mm_s;
  330. umem->writable = writable;
  331. mmgrab(mm_s);
  332. if (!writable)
  333. foll_flags |= FOLL_FORCE;
  334. mmap_read_lock(mm_s);
  335. mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  336. if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) {
  337. rv = -ENOMEM;
  338. goto out_sem_up;
  339. }
  340. umem->fp_addr = first_page_va;
  341. umem->page_chunk =
  342. kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
  343. if (!umem->page_chunk) {
  344. rv = -ENOMEM;
  345. goto out_sem_up;
  346. }
  347. for (i = 0; num_pages; i++) {
  348. int nents = min_t(int, num_pages, PAGES_PER_CHUNK);
  349. struct page **plist =
  350. kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
  351. if (!plist) {
  352. rv = -ENOMEM;
  353. goto out_sem_up;
  354. }
  355. umem->page_chunk[i].plist = plist;
  356. while (nents) {
  357. rv = pin_user_pages(first_page_va, nents,
  358. foll_flags | FOLL_LONGTERM,
  359. plist, NULL);
  360. if (rv < 0)
  361. goto out_sem_up;
  362. umem->num_pages += rv;
  363. first_page_va += rv * PAGE_SIZE;
  364. plist += rv;
  365. nents -= rv;
  366. num_pages -= rv;
  367. }
  368. }
  369. out_sem_up:
  370. mmap_read_unlock(mm_s);
  371. if (rv > 0)
  372. return umem;
  373. /* Adjust accounting for pages not pinned */
  374. if (num_pages)
  375. atomic64_sub(num_pages, &mm_s->pinned_vm);
  376. siw_umem_release(umem, false);
  377. return ERR_PTR(rv);
  378. }