/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2013 Cisco Systems.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/iommu.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <rdma/ib_verbs.h>

#include "usnic_log.h"
#include "usnic_uiom.h"
#include "usnic_uiom_interval_tree.h"
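
/*
 * Maximum number of scatterlist entries in the page_list[] of a
 * struct usnic_uiom_chunk such that the whole chunk still fits
 * within a single page.
 */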
#define USNIC_UIOM_PAGE_CHUNK						\
	((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list))	/\
	((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] -	\
	(void *) &((struct usnic_uiom_chunk *) 0)->page_list[0]))
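
/*
 * IOMMU fault handler for the PD's domain: log the faulting device,
 * domain, IOVA and fault flags, and report the fault as unhandled.
 */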
static int usnic_uiom_dma_fault(struct iommu_domain *domain,
				struct device *dev,
				unsigned long iova, int flags,
				void *token)
{
	usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n",
		dev_name(dev),
		domain, iova, flags);
	return -ENOSYS;
}
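
/*
 * Unpin every page referenced by the chunk list, optionally marking
 * the pages dirty, and free the chunks themselves.
 */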
static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
{
	struct usnic_uiom_chunk *chunk, *tmp;
	struct page *page;
	struct scatterlist *sg;
	int i;
	dma_addr_t pa;

	list_for_each_entry_safe(chunk, tmp, chunk_list, list) {
		for_each_sg(chunk->page_list, sg, chunk->nents, i) {
			page = sg_page(sg);
			pa = sg_phys(sg);
			unpin_user_pages_dirty_lock(&page, 1, dirty);
			usnic_dbg("pa: %pa\n", &pa);
		}
		kfree(chunk);
	}
}
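
/*
 * Pin the user pages backing [addr, addr + size), charge them against
 * RLIMIT_MEMLOCK via mm->pinned_vm, and record them as scatterlist
 * chunks on uiomr->chunk_list.  On success a reference to the owning
 * mm is taken; on failure all pages pinned so far are released.
 */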
static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
				int dmasync, struct usnic_uiom_reg *uiomr)
{
	struct list_head *chunk_list = &uiomr->chunk_list;
	struct page **page_list;
	struct scatterlist *sg;
	struct usnic_uiom_chunk *chunk;
	unsigned long locked;
	unsigned long lock_limit;
	unsigned long cur_base;
	unsigned long npages;
	int ret;
	int off;
	int i;
	dma_addr_t pa;
	unsigned int gup_flags;
	struct mm_struct *mm;

	/*
	 * If the combination of the addr and size requested for this memory
	 * region causes an integer overflow, return error.
	 */
	if (((addr + size) < addr) || PAGE_ALIGN(addr + size) < (addr + size))
		return -EINVAL;

	if (!size)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	INIT_LIST_HEAD(chunk_list);

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;

	uiomr->owning_mm = mm = current->mm;
	mmap_read_lock(mm);

	locked = atomic64_add_return(npages, &current->mm->pinned_vm);
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		ret = -ENOMEM;
		goto out;
	}

	gup_flags = FOLL_WRITE;
	gup_flags |= (writable) ? 0 : FOLL_FORCE;
	cur_base = addr & PAGE_MASK;
	ret = 0;

	while (npages) {
		ret = pin_user_pages(cur_base,
				     min_t(unsigned long, npages,
				     PAGE_SIZE / sizeof(struct page *)),
				     gup_flags | FOLL_LONGTERM,
				     page_list, NULL);

		if (ret < 0)
			goto out;

		npages -= ret;
		off = 0;

		while (ret) {
			chunk = kmalloc(struct_size(chunk, page_list,
					min_t(int, ret, USNIC_UIOM_PAGE_CHUNK)),
					GFP_KERNEL);
			if (!chunk) {
				ret = -ENOMEM;
				goto out;
			}

			chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK);
			sg_init_table(chunk->page_list, chunk->nents);
			for_each_sg(chunk->page_list, sg, chunk->nents, i) {
				sg_set_page(sg, page_list[i + off],
						PAGE_SIZE, 0);
				pa = sg_phys(sg);
				usnic_dbg("va: 0x%lx pa: %pa\n",
						cur_base + i*PAGE_SIZE, &pa);
			}
			cur_base += chunk->nents * PAGE_SIZE;
			ret -= chunk->nents;
			off += chunk->nents;
			list_add_tail(&chunk->list, chunk_list);
		}

		ret = 0;
	}

out:
	if (ret < 0) {
		usnic_uiom_put_pages(chunk_list, 0);
		atomic64_sub(npages, &current->mm->pinned_vm);
	} else
		mmgrab(uiomr->owning_mm);

	mmap_read_unlock(mm);
	free_page((unsigned long) page_list);
	return ret;
}
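
/*
 * Remove the IOMMU mappings that cover each interval in the list,
 * one page at a time.
 */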
static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals,
						struct usnic_uiom_pd *pd)
{
	struct usnic_uiom_interval_node *interval, *tmp;
	long unsigned va, size;

	list_for_each_entry_safe(interval, tmp, intervals, link) {
		va = interval->start << PAGE_SHIFT;
		size = ((interval->last - interval->start) + 1) << PAGE_SHIFT;
		while (size > 0) {
			/* Workaround for RH 970401 */
			usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE);
			iommu_unmap(pd->domain, va, PAGE_SIZE);
			va += PAGE_SIZE;
			size -= PAGE_SIZE;
		}
	}
}
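
/*
 * Tear down a registration: remove its intervals from the PD's
 * interval tree, unmap them from the IOMMU domain, and unpin the
 * underlying pages, marking them dirty only if the mapping was
 * writable.
 */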
static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd,
					struct usnic_uiom_reg *uiomr,
					int dirty)
{
	int npages;
	unsigned long vpn_start, vpn_last;
	struct usnic_uiom_interval_node *interval, *tmp;
	int writable = 0;
	LIST_HEAD(rm_intervals);

	npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
	vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT;
	vpn_last = vpn_start + npages - 1;

	spin_lock(&pd->lock);
	usnic_uiom_remove_interval(&pd->root, vpn_start,
					vpn_last, &rm_intervals);
	usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd);

	list_for_each_entry_safe(interval, tmp, &rm_intervals, link) {
		if (interval->flags & IOMMU_WRITE)
			writable = 1;
		list_del(&interval->link);
		kfree(interval);
	}

	usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable);
	spin_unlock(&pd->lock);
}
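
/*
 * Walk the registration's scatterlist chunks and program the IOMMU
 * for each interval, coalescing physically contiguous pages into a
 * single iommu_map_atomic() call.  On failure, everything mapped so
 * far is unmapped again.
 */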
static int usnic_uiom_map_sorted_intervals(struct list_head *intervals,
						struct usnic_uiom_reg *uiomr)
{
	int i, err;
	size_t size;
	struct usnic_uiom_chunk *chunk;
	struct usnic_uiom_interval_node *interval_node;
	dma_addr_t pa;
	dma_addr_t pa_start = 0;
	dma_addr_t pa_end = 0;
	long int va_start = -EINVAL;
	struct usnic_uiom_pd *pd = uiomr->pd;
	long int va = uiomr->va & PAGE_MASK;
	int flags = IOMMU_READ | IOMMU_CACHE;

	flags |= (uiomr->writable) ? IOMMU_WRITE : 0;
	chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk,
					list);
	list_for_each_entry(interval_node, intervals, link) {
iter_chunk:
		for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) {
			pa = sg_phys(&chunk->page_list[i]);
			if ((va >> PAGE_SHIFT) < interval_node->start)
				continue;

			if ((va >> PAGE_SHIFT) == interval_node->start) {
				/* First page of the interval */
				va_start = va;
				pa_start = pa;
				pa_end = pa;
			}

			WARN_ON(va_start == -EINVAL);

			if ((pa_end + PAGE_SIZE != pa) &&
					(pa != pa_start)) {
				/* PAs are not contiguous */
				size = pa_end - pa_start + PAGE_SIZE;
				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x",
					va_start, &pa_start, size, flags);
				err = iommu_map_atomic(pd->domain, va_start,
						       pa_start, size, flags);
				if (err) {
					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
						va_start, &pa_start, size, err);
					goto err_out;
				}
				va_start = va;
				pa_start = pa;
				pa_end = pa;
			}

			if ((va >> PAGE_SHIFT) == interval_node->last) {
				/* Last page of the interval */
				size = pa - pa_start + PAGE_SIZE;
				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n",
					va_start, &pa_start, size, flags);
				err = iommu_map_atomic(pd->domain, va_start,
						       pa_start, size, flags);
				if (err) {
					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
						va_start, &pa_start, size, err);
					goto err_out;
				}
				break;
			}

			if (pa != pa_start)
				pa_end += PAGE_SIZE;
		}

		if (i == chunk->nents) {
			/*
			 * Hit last entry of the chunk,
			 * hence advance to next chunk
			 */
			chunk = list_first_entry(&chunk->list,
						struct usnic_uiom_chunk,
						list);
			goto iter_chunk;
		}
	}

	return 0;

err_out:
	usnic_uiom_unmap_sorted_intervals(intervals, pd);
	return err;
}
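
/*
 * Register a userspace memory region with the PD: pin the user pages,
 * work out which sub-ranges still need an IOMMU mapping, map those
 * ranges into the PD's IOMMU domain, and insert the region into the
 * PD's interval tree.
 */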
struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
						unsigned long addr, size_t size,
						int writable, int dmasync)
{
	struct usnic_uiom_reg *uiomr;
	unsigned long va_base, vpn_start, vpn_last;
	unsigned long npages;
	int offset, err;
	LIST_HEAD(sorted_diff_intervals);

	/*
	 * Intel IOMMU map throws an error if a translation entry is
	 * changed from read to write.  This module may not unmap and
	 * then remap the entry after fixing the permission because
	 * that would open a small window where hw DMA may page fault.
	 * Hence, make all entries writable.
	 */
	writable = 1;

	va_base = addr & PAGE_MASK;
	offset = addr & ~PAGE_MASK;
	npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT;
	vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT;
	vpn_last = vpn_start + npages - 1;

	uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL);
	if (!uiomr)
		return ERR_PTR(-ENOMEM);

	uiomr->va = va_base;
	uiomr->offset = offset;
	uiomr->length = size;
	uiomr->writable = writable;
	uiomr->pd = pd;

	err = usnic_uiom_get_pages(addr, size, writable, dmasync,
				   uiomr);
	if (err) {
		usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_free_uiomr;
	}

	spin_lock(&pd->lock);
	err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last,
					(writable) ? IOMMU_WRITE : 0,
					IOMMU_WRITE,
					&pd->root,
					&sorted_diff_intervals);
	if (err) {
		usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_put_pages;
	}

	err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr);
	if (err) {
		usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_put_intervals;
	}

	err = usnic_uiom_insert_interval(&pd->root, vpn_start, vpn_last,
					(writable) ? IOMMU_WRITE : 0);
	if (err) {
		usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n",
				vpn_start, vpn_last, err);
		goto out_unmap_intervals;
	}

	usnic_uiom_put_interval_set(&sorted_diff_intervals);
	spin_unlock(&pd->lock);

	return uiomr;

out_unmap_intervals:
	usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd);
out_put_intervals:
	usnic_uiom_put_interval_set(&sorted_diff_intervals);
out_put_pages:
	usnic_uiom_put_pages(&uiomr->chunk_list, 0);
	spin_unlock(&pd->lock);
	mmdrop(uiomr->owning_mm);
out_free_uiomr:
	kfree(uiomr);
	return ERR_PTR(err);
}

static void __usnic_uiom_release_tail(struct usnic_uiom_reg *uiomr)
{
	mmdrop(uiomr->owning_mm);
	kfree(uiomr);
}

static inline size_t usnic_uiom_num_pages(struct usnic_uiom_reg *uiomr)
{
	return PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
}
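
/*
 * Release a registration obtained from usnic_uiom_reg_get(): unmap and
 * unpin its pages, uncharge them from pinned_vm, drop the mm reference
 * and free the handle.
 */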
void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr)
{
	__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);

	atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
	__usnic_uiom_release_tail(uiomr);
}
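
/*
 * Allocate a protection domain backed by a new IOMMU domain on the
 * device's bus and install the fault handler.
 */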
struct usnic_uiom_pd *usnic_uiom_alloc_pd(struct device *dev)
{
	struct usnic_uiom_pd *pd;
	void *domain;

	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return ERR_PTR(-ENOMEM);

	pd->domain = domain = iommu_domain_alloc(dev->bus);
	if (!domain) {
		usnic_err("Failed to allocate IOMMU domain");
		kfree(pd);
		return ERR_PTR(-ENOMEM);
	}

	iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL);

	spin_lock_init(&pd->lock);
	INIT_LIST_HEAD(&pd->devs);

	return pd;
}

void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd)
{
	iommu_domain_free(pd->domain);
	kfree(pd);
}
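
/*
 * Attach a device to the PD's IOMMU domain and track it on pd->devs.
 * The attach is rejected if the device's IOMMU cannot provide
 * cache-coherent DMA, since mappings are created with IOMMU_CACHE.
 */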
int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev)
{
	struct usnic_uiom_dev *uiom_dev;
	int err;

	uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC);
	if (!uiom_dev)
		return -ENOMEM;
	uiom_dev->dev = dev;

	err = iommu_attach_device(pd->domain, dev);
	if (err)
		goto out_free_dev;

	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
		usnic_err("IOMMU of %s does not support cache coherency\n",
				dev_name(dev));
		err = -EINVAL;
		goto out_detach_device;
	}

	spin_lock(&pd->lock);
	list_add_tail(&uiom_dev->link, &pd->devs);
	pd->dev_cnt++;
	spin_unlock(&pd->lock);

	return 0;

out_detach_device:
	iommu_detach_device(pd->domain, dev);
out_free_dev:
	kfree(uiom_dev);
	return err;
}

void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev)
{
	struct usnic_uiom_dev *uiom_dev;
	int found = 0;

	spin_lock(&pd->lock);
	list_for_each_entry(uiom_dev, &pd->devs, link) {
		if (uiom_dev->dev == dev) {
			found = 1;
			break;
		}
	}

	if (!found) {
		usnic_err("Unable to free dev %s - not found\n",
				dev_name(dev));
		spin_unlock(&pd->lock);
		return;
	}

	list_del(&uiom_dev->link);
	pd->dev_cnt--;
	spin_unlock(&pd->lock);

	iommu_detach_device(pd->domain, dev);
}
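
/*
 * Return a NULL-terminated array of the devices currently attached to
 * the PD.  The caller frees it with usnic_uiom_free_dev_list().
 */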
struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd)
{
	struct usnic_uiom_dev *uiom_dev;
	struct device **devs;
	int i = 0;

	spin_lock(&pd->lock);
	devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC);
	if (!devs) {
		devs = ERR_PTR(-ENOMEM);
		goto out;
	}

	list_for_each_entry(uiom_dev, &pd->devs, link) {
		devs[i++] = uiom_dev->dev;
	}

out:
	spin_unlock(&pd->lock);
	return devs;
}

void usnic_uiom_free_dev_list(struct device **devs)
{
	kfree(devs);
}