iommu.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup:
 *
 * Copyright (C) 2004 Olof Johansson <[email protected]>, IBM Corporation
 * Copyright (C) 2006 Olof Johansson <[email protected]>
 *
 * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
#include <linux/memory.h>
#include <linux/of.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>

#include "pseries.h"

enum {
	DDW_QUERY_PE_DMA_WIN  = 0,
	DDW_CREATE_PE_DMA_WIN = 1,
	DDW_REMOVE_PE_DMA_WIN = 2,

	DDW_APPLICABLE_SIZE
};

enum {
	DDW_EXT_SIZE = 0,
	DDW_EXT_RESET_DMA_WIN = 1,
	DDW_EXT_QUERY_OUT_SIZE = 2
};
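
/*
 * The two enums above index device-tree properties: the DDW_*_PE_DMA_WIN
 * values are the positions of the RTAS tokens inside "ibm,ddw-applicable",
 * and the DDW_EXT_* values are positions inside "ibm,ddw-extensions", whose
 * element 0 holds the number of extensions present (see ddw_read_ext()).
 */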

static struct iommu_table *iommu_pseries_alloc_table(int node)
{
	struct iommu_table *tbl;

	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
	if (!tbl)
		return NULL;

	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
	return tbl;
}

static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
	struct iommu_table_group *table_group;

	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
	if (!table_group)
		return NULL;

	table_group->tables[0] = iommu_pseries_alloc_table(node);
	if (table_group->tables[0])
		return table_group;

	kfree(table_group);
	return NULL;
}

static void iommu_pseries_free_group(struct iommu_table_group *table_group,
		const char *node_name)
{
	if (!table_group)
		return;

#ifdef CONFIG_IOMMU_API
	if (table_group->group) {
		iommu_group_put(table_group->group);
		BUG_ON(table_group->group);
	}
#endif

	/* Default DMA window table is at index 0, while DDW at 1. SR-IOV
	 * adapters only have table on index 1.
	 */
	if (table_group->tables[0])
		iommu_tce_table_put(table_group->tables[0]);

	if (table_group->tables[1])
		iommu_tce_table_put(table_group->tables[1]);

	kfree(table_group);
}

static int tce_build_pSeries(struct iommu_table *tbl, long index,
			     long npages, unsigned long uaddr,
			     enum dma_data_direction direction,
			     unsigned long attrs)
{
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	const unsigned long tceshift = tbl->it_page_shift;
	const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);

	proto_tce = TCE_PCI_READ; // Read allowed

	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--) {
		/* can't move this out since we might cross MEMBLOCK boundary */
		rpn = __pa(uaddr) >> tceshift;
		*tcep = cpu_to_be64(proto_tce | rpn << tceshift);

		uaddr += pagesize;
		tcep++;
	}
	return 0;
}

static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	while (npages--)
		*(tcep++) = 0;
}

static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
	__be64 *tcep;

	tcep = ((__be64 *)tbl->it_base) + index;

	return be64_to_cpu(*tcep);
}

static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);

static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages, unsigned long uaddr,
			       enum dma_data_direction direction,
			       unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce, tce;
	u64 rpn;
	int ret = 0;
	long tcenum_start = tcenum, npages_start = npages;

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	while (npages--) {
		tce = proto_tce | rpn << tceshift;
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);

		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
			ret = (int)rc;
			tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
					   (npages_start - (npages + 1)));
			break;
		}

		if (rc && printk_ratelimit()) {
			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)liobn);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			printk("\ttce val = 0x%llx\n", tce);
			dump_stack();
		}

		tcenum++;
		rpn++;
	}
	return ret;
}
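
/*
 * Per-CPU scratch page used to batch TCE updates for a single
 * H_PUT_TCE_INDIRECT hcall. It is only touched with interrupts disabled,
 * which is what makes the lockless __this_cpu_read()/__this_cpu_write()
 * accesses below safe.
 */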
static DEFINE_PER_CPU(__be64 *, tce_page);

static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
				    long npages, unsigned long uaddr,
				    enum dma_data_direction direction,
				    unsigned long attrs)
{
	u64 rc = 0;
	u64 proto_tce;
	__be64 *tcep;
	u64 rpn;
	long l, limit;
	long tcenum_start = tcenum, npages_start = npages;
	int ret = 0;
	unsigned long flags;
	const unsigned long tceshift = tbl->it_page_shift;

	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		return tce_build_pSeriesLP(tbl->it_index, tcenum,
					   tceshift, npages, uaddr,
					   direction, attrs);
	}

	local_irq_save(flags);	/* to protect tcep and the page behind it */

	tcep = __this_cpu_read(tce_page);

	/* This is safe to do since interrupts are off when we're called
	 * from iommu_alloc{,_sg}()
	 */
	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		/* If allocation fails, fall back to the loop implementation */
		if (!tcep) {
			local_irq_restore(flags);
			return tce_build_pSeriesLP(tbl->it_index, tcenum,
					tceshift,
					npages, uaddr, direction, attrs);
		}
		__this_cpu_write(tce_page, tcep);
	}

	rpn = __pa(uaddr) >> tceshift;
	proto_tce = TCE_PCI_READ;
	if (direction != DMA_TO_DEVICE)
		proto_tce |= TCE_PCI_WRITE;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift);
			rpn++;
		}

		rc = plpar_tce_put_indirect((u64)tbl->it_index,
					    (u64)tcenum << tceshift,
					    (u64)__pa(tcep),
					    limit);

		npages -= limit;
		tcenum += limit;
	} while (npages > 0 && !rc);

	local_irq_restore(flags);

	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
		ret = (int)rc;
		tce_freemulti_pSeriesLP(tbl, tcenum_start,
					(npages_start - (npages + limit)));
		return ret;
	}

	if (rc && printk_ratelimit()) {
		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
		dump_stack();
	}
	return ret;
}

static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
			       long npages)
{
	u64 rc;

	while (npages--) {
		rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0);

		if (rc && printk_ratelimit()) {
			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
			printk("\tindex   = 0x%llx\n", (u64)liobn);
			printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
			dump_stack();
		}

		tcenum++;
	}
}

static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
{
	u64 rc;
	long rpages = npages;
	unsigned long limit;

	if (!firmware_has_feature(FW_FEATURE_STUFF_TCE))
		return tce_free_pSeriesLP(tbl->it_index, tcenum,
					  tbl->it_page_shift, npages);

	do {
		limit = min_t(unsigned long, rpages, 512);

		rc = plpar_tce_stuff((u64)tbl->it_index,
				     (u64)tcenum << tbl->it_page_shift, 0, limit);

		rpages -= limit;
		tcenum += limit;
	} while (rpages > 0 && !rc);

	if (rc && printk_ratelimit()) {
		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
		printk("\trc      = %lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\tnpages  = 0x%llx\n", (u64)npages);
		dump_stack();
	}
}

static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
{
	u64 rc;
	unsigned long tce_ret;

	rc = plpar_tce_get((u64)tbl->it_index,
			   (u64)tcenum << tbl->it_page_shift, &tce_ret);

	if (rc && printk_ratelimit()) {
		printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
		printk("\ttcenum  = 0x%llx\n", (u64)tcenum);
		dump_stack();
	}

	return tce_ret;
}

/* this is compatible with cells for the device tree property */
struct dynamic_dma_window_prop {
	__be32	liobn;		/* tce table number */
	__be64	dma_base;	/* address hi,lo */
	__be32	tce_shift;	/* ilog2(tce_page_size) */
	__be32	window_shift;	/* ilog2(tce_window_size) */
};
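
/*
 * For reference, the matching device-tree property therefore consists of five
 * big-endian 32-bit cells per window:
 * <liobn (1 cell)> <dma_base (2 cells)> <tce_shift (1 cell)> <window_shift (1 cell)>
 */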

struct dma_win {
	struct device_node *device;
	const struct dynamic_dma_window_prop *prop;
	struct list_head list;
};

/* Dynamic DMA Window support */
struct ddw_query_response {
	u32 windows_available;
	u64 largest_available_block;
	u32 page_size;
	u32 migration_capable;
};

struct ddw_create_response {
	u32 liobn;
	u32 addr_hi;
	u32 addr_lo;
};

static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(dma_win_init_mutex);

#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
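
/*
 * Both properties hold a struct dynamic_dma_window_prop describing a window
 * created by enable_ddw(): DIRECT64_PROPNAME is used when the window linearly
 * maps all of RAM (direct mapping), DMA64_PROPNAME when the window is used as
 * a regular 64-bit IOMMU window.
 */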
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
					  unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	int rc;
	u64 tce_size, num_tce, dma_offset, next;
	u32 tce_shift;
	long limit;

	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 512);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
				     dma_offset,
				     0, limit);
		next += limit * tce_size;
		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	return rc;
}

static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
					unsigned long num_pfn, const void *arg)
{
	const struct dynamic_dma_window_prop *maprange = arg;
	u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn;
	__be64 *tcep;
	u32 tce_shift;
	u64 rc = 0;
	long l, limit;

	if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
		unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
		unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
				be64_to_cpu(maprange->dma_base);
		unsigned long tcenum = dmastart >> tceshift;
		unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
		void *uaddr = __va(start_pfn << PAGE_SHIFT);

		return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
				tcenum, tceshift, npages, (unsigned long) uaddr,
				DMA_BIDIRECTIONAL, 0);
	}

	local_irq_disable();	/* to protect tcep and the page behind it */
	tcep = __this_cpu_read(tce_page);

	if (!tcep) {
		tcep = (__be64 *)__get_free_page(GFP_ATOMIC);
		if (!tcep) {
			local_irq_enable();
			return -ENOMEM;
		}
		__this_cpu_write(tce_page, tcep);
	}

	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;

	liobn = (u64)be32_to_cpu(maprange->liobn);
	tce_shift = be32_to_cpu(maprange->tce_shift);
	tce_size = 1ULL << tce_shift;
	next = start_pfn << PAGE_SHIFT;
	num_tce = num_pfn << PAGE_SHIFT;

	/* round back to the beginning of the tce page size */
	num_tce += next & (tce_size - 1);
	next &= ~(tce_size - 1);

	/* convert to number of tces */
	num_tce |= tce_size - 1;
	num_tce >>= tce_shift;

	/* We can map max one pageful of TCEs at a time */
	do {
		/*
		 * Set up the page with TCE data, looping through and setting
		 * the values.
		 */
		limit = min_t(long, num_tce, 4096 / TCE_ENTRY_SIZE);
		dma_offset = next + be64_to_cpu(maprange->dma_base);

		for (l = 0; l < limit; l++) {
			tcep[l] = cpu_to_be64(proto_tce | next);
			next += tce_size;
		}

		rc = plpar_tce_put_indirect(liobn,
					    dma_offset,
					    (u64)__pa(tcep),
					    limit);

		num_tce -= limit;
	} while (num_tce > 0 && !rc);

	/* error cleanup: caller will clear whole range */

	local_irq_enable();
	return rc;
}

static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
		unsigned long num_pfn, void *arg)
{
	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
}

static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno,
					unsigned long liobn, unsigned long win_addr,
					unsigned long window_size, unsigned long page_shift,
					void *base, struct iommu_table_ops *table_ops)
{
	tbl->it_busno = busno;
	tbl->it_index = liobn;
	tbl->it_offset = win_addr >> page_shift;
	tbl->it_size = window_size >> page_shift;
	tbl->it_page_shift = page_shift;
	tbl->it_base = (unsigned long)base;
	tbl->it_blocksize = 16;
	tbl->it_type = TCE_PCI;
	tbl->it_ops = table_ops;
}

struct iommu_table_ops iommu_table_pseries_ops;

static void iommu_table_setparms(struct pci_controller *phb,
				 struct device_node *dn,
				 struct iommu_table *tbl)
{
	struct device_node *node;
	const unsigned long *basep;
	const u32 *sizep;

	/* Test if we are going over 2GB of DMA space */
	if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
	}

	node = phb->dn;
	basep = of_get_property(node, "linux,tce-base", NULL);
	sizep = of_get_property(node, "linux,tce-size", NULL);
	if (basep == NULL || sizep == NULL) {
		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %pOF has "
				"missing tce entries !\n", dn);
		return;
	}

	iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
				    phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
				    __va(*basep), &iommu_table_pseries_ops);

	if (!is_kdump_kernel())
		memset((void *)tbl->it_base, 0, *sizep);

	phb->dma_window_base_cur += phb->dma_window_size;
}

struct iommu_table_ops iommu_table_lpar_multi_ops;

/*
 * iommu_table_setparms_lpar
 *
 * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
 */
static void iommu_table_setparms_lpar(struct pci_controller *phb,
				      struct device_node *dn,
				      struct iommu_table *tbl,
				      struct iommu_table_group *table_group,
				      const __be32 *dma_window)
{
	unsigned long offset, size, liobn;

	of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);

	iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size,
				    IOMMU_PAGE_SHIFT_4K, NULL,
				    &iommu_table_lpar_multi_ops);

	table_group->tce32_start = offset;
	table_group->tce32_size = size;
}

struct iommu_table_ops iommu_table_pseries_ops = {
	.set = tce_build_pSeries,
	.clear = tce_free_pSeries,
	.get = tce_get_pseries
};

static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
{
	struct device_node *dn;
	struct iommu_table *tbl;
	struct device_node *isa_dn, *isa_dn_orig;
	struct device_node *tmp;
	struct pci_dn *pci;
	int children;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeries: setting up bus %pOF\n", dn);

	if (bus->self) {
		/* This is not a root bus, any setup will be done for the
		 * device-side of the bridge in iommu_dev_setup_pSeries().
		 */
		return;
	}
	pci = PCI_DN(dn);

	/* Check if the ISA bus on the system is under
	 * this PHB.
	 */
	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");

	while (isa_dn && isa_dn != dn)
		isa_dn = isa_dn->parent;

	of_node_put(isa_dn_orig);

	/* Count number of direct PCI children of the PHB. */
	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
		children++;

	pr_debug("Children: %d\n", children);

	/* Calculate amount of DMA window per slot. Each window must be
	 * a power of two (due to pci_alloc_consistent requirements).
	 *
	 * Keep 256MB aside for PHBs with ISA.
	 */

	if (!isa_dn) {
		/* No ISA/IDE - just set window size and return */
		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */

		while (pci->phb->dma_window_size * children > 0x80000000ul)
			pci->phb->dma_window_size >>= 1;
		pr_debug("No ISA/IDE, window size is 0x%llx\n",
			 pci->phb->dma_window_size);
		pci->phb->dma_window_base_cur = 0;

		return;
	}

	/* If we have ISA, then we probably have an IDE
	 * controller too. Allocate a 128MB table but
	 * skip the first 128MB to avoid stepping on ISA
	 * space.
	 */
	pci->phb->dma_window_size = 0x8000000ul;
	pci->phb->dma_window_base_cur = 0x8000000ul;

	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
	tbl = pci->table_group->tables[0];

	iommu_table_setparms(pci->phb, dn, tbl);

	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
		panic("Failed to initialize iommu table");

	/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
	while (pci->phb->dma_window_size * children > 0x70000000ul)
		pci->phb->dma_window_size >>= 1;

	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}

#ifdef CONFIG_IOMMU_API
static int tce_exchange_pseries(struct iommu_table *tbl, long index,
				unsigned long *tce,
				enum dma_data_direction *direction)
{
	long rc;
	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
	unsigned long flags, oldtce = 0;
	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
	unsigned long newtce = *tce | proto_tce;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);

	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
	if (!rc)
		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);

	if (!rc) {
		*direction = iommu_tce_direction(oldtce);
		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
	}

	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);

	return rc;
}
#endif

struct iommu_table_ops iommu_table_lpar_multi_ops = {
	.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
	.xchg_no_kill = tce_exchange_pseries,
#endif
	.clear = tce_freemulti_pSeriesLP,
	.get = tce_get_pSeriesLP
};

/*
 * Find nearest ibm,dma-window (default DMA window) or direct DMA window or
 * dynamic 64bit DMA window, walking up the device tree.
 */
static struct device_node *pci_dma_find(struct device_node *dn,
					const __be32 **dma_window)
{
	const __be32 *dw = NULL;

	for ( ; dn && PCI_DN(dn); dn = dn->parent) {
		dw = of_get_property(dn, "ibm,dma-window", NULL);
		if (dw) {
			if (dma_window)
				*dma_window = dw;
			return dn;
		}
		dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
		if (dw)
			return dn;
		dw = of_get_property(dn, DMA64_PROPNAME, NULL);
		if (dw)
			return dn;
	}

	return NULL;
}

static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
{
	struct iommu_table *tbl;
	struct device_node *dn, *pdn;
	struct pci_dn *ppci;
	const __be32 *dma_window = NULL;

	dn = pci_bus_to_OF_node(bus);

	pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
		 dn);

	pdn = pci_dma_find(dn, &dma_window);

	if (dma_window == NULL)
		pr_debug("  no ibm,dma-window property !\n");

	ppci = PCI_DN(pdn);

	pr_debug("  parent is %pOF, iommu_table: 0x%p\n",
		 pdn, ppci->table_group);

	if (!ppci->table_group) {
		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
		tbl = ppci->table_group->tables[0];
		if (dma_window) {
			iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
						  ppci->table_group, dma_window);

			if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
				panic("Failed to initialize iommu table");
		}
		iommu_register_group(ppci->table_group,
				pci_domain_nr(bus), 0);
		pr_debug("  created table: %p\n", ppci->table_group);
	}
}

static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
{
	struct device_node *dn;
	struct iommu_table *tbl;

	pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));

	dn = dev->dev.of_node;

	/* If we're the direct child of a root bus, then we need to allocate
	 * an iommu table ourselves. The bus setup code should have setup
	 * the window sizes already.
	 */
	if (!dev->bus->self) {
		struct pci_controller *phb = PCI_DN(dn)->phb;

		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
		tbl = PCI_DN(dn)->table_group->tables[0];
		iommu_table_setparms(phb, dn, tbl);

		if (!iommu_init_table(tbl, phb->node, 0, 0))
			panic("Failed to initialize iommu table");

		set_iommu_table_base(&dev->dev, tbl);
		return;
	}

	/* If this device is further down the bus tree, search upwards until
	 * an already allocated iommu table is found and use that.
	 */
	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
		dn = dn->parent;

	if (dn && PCI_DN(dn))
		set_iommu_table_base(&dev->dev,
				PCI_DN(dn)->table_group->tables[0]);
	else
		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
		       pci_name(dev));
}

static int __read_mostly disable_ddw;

static int __init disable_ddw_setup(char *str)
{
	disable_ddw = 1;
	printk(KERN_INFO "ppc iommu: disabling ddw.\n");

	return 0;
}

early_param("disable_ddw", disable_ddw_setup);

static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp)
{
	int ret;

	ret = tce_clearrange_multi_pSeriesLP(0,
		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
	if (ret)
		pr_warn("%pOF failed to clear tces in window.\n",
			np);
	else
		pr_debug("%pOF successfully cleared tces in window.\n",
			 np);
}

/*
 * Call only if DMA window is clean.
 */
static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn)
{
	int ret;

	ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window: rtas returned "
			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
			np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
	else
		pr_debug("%pOF: successfully removed DMA window: rtas returned "
			 "%d to ibm,remove-pe-dma-window(%x) %llx\n",
			 np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
}

static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
			      struct property *win)
{
	struct dynamic_dma_window_prop *dwp;
	u64 liobn;

	dwp = win->value;
	liobn = (u64)be32_to_cpu(dwp->liobn);

	clean_dma_window(np, dwp);
	__remove_dma_window(np, ddw_avail, liobn);
}

static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name)
{
	struct property *win;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	int ret = 0;

	win = of_find_property(np, win_name, NULL);
	if (!win)
		return -EINVAL;

	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		return 0;

	if (win->length >= sizeof(struct dynamic_dma_window_prop))
		remove_dma_window(np, ddw_avail, win);

	if (!remove_prop)
		return 0;

	ret = of_remove_property(np, win);
	if (ret)
		pr_warn("%pOF: failed to remove DMA window property: %d\n",
			np, ret);
	return 0;
}

static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
{
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;
	bool found = false;

	spin_lock(&dma_win_list_lock);
	/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &dma_win_list, list) {
		if (window->device == pdn) {
			dma64 = window->prop;
			*dma_addr = be64_to_cpu(dma64->dma_base);
			*window_shift = be32_to_cpu(dma64->window_shift);
			found = true;
			break;
		}
	}
	spin_unlock(&dma_win_list_lock);

	return found;
}

static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
					  const struct dynamic_dma_window_prop *dma64)
{
	struct dma_win *window;

	window = kzalloc(sizeof(*window), GFP_KERNEL);
	if (!window)
		return NULL;

	window->device = pdn;
	window->prop = dma64;

	return window;
}

static void find_existing_ddw_windows_named(const char *name)
{
	int len;
	struct device_node *pdn;
	struct dma_win *window;
	const struct dynamic_dma_window_prop *dma64;

	for_each_node_with_property(pdn, name) {
		dma64 = of_get_property(pdn, name, &len);
		if (!dma64 || len < sizeof(*dma64)) {
			remove_ddw(pdn, true, name);
			continue;
		}

		window = ddw_list_new_entry(pdn, dma64);
		if (!window) {
			of_node_put(pdn);
			break;
		}

		spin_lock(&dma_win_list_lock);
		list_add(&window->list, &dma_win_list);
		spin_unlock(&dma_win_list_lock);
	}
}

static int find_existing_ddw_windows(void)
{
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return 0;

	find_existing_ddw_windows_named(DIRECT64_PROPNAME);
	find_existing_ddw_windows_named(DMA64_PROPNAME);

	return 0;
}
machine_arch_initcall(pseries, find_existing_ddw_windows);

/**
 * ddw_read_ext - Get the value of a DDW extension
 * @np:		device node from which the extension value is to be read.
 * @extnum:	index number of the extension.
 * @value:	pointer to return value, modified when extension is available.
 *
 * Checks if "ibm,ddw-extensions" exists for this node, and gets the value
 * at index 'extnum'.
 * It can be used only to check if a property exists, passing value == NULL.
 *
 * Returns:
 *	0 if extension successfully read
 *	-EINVAL if the "ibm,ddw-extensions" does not exist,
 *	-ENODATA if "ibm,ddw-extensions" does not have a value, and
 *	-EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
 */
static inline int ddw_read_ext(const struct device_node *np, int extnum,
			       u32 *value)
{
	static const char propname[] = "ibm,ddw-extensions";
	u32 count;
	int ret;

	ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
	if (ret)
		return ret;

	if (count < extnum)
		return -EOVERFLOW;

	if (!value)
		value = &count;

	return of_property_read_u32_index(np, propname, extnum, value);
}

static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
		     struct ddw_query_response *query,
		     struct device_node *parent)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr, ext_query, query_out[5];
	u64 buid;
	int ret, out_sz;

	/*
	 * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 determines how
	 * many output parameters ibm,query-pe-dma-windows will have, ranging
	 * from 5 to 6.
	 */
	ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
	if (!ret && ext_query == 1)
		out_sz = 6;
	else
		out_sz = 5;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
			cfg_addr, BUID_HI(buid), BUID_LO(buid));

	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
						 query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d, lb=%llx ps=%x wn=%d\n",
		 ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), ret, query->largest_available_block,
		 query->page_size, query->windows_available);

	return ret;
}

static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
			struct ddw_create_response *create, int page_shift,
			int window_shift)
{
	struct device_node *dn;
	struct pci_dn *pdn;
	u32 cfg_addr;
	u64 buid;
	int ret;

	/*
	 * Get the config address and phb buid of the PE window.
	 * Rely on eeh to retrieve this for us.
	 * Retrieve them from the pci device, not the node with the
	 * dma-window property
	 */
	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));

	do {
		/* extra outputs are LIOBN and dma-addr (hi, lo) */
		ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
				(u32 *)create, cfg_addr, BUID_HI(buid),
				BUID_LO(buid), page_shift, window_shift);
	} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
		 "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
		 "(liobn = 0x%x starting addr = %x %x)\n",
		 ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
		 BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
		 create->addr_hi, create->addr_lo);

	return ret;
}

struct failed_ddw_pdn {
	struct device_node *pdn;
	struct list_head list;
};

static LIST_HEAD(failed_ddw_pdn_list);

static phys_addr_t ddw_memory_hotplug_max(void)
{
	phys_addr_t max_addr = memory_hotplug_max();
	struct device_node *memory;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int n_mem_addr_cells, n_mem_size_cells, len;
		const __be32 *memcell_buf;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		n_mem_addr_cells = of_n_addr_cells(memory);
		n_mem_size_cells = of_n_size_cells(memory);

		start = of_read_number(memcell_buf, n_mem_addr_cells);
		memcell_buf += n_mem_addr_cells;
		size = of_read_number(memcell_buf, n_mem_size_cells);
		memcell_buf += n_mem_size_cells;

		max_addr = max_t(phys_addr_t, max_addr, start + size);
	}

	return max_addr;
}

/*
 * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
 * ibm,ddw-extensions, which carries the rtas token for
 * ibm,reset-pe-dma-windows.
 * That rtas-call can be used to restore the default DMA window for the device.
 */
static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
{
	int ret;
	u32 cfg_addr, reset_dma_win;
	u64 buid;
	struct device_node *dn;
	struct pci_dn *pdn;

	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
	if (ret)
		return;

	dn = pci_device_to_OF_node(dev);
	pdn = PCI_DN(dn);
	buid = pdn->phb->buid;
	cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);

	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
			BUID_LO(buid));
	if (ret)
		dev_info(&dev->dev,
			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
			 ret);
}

/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */
static int iommu_get_page_shift(u32 query_page_size)
{
	/* Supported IO page-sizes according to LoPAR, note that 2M is out of order */
	const int shift[] = {
		__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M),
		__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M),
		__builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), __builtin_ctzll(SZ_2M)
	};

	int i = ARRAY_SIZE(shift) - 1;
	int ret = 0;

	/*
	 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field:
	 * - bit 31 means 4k pages are supported,
	 * - bit 30 means 64k pages are supported, and so on.
	 * Larger pagesizes map more memory with the same amount of TCEs, so start probing them.
	 */
	for (; i >= 0; i--) {
		if (query_page_size & (1 << i))
			ret = max(ret, shift[i]);
	}

	return ret;
}

static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,
					    u32 page_shift, u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}

/*
 * If the PE supports dynamic dma windows, and there is space for a table
 * that can map all pages in a linear offset, then setup such a table,
 * and record the dma-offset in the struct device.
 *
 * dev: the pci device we are checking
 * pdn: the parent pe node with the ibm,dma-window property
 * Future: also check if we can remap the base window for our base page size
 *
 * returns true if can map all pages (direct mapping), false otherwise.
 */
static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
	int len = 0, ret;
	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
	struct ddw_query_response query;
	struct ddw_create_response create;
	int page_shift;
	u64 win_addr;
	const char *win_name;
	struct device_node *dn;
	u32 ddw_avail[DDW_APPLICABLE_SIZE];
	struct dma_win *window;
	struct property *win64;
	struct failed_ddw_pdn *fpdn;
	bool default_win_removed = false, direct_mapping = false;
	bool pmem_present;
	struct pci_dn *pci = PCI_DN(pdn);
	struct property *default_win = NULL;

	dn = of_find_node_by_type(NULL, "ibm,pmemory");
	pmem_present = dn != NULL;
	of_node_put(dn);

	mutex_lock(&dma_win_init_mutex);

	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
		direct_mapping = (len >= max_ram_len);
		goto out_unlock;
	}

	/*
	 * If we already went through this for a previous function of
	 * the same device and failed, we don't want to muck with the
	 * DMA window again, as it will race with in-flight operations
	 * and can lead to EEHs. The above mutex protects access to the
	 * list.
	 */
	list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
		if (fpdn->pdn == pdn)
			goto out_unlock;
	}

	/*
	 * the ibm,ddw-applicable property holds the tokens for:
	 * ibm,query-pe-dma-window
	 * ibm,create-pe-dma-window
	 * ibm,remove-pe-dma-window
	 * for the given node in that order.
	 * the property is actually in the parent, not the PE
	 */
	ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
	if (ret)
		goto out_failed;

	/*
	 * Query if there is a second window of size to map the
	 * whole partition. Query returns number of windows, largest
	 * block assigned to PE (partition endpoint), and two bitmasks
	 * of page sizes: supported and supported for migrate-dma.
	 */
	dn = pci_device_to_OF_node(dev);
	ret = query_ddw(dev, ddw_avail, &query, pdn);
	if (ret != 0)
		goto out_failed;

	/*
	 * If there is no window available, remove the default DMA window,
	 * if it's present. This will make all the resources available to the
	 * new DDW window.
	 * If anything fails after this, we need to restore it, so also check
	 * for extensions presence.
	 */
	if (query.windows_available == 0) {
		int reset_win_ext;

		/* DDW + IOMMU on single window may fail if there is any allocation */
		if (iommu_table_in_use(pci->table_group->tables[0])) {
			dev_warn(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
			goto out_failed;
		}

		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		if (!default_win)
			goto out_failed;

		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
		if (reset_win_ext)
			goto out_failed;

		remove_dma_window(pdn, ddw_avail, default_win);
		default_win_removed = true;

		/* Query again, to check if the window is available */
		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_failed;

		if (query.windows_available == 0) {
			/* no windows are available for this device. */
			dev_dbg(&dev->dev, "no free dynamic windows");
			goto out_failed;
		}
	}

	page_shift = iommu_get_page_shift(query.page_size);
	if (!page_shift) {
		dev_dbg(&dev->dev, "no supported page size in mask %x",
			query.page_size);
		goto out_failed;
	}

	/*
	 * The "ibm,pmemory" can appear anywhere in the address space.
	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
	 * for the upper limit and fallback to max RAM otherwise but this
	 * disables device::dma_ops_bypass.
	 */
	len = max_ram_len;
	if (pmem_present) {
		if (query.largest_available_block >=
		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
			len = MAX_PHYSMEM_BITS;
		else
			dev_info(&dev->dev, "Skipping ibm,pmemory");
	}

	/* check if the available block * number of ptes will map everything */
	if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
			1ULL << len,
			query.largest_available_block,
			1ULL << page_shift);

		len = order_base_2(query.largest_available_block << page_shift);
		win_name = DMA64_PROPNAME;
	} else {
		direct_mapping = !default_win_removed ||
			(len == MAX_PHYSMEM_BITS) ||
			(!pmem_present && (len == max_ram_len));
		win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME;
	}

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	if (ret != 0)
		goto out_failed;

	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
		create.liobn, dn);

	win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
	win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len);

	if (!win64) {
		dev_info(&dev->dev,
			 "couldn't allocate property, property name, or value\n");
		goto out_remove_win;
	}

	ret = of_add_property(pdn, win64);
	if (ret) {
		dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d",
			pdn, ret);
		goto out_free_prop;
	}

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	if (direct_mapping) {
		/* DDW maps the whole partition, so enable direct DMA mapping */
		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
			    win64->value, tce_setrange_multi_pSeriesLP_walk);
		if (ret) {
			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
				 dn, ret);

			/* Make sure to clean DDW if any TCE was set */
			clean_dma_window(pdn, win64->value);
			goto out_del_list;
		}
	} else {
		struct iommu_table *newtbl;
		int i;
		unsigned long start = 0, end = 0;

		for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
			const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

			/* Look for MMIO32 */
			if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) {
				start = pci->phb->mem_resources[i].start;
				end = pci->phb->mem_resources[i].end;
				break;
			}
		}

		/* New table for using DDW instead of the default DMA window */
		newtbl = iommu_pseries_alloc_table(pci->phb->node);
		if (!newtbl) {
			dev_dbg(&dev->dev, "couldn't create new IOMMU table\n");
			goto out_del_list;
		}

		iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr,
					    1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops);
		iommu_init_table(newtbl, pci->phb->node, start, end);

		pci->table_group->tables[1] = newtbl;

		set_iommu_table_base(&dev->dev, newtbl);
	}

	if (default_win_removed) {
		iommu_tce_table_put(pci->table_group->tables[0]);
		pci->table_group->tables[0] = NULL;

		/* default_win is valid here because default_win_removed == true */
		of_remove_property(pdn, default_win);
		dev_info(&dev->dev, "Removed default DMA window for %pOF\n", pdn);
	}

	spin_lock(&dma_win_list_lock);
	list_add(&window->list, &dma_win_list);
	spin_unlock(&dma_win_list_lock);

	dev->dev.archdata.dma_offset = win_addr;
	goto out_unlock;

out_del_list:
	kfree(window);

out_del_prop:
	of_remove_property(pdn, win64);

out_free_prop:
	kfree(win64->name);
	kfree(win64->value);
	kfree(win64);

out_remove_win:
	/* DDW is clean, so it's ok to call this directly. */
	__remove_dma_window(pdn, ddw_avail, create.liobn);

out_failed:
	if (default_win_removed)
		reset_dma_window(dev, pdn);

	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
	if (!fpdn)
		goto out_unlock;
	fpdn->pdn = pdn;
	list_add(&fpdn->list, &failed_ddw_pdn_list);

out_unlock:
	mutex_unlock(&dma_win_init_mutex);

	/*
	 * If we have persistent memory and the window size is only as big
	 * as RAM, then we failed to create a window to cover persistent
	 * memory and need to set the DMA limit.
	 */
	if (pmem_present && direct_mapping && len == max_ram_len)
		dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len);

	return direct_mapping;
}

static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
	struct device_node *pdn, *dn;
	struct iommu_table *tbl;
	const __be32 *dma_window = NULL;
	struct pci_dn *pci;

	pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));

	/* dev setup for LPAR is a little tricky, since the device tree might
	 * contain the dma-window properties per-device and not necessarily
	 * for the bus. So we need to search upwards in the tree until we
	 * either hit a dma-window property, OR find a parent with a table
	 * already allocated.
	 */
	dn = pci_device_to_OF_node(dev);
	pr_debug("  node is %pOF\n", dn);

	pdn = pci_dma_find(dn, &dma_window);
	if (!pdn || !PCI_DN(pdn)) {
		printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
		       "no DMA window found for pci dev=%s dn=%pOF\n",
		       pci_name(dev), dn);
		return;
	}
	pr_debug("  parent is %pOF\n", pdn);

	pci = PCI_DN(pdn);
	if (!pci->table_group) {
		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
		tbl = pci->table_group->tables[0];
		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
				pci->table_group, dma_window);

		iommu_init_table(tbl, pci->phb->node, 0, 0);
		iommu_register_group(pci->table_group,
				pci_domain_nr(pci->phb->bus), 0);
		pr_debug("  created table: %p\n", pci->table_group);
	} else {
		pr_debug("  found DMA window, table: %p\n", pci->table_group);
	}

	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
	iommu_add_device(pci->table_group, &dev->dev);
}

static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
{
	struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;

	/* only attempt to use a new window if 64-bit DMA is requested */
	if (dma_mask < DMA_BIT_MASK(64))
		return false;

	dev_dbg(&pdev->dev, "node is %pOF\n", dn);

	/*
	 * the device tree might contain the dma-window properties
	 * per-device and not necessarily for the bus. So we need to
	 * search upwards in the tree until we either hit a dma-window
	 * property, OR find a parent with a table already allocated.
	 */
	pdn = pci_dma_find(dn, NULL);
	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn);

	return false;
}

static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
		void *data)
{
	struct dma_win *window;
	struct memory_notify *arg = data;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&dma_win_list_lock);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_OFFLINE:
		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
					arg->nr_pages, window->prop);
			/* XXX log error */
		}
		spin_unlock(&dma_win_list_lock);
		break;
	default:
		break;
	}
	if (ret && action != MEM_CANCEL_ONLINE)
		return NOTIFY_BAD;

	return NOTIFY_OK;
}

static struct notifier_block iommu_mem_nb = {
	.notifier_call = iommu_mem_notifier,
};

static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *data)
{
	int err = NOTIFY_OK;
	struct of_reconfig_data *rd = data;
	struct device_node *np = rd->dn;
	struct pci_dn *pci = PCI_DN(np);
	struct dma_win *window;

	switch (action) {
	case OF_RECONFIG_DETACH_NODE:
		/*
		 * Removing the property will invoke the reconfig
		 * notifier again, which causes dead-lock on the
		 * read-write semaphore of the notifier chain. So
		 * we have to remove the property when releasing
		 * the device node.
		 */
		if (remove_ddw(np, false, DIRECT64_PROPNAME))
			remove_ddw(np, false, DMA64_PROPNAME);

		if (pci && pci->table_group)
			iommu_pseries_free_group(pci->table_group,
					np->full_name);

		spin_lock(&dma_win_list_lock);
		list_for_each_entry(window, &dma_win_list, list) {
			if (window->device == np) {
				list_del(&window->list);
				kfree(window);
				break;
			}
		}
		spin_unlock(&dma_win_list_lock);
		break;
	default:
		err = NOTIFY_DONE;
		break;
	}
	return err;
}

static struct notifier_block iommu_reconfig_nb = {
	.notifier_call = iommu_reconfig_notifier,
};

/* These are called very early. */
void __init iommu_init_early_pSeries(void)
{
	if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
		return;

	if (firmware_has_feature(FW_FEATURE_LPAR)) {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
		if (!disable_ddw)
			pseries_pci_controller_ops.iommu_bypass_supported =
				iommu_bypass_supported_pSeriesLP;
	} else {
		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
	}

	of_reconfig_notifier_register(&iommu_reconfig_nb);
	register_memory_notifier(&iommu_mem_nb);

	set_pci_dma_ops(&dma_iommu_ops);
}

static int __init disable_multitce(char *str)
{
	if (strcmp(str, "off") == 0 &&
	    firmware_has_feature(FW_FEATURE_LPAR) &&
	    (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) ||
	     firmware_has_feature(FW_FEATURE_STUFF_TCE))) {
		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
		powerpc_firmware_features &=
			~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE);
	}
	return 1;
}

__setup("multitce=", disable_multitce);

static int tce_iommu_bus_notifier(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct device *dev = data;

	switch (action) {
	case BUS_NOTIFY_DEL_DEVICE:
		iommu_del_device(dev);
		return 0;
	default:
		return 0;
	}
}

static struct notifier_block tce_iommu_bus_nb = {
	.notifier_call = tce_iommu_bus_notifier,
};

static int __init tce_iommu_bus_notifier_init(void)
{
	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
	return 0;
}

machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);