arm-smmu-nvidia.c

// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2019-2020 NVIDIA CORPORATION. All rights reserved.

#include <linux/bitfield.h>
#include <linux/delay.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

#include <soc/tegra/mc.h>

#include "arm-smmu.h"

/*
 * Tegra194 has three ARM MMU-500 instances.
 * Two of them are used together and must be programmed identically for
 * interleaved IOVA accesses across them; they translate accesses from
 * non-isochronous HW devices.
 * The third instance is used to translate accesses from isochronous HW
 * devices.
 *
 * In addition, the SMMU driver needs to coordinate with the memory controller
 * driver to ensure that the right SID override is programmed for any given
 * memory client. This is necessary to allow for use-cases such as seamlessly
 * handing over the display controller configuration from the firmware to the
 * kernel.
 *
 * This implementation supports programming of the two instances that must
 * be programmed identically and takes care of invoking the memory controller
 * driver for SID override programming after devices have been attached to an
 * SMMU instance.
 */

#define MAX_SMMU_INSTANCES 2
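
/*
 * Per-device state wrapping the generic arm_smmu_device: the MMIO base of
 * each mirrored MMU-500 instance, the number of instances actually present,
 * and a handle to the Tegra memory controller used for SID override
 * programming.
 */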
struct nvidia_smmu {
        struct arm_smmu_device smmu;
        void __iomem *bases[MAX_SMMU_INSTANCES];
        unsigned int num_instances;
        struct tegra_mc *mc;
};

static inline struct nvidia_smmu *to_nvidia_smmu(struct arm_smmu_device *smmu)
{
        return container_of(smmu, struct nvidia_smmu, smmu);
}
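
/*
 * Compute the address of register page @page of SMMU instance @inst.
 * Instance 0 uses the base ioremapped by the core arm-smmu driver; the
 * remaining instances are mapped in nvidia_smmu_impl_init().
 */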
static inline void __iomem *nvidia_smmu_page(struct arm_smmu_device *smmu,
                                             unsigned int inst, int page)
{
        struct nvidia_smmu *nvidia_smmu;

        nvidia_smmu = container_of(smmu, struct nvidia_smmu, smmu);
        return nvidia_smmu->bases[inst] + (page << smmu->pgshift);
}
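
/*
 * The mirrored instances are programmed identically, so reads only need to
 * consult instance 0.
 */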
static u32 nvidia_smmu_read_reg(struct arm_smmu_device *smmu,
                                int page, int offset)
{
        void __iomem *reg = nvidia_smmu_page(smmu, 0, page) + offset;

        return readl_relaxed(reg);
}
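
/*
 * Writes are broadcast to every instance so that they stay programmed
 * identically.
 */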
static void nvidia_smmu_write_reg(struct arm_smmu_device *smmu,
                                  int page, int offset, u32 val)
{
        struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
        unsigned int i;

        for (i = 0; i < nvidia->num_instances; i++) {
                void __iomem *reg = nvidia_smmu_page(smmu, i, page) + offset;

                writel_relaxed(val, reg);
        }
}
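
/* 64-bit counterparts of the accessors above: read instance 0, write all. */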
static u64 nvidia_smmu_read_reg64(struct arm_smmu_device *smmu,
                                  int page, int offset)
{
        void __iomem *reg = nvidia_smmu_page(smmu, 0, page) + offset;

        return readq_relaxed(reg);
}

static void nvidia_smmu_write_reg64(struct arm_smmu_device *smmu,
                                    int page, int offset, u64 val)
{
        struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
        unsigned int i;

        for (i = 0; i < nvidia->num_instances; i++) {
                void __iomem *reg = nvidia_smmu_page(smmu, i, page) + offset;

                writeq_relaxed(val, reg);
        }
}
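
/*
 * A TLB sync is only complete once every instance has finished it: write the
 * sync request, then poll the status register of each instance and OR the
 * results, backing off exponentially up to TLB_LOOP_TIMEOUT.
 */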
static void nvidia_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
                                 int sync, int status)
{
        struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
        unsigned int delay;

        arm_smmu_writel(smmu, page, sync, 0);

        for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
                unsigned int spin_cnt;

                for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
                        u32 val = 0;
                        unsigned int i;

                        for (i = 0; i < nvidia->num_instances; i++) {
                                void __iomem *reg;

                                reg = nvidia_smmu_page(smmu, i, page) + status;
                                val |= readl_relaxed(reg);
                        }

                        if (!(val & ARM_SMMU_sTLBGSTATUS_GSACTIVE))
                                return;

                        cpu_relax();
                }

                udelay(delay);
        }

        dev_err_ratelimited(smmu->dev,
                            "TLB sync timed out -- SMMU may be deadlocked\n");
}
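
/* Clear the global fault status (write-to-clear) of every instance. */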
static int nvidia_smmu_reset(struct arm_smmu_device *smmu)
{
        struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
        unsigned int i;

        for (i = 0; i < nvidia->num_instances; i++) {
                u32 val;
                void __iomem *reg = nvidia_smmu_page(smmu, i, ARM_SMMU_GR0) +
                                    ARM_SMMU_GR0_sGFSR;

                /* clear global FSR */
                val = readl_relaxed(reg);
                writel_relaxed(val, reg);
        }

        return 0;
}
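
/*
 * Handle a global fault reported by a single instance: log the fault
 * syndrome registers, then acknowledge the fault by writing GFSR back.
 */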
static irqreturn_t nvidia_smmu_global_fault_inst(int irq,
                                                 struct arm_smmu_device *smmu,
                                                 int inst)
{
        u32 gfsr, gfsynr0, gfsynr1, gfsynr2;
        void __iomem *gr0_base = nvidia_smmu_page(smmu, inst, 0);

        gfsr = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSR);
        if (!gfsr)
                return IRQ_NONE;

        gfsynr0 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR0);
        gfsynr1 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR1);
        gfsynr2 = readl_relaxed(gr0_base + ARM_SMMU_GR0_sGFSYNR2);

        dev_err_ratelimited(smmu->dev,
                            "Unexpected global fault, this could be serious\n");
        dev_err_ratelimited(smmu->dev,
                            "\tGFSR 0x%08x, GFSYNR0 0x%08x, GFSYNR1 0x%08x, GFSYNR2 0x%08x\n",
                            gfsr, gfsynr0, gfsynr1, gfsynr2);

        writel_relaxed(gfsr, gr0_base + ARM_SMMU_GR0_sGFSR);
        return IRQ_HANDLED;
}
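
/*
 * Check every instance for a pending global fault and report IRQ_HANDLED if
 * any of them raised one.
 */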
static irqreturn_t nvidia_smmu_global_fault(int irq, void *dev)
{
        unsigned int inst;
        irqreturn_t ret = IRQ_NONE;
        struct arm_smmu_device *smmu = dev;
        struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);

        for (inst = 0; inst < nvidia->num_instances; inst++) {
                irqreturn_t irq_ret;

                irq_ret = nvidia_smmu_global_fault_inst(irq, smmu, inst);
                if (irq_ret == IRQ_HANDLED)
                        ret = IRQ_HANDLED;
        }

        return ret;
}
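
/*
 * Handle a context fault on one context bank of one instance: log the fault
 * syndrome and faulting IOVA, then clear the fault by writing FSR back.
 */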
static irqreturn_t nvidia_smmu_context_fault_bank(int irq,
                                                  struct arm_smmu_device *smmu,
                                                  int idx, int inst)
{
        u32 fsr, fsynr, cbfrsynra;
        unsigned long iova;
        void __iomem *gr1_base = nvidia_smmu_page(smmu, inst, 1);
        void __iomem *cb_base = nvidia_smmu_page(smmu, inst, smmu->numpage + idx);

        fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
        if (!(fsr & ARM_SMMU_FSR_FAULT))
                return IRQ_NONE;

        fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
        iova = readq_relaxed(cb_base + ARM_SMMU_CB_FAR);
        cbfrsynra = readl_relaxed(gr1_base + ARM_SMMU_GR1_CBFRSYNRA(idx));

        dev_err_ratelimited(smmu->dev,
                            "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
                            fsr, iova, fsynr, cbfrsynra, idx);

        writel_relaxed(fsr, cb_base + ARM_SMMU_CB_FSR);
        return IRQ_HANDLED;
}

static irqreturn_t nvidia_smmu_context_fault(int irq, void *dev)
{
        int idx;
        unsigned int inst;
        irqreturn_t ret = IRQ_NONE;
        struct arm_smmu_device *smmu;
        struct iommu_domain *domain = dev;
        struct arm_smmu_domain *smmu_domain;
        struct nvidia_smmu *nvidia;

        smmu_domain = container_of(domain, struct arm_smmu_domain, domain);
        smmu = smmu_domain->smmu;
        nvidia = to_nvidia_smmu(smmu);

        for (inst = 0; inst < nvidia->num_instances; inst++) {
                irqreturn_t irq_ret;

                /*
                 * Interrupt line is shared between all contexts.
                 * Check for faults across all contexts.
                 */
                for (idx = 0; idx < smmu->num_context_banks; idx++) {
                        irq_ret = nvidia_smmu_context_fault_bank(irq, smmu,
                                                                 idx, inst);
                        if (irq_ret == IRQ_HANDLED)
                                ret = IRQ_HANDLED;
                }
        }

        return ret;
}
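
/*
 * Once a device has been attached, ask the memory controller driver to
 * program the matching SID overrides for its memory clients (see the comment
 * at the top of this file).
 */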
static void nvidia_smmu_probe_finalize(struct arm_smmu_device *smmu, struct device *dev)
{
        struct nvidia_smmu *nvidia = to_nvidia_smmu(smmu);
        int err;

        err = tegra_mc_probe_device(nvidia->mc, dev);
        if (err < 0)
                dev_err(smmu->dev, "memory controller probe failed for %s: %d\n",
                        dev_name(dev), err);
}

static int nvidia_smmu_init_context(struct arm_smmu_domain *smmu_domain,
                                    struct io_pgtable_cfg *pgtbl_cfg,
                                    struct device *dev)
{
        struct arm_smmu_device *smmu = smmu_domain->smmu;
        const struct device_node *np = smmu->dev->of_node;

        /*
         * Tegra194 and Tegra234 SoCs have an erratum that causes walk cache
         * entries to not be invalidated correctly. The problem is that the
         * walk cache index generated for an IOVA is not the same across
         * translation and invalidation requests. This leads to page faults
         * when a PMD entry is released during unmap and populated with a new
         * PTE table during a subsequent map request. Disabling large page
         * mappings avoids the release of the PMD entry, so translations never
         * see a stale PMD entry in the walk cache.
         * Fix this by limiting the page mappings to PAGE_SIZE on Tegra194 and
         * Tegra234.
         */
        if (of_device_is_compatible(np, "nvidia,tegra234-smmu") ||
            of_device_is_compatible(np, "nvidia,tegra194-smmu")) {
                smmu->pgsize_bitmap = PAGE_SIZE;
                pgtbl_cfg->pgsize_bitmap = smmu->pgsize_bitmap;
        }

        return 0;
}
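
/*
 * Two implementation variants: the full set of hooks mirrors register writes
 * and fault handling across instances, while the single-instance variant only
 * needs the memory controller hook and the erratum workaround.
 */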
static const struct arm_smmu_impl nvidia_smmu_impl = {
        .read_reg = nvidia_smmu_read_reg,
        .write_reg = nvidia_smmu_write_reg,
        .read_reg64 = nvidia_smmu_read_reg64,
        .write_reg64 = nvidia_smmu_write_reg64,
        .reset = nvidia_smmu_reset,
        .tlb_sync = nvidia_smmu_tlb_sync,
        .global_fault = nvidia_smmu_global_fault,
        .context_fault = nvidia_smmu_context_fault,
        .probe_finalize = nvidia_smmu_probe_finalize,
        .init_context = nvidia_smmu_init_context,
};

static const struct arm_smmu_impl nvidia_smmu_single_impl = {
        .probe_finalize = nvidia_smmu_probe_finalize,
        .init_context = nvidia_smmu_init_context,
};
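
/*
 * Grow the generic arm_smmu_device into a nvidia_smmu, look up the memory
 * controller, map any additional MMU-500 instances described by the device,
 * and install the implementation that matches the number of instances found.
 */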
struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
{
        struct resource *res;
        struct device *dev = smmu->dev;
        struct nvidia_smmu *nvidia_smmu;
        struct platform_device *pdev = to_platform_device(dev);
        unsigned int i;

        nvidia_smmu = devm_krealloc(dev, smmu, sizeof(*nvidia_smmu), GFP_KERNEL);
        if (!nvidia_smmu)
                return ERR_PTR(-ENOMEM);

        nvidia_smmu->mc = devm_tegra_memory_controller_get(dev);
        if (IS_ERR(nvidia_smmu->mc))
                return ERR_CAST(nvidia_smmu->mc);

        /* Instance 0 is ioremapped by arm-smmu.c. */
        nvidia_smmu->bases[0] = smmu->base;
        nvidia_smmu->num_instances++;

        for (i = 1; i < MAX_SMMU_INSTANCES; i++) {
                res = platform_get_resource(pdev, IORESOURCE_MEM, i);
                if (!res)
                        break;

                nvidia_smmu->bases[i] = devm_ioremap_resource(dev, res);
                if (IS_ERR(nvidia_smmu->bases[i]))
                        return ERR_CAST(nvidia_smmu->bases[i]);

                nvidia_smmu->num_instances++;
        }

        if (nvidia_smmu->num_instances == 1)
                nvidia_smmu->smmu.impl = &nvidia_smmu_single_impl;
        else
                nvidia_smmu->smmu.impl = &nvidia_smmu_impl;

        return &nvidia_smmu->smmu;
}