/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "aldebaran.h"
#include "amdgpu_reset.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_dpm.h"
#include "amdgpu_job.h"
#include "amdgpu_ring.h"
#include "amdgpu_ras.h"
#include "amdgpu_psp.h"
#include "amdgpu_xgmi.h"

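/*
 * Mode2 is the default reset method when the MP1 (SMU) block is IP version
 * 13.0.2 (Aldebaran) and the GPU is connected to the CPU over XGMI.
 */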
static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

        if ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
             adev->gmc.xgmi.connected_to_cpu))
                return true;

        return false;
}

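/*
 * Pick the reset handler for this request: use the explicitly requested
 * method if a matching handler is registered, otherwise fall back to mode2
 * when it is the default for this configuration.
 */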
static struct amdgpu_reset_handler *
aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
                            struct amdgpu_reset_context *reset_context)
{
        struct amdgpu_reset_handler *handler;
        struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

        if (reset_context->method != AMD_RESET_METHOD_NONE) {
                dev_dbg(adev->dev, "Getting reset handler for method %d\n",
                        reset_context->method);
                list_for_each_entry(handler, &reset_ctl->reset_handlers,
                                    handler_list) {
                        if (handler->reset_method == reset_context->method)
                                return handler;
                }
        }

        if (aldebaran_is_mode2_default(reset_ctl)) {
                list_for_each_entry(handler, &reset_ctl->reset_handlers,
                                    handler_list) {
                        if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
                                reset_context->method = AMD_RESET_METHOD_MODE2;
                                return handler;
                        }
                }
        }

        dev_dbg(adev->dev, "Reset handler not found!\n");

        return NULL;
}

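/*
 * Ungate clock/power gating and suspend only the GFX and SDMA IP blocks,
 * in reverse init order; the remaining blocks are left untouched by the
 * mode2 reset.
 */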
static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
{
        int r, i;

        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!(adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_GFX ||
                      adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_SDMA))
                        continue;

                r = adev->ip_blocks[i].version->funcs->suspend(adev);

                if (r) {
                        dev_err(adev->dev,
                                "suspend of IP block <%s> failed %d\n",
                                adev->ip_blocks[i].version->funcs->name, r);
                        return r;
                }

                adev->ip_blocks[i].status.hw = false;
        }

        /* All GFX/SDMA blocks suspended successfully; r is only valid inside
         * the loop, so return 0 explicitly.
         */
        return 0;
}

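/*
 * Prepare the hardware context for mode2 reset. On bare metal this means
 * suspending the GFX/SDMA IP blocks; the suspend is skipped for SR-IOV VFs.
 */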
static int
aldebaran_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
                                  struct amdgpu_reset_context *reset_context)
{
        int r = 0;
        struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

        dev_dbg(adev->dev, "Aldebaran prepare hw context\n");
        /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
        if (!amdgpu_sriov_vf(adev))
                r = aldebaran_mode2_suspend_ip(adev);

        return r;
}

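/*
 * Work item callback: invoke the do_reset hook of the handler matching the
 * currently active reset method.
 */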
static void aldebaran_async_reset(struct work_struct *work)
{
        struct amdgpu_reset_handler *handler;
        struct amdgpu_reset_control *reset_ctl =
                container_of(work, struct amdgpu_reset_control, reset_work);
        struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;

        list_for_each_entry(handler, &reset_ctl->reset_handlers,
                            handler_list) {
                if (handler->reset_method == reset_ctl->active_reset) {
                        dev_dbg(adev->dev, "Resetting device\n");
                        handler->do_reset(adev);
                        break;
                }
        }
}

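/* Disable bus mastering and ask the SMU to execute the mode2 reset. */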
static int aldebaran_mode2_reset(struct amdgpu_device *adev)
{
        /* disable BM */
        pci_clear_master(adev->pdev);
        adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);

        return adev->asic_reset_res;
}

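/*
 * Reset every device in the reset list. For XGMI hives the per-device reset
 * work is queued on system_unbound_wq so the nodes reset in parallel; a
 * single device is reset synchronously.
 */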
static int
aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
                              struct amdgpu_reset_context *reset_context)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
        struct list_head *reset_device_list = reset_context->reset_device_list;
        struct amdgpu_device *tmp_adev = NULL;
        int r = 0;

        dev_dbg(adev->dev, "aldebaran perform hw reset\n");

        if (reset_device_list == NULL)
                return -EINVAL;

        if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
            reset_context->hive == NULL) {
                /* Wrong context, return error */
                return -EINVAL;
        }

        list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
                mutex_lock(&tmp_adev->reset_cntl->reset_lock);
                tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
        }
        /*
         * Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
         * them together so that they can be completed asynchronously on multiple nodes
         */
        list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
                /* For XGMI run all resets in parallel to speed up the process */
                if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
                        if (!queue_work(system_unbound_wq,
                                        &tmp_adev->reset_cntl->reset_work))
                                r = -EALREADY;
                } else
                        r = aldebaran_mode2_reset(tmp_adev);

                if (r) {
                        dev_err(tmp_adev->dev,
                                "ASIC reset failed with error, %d for drm dev, %s",
                                r, adev_to_drm(tmp_adev)->unique);
                        break;
                }
        }

        /* For XGMI wait for all resets to complete before proceed */
        if (!r) {
                list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
                                flush_work(&tmp_adev->reset_cntl->reset_work);
                                r = tmp_adev->asic_reset_res;
                                if (r)
                                        break;
                        }
                }
        }

        list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
                mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
                tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
        }

        return r;
}

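/*
 * Bring the hardware back after a mode2 reset: reload GFX/SDMA/RLC microcode
 * through PSP, re-enable the GFXHUB, wait for the SMU reset-complete event,
 * then resume and late-init the affected IP blocks.
 */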
static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
{
        struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
        struct amdgpu_firmware_info *ucode;
        struct amdgpu_ip_block *cmn_block;
        int ucode_count = 0;
        int i, r;

        dev_dbg(adev->dev, "Reloading ucodes after reset\n");
        for (i = 0; i < adev->firmware.max_ucodes; i++) {
                ucode = &adev->firmware.ucode[i];
                if (!ucode->fw)
                        continue;
                switch (ucode->ucode_id) {
                case AMDGPU_UCODE_ID_SDMA0:
                case AMDGPU_UCODE_ID_SDMA1:
                case AMDGPU_UCODE_ID_SDMA2:
                case AMDGPU_UCODE_ID_SDMA3:
                case AMDGPU_UCODE_ID_SDMA4:
                case AMDGPU_UCODE_ID_SDMA5:
                case AMDGPU_UCODE_ID_SDMA6:
                case AMDGPU_UCODE_ID_SDMA7:
                case AMDGPU_UCODE_ID_CP_MEC1:
                case AMDGPU_UCODE_ID_CP_MEC1_JT:
                case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL:
                case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM:
                case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM:
                case AMDGPU_UCODE_ID_RLC_G:
                        ucode_list[ucode_count++] = ucode;
                        break;
                default:
                        break;
                }
        }

        /* Reinit NBIF block */
        cmn_block =
                amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_COMMON);
        if (unlikely(!cmn_block)) {
                dev_err(adev->dev, "Failed to get BIF handle\n");
                return -EINVAL;
        }
        r = cmn_block->version->funcs->resume(adev);
        if (r)
                return r;

        /* Reinit GFXHUB */
        adev->gfxhub.funcs->init(adev);
        r = adev->gfxhub.funcs->gart_enable(adev);
        if (r) {
                dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");
                return r;
        }

        /* Reload GFX firmware */
        r = psp_load_fw_list(&adev->psp, ucode_list, ucode_count);
        if (r) {
                dev_err(adev->dev, "GFX ucode load failed after reset\n");
                return r;
        }

        /* Resume RLC, FW needs RLC alive to complete reset process */
        adev->gfx.rlc.funcs->resume(adev);

        /* Wait for FW reset event complete */
        r = amdgpu_dpm_wait_for_event(adev, SMU_EVENT_RESET_COMPLETE, 0);
        if (r) {
                dev_err(adev->dev,
                        "Failed to get response from firmware after reset\n");
                return r;
        }

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!(adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_GFX ||
                      adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_SDMA))
                        continue;

                r = adev->ip_blocks[i].version->funcs->resume(adev);
                if (r) {
                        dev_err(adev->dev,
                                "resume of IP block <%s> failed %d\n",
                                adev->ip_blocks[i].version->funcs->name, r);
                        return r;
                }

                adev->ip_blocks[i].status.hw = true;
        }

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!(adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_GFX ||
                      adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_SDMA ||
                      adev->ip_blocks[i].version->type ==
                              AMD_IP_BLOCK_TYPE_COMMON))
                        continue;

                if (adev->ip_blocks[i].version->funcs->late_init) {
                        r = adev->ip_blocks[i].version->funcs->late_init(
                                (void *)adev);
                        if (r) {
                                dev_err(adev->dev,
                                        "late_init of IP block <%s> failed %d after reset\n",
                                        adev->ip_blocks[i].version->funcs->name,
                                        r);
                                return r;
                        }
                }

                adev->ip_blocks[i].status.late_initialized = true;
        }

        amdgpu_ras_set_error_query_ready(adev, true);

        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

        return r;
}

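/*
 * Restore the hardware context of every device in the reset list after a
 * successful mode2 reset: restore IP state, re-register the GPU instance,
 * resume RAS, refresh the XGMI topology and run IB ring tests.
 */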
static int
aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
                                  struct amdgpu_reset_context *reset_context)
{
        struct list_head *reset_device_list = reset_context->reset_device_list;
        struct amdgpu_device *tmp_adev = NULL;
        int r = 0;

        if (reset_device_list == NULL)
                return -EINVAL;

        if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
                    IP_VERSION(13, 0, 2) &&
            reset_context->hive == NULL) {
                /* Wrong context, return error */
                return -EINVAL;
        }

        list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
                dev_info(tmp_adev->dev,
                         "GPU reset succeeded, trying to resume\n");
                r = aldebaran_mode2_restore_ip(tmp_adev);
                if (r)
                        goto end;

                /*
                 * Add this ASIC as tracked as reset was already
                 * complete successfully.
                 */
                amdgpu_register_gpu_instance(tmp_adev);

                /* Resume RAS */
                amdgpu_ras_resume(tmp_adev);

                /* Update PSP FW topology after reset */
                if (reset_context->hive &&
                    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
                        r = amdgpu_xgmi_update_topology(reset_context->hive,
                                                        tmp_adev);

                if (!r) {
                        amdgpu_irq_gpu_reset_resume_helper(tmp_adev);

                        r = amdgpu_ib_ring_tests(tmp_adev);
                        if (r) {
                                dev_err(tmp_adev->dev,
                                        "ib ring test failed (%d).\n", r);
                                r = -EAGAIN;
                                tmp_adev->asic_reset_res = r;
                                goto end;
                        }
                }
        }

end:
        return r;
}

static struct amdgpu_reset_handler aldebaran_mode2_handler = {
        .reset_method           = AMD_RESET_METHOD_MODE2,
        .prepare_env            = NULL,
        .prepare_hwcontext      = aldebaran_mode2_prepare_hwcontext,
        .perform_reset          = aldebaran_mode2_perform_reset,
        .restore_hwcontext      = aldebaran_mode2_restore_hwcontext,
        .restore_env            = NULL,
        .do_reset               = aldebaran_mode2_reset,
};

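/*
 * Create the per-device reset control used by the mode2 reset path and
 * register the mode2 handler with it.
 */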
int aldebaran_reset_init(struct amdgpu_device *adev)
{
        struct amdgpu_reset_control *reset_ctl;

        reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);
        if (!reset_ctl)
                return -ENOMEM;

        reset_ctl->handle = adev;
        reset_ctl->async_reset = aldebaran_async_reset;
        reset_ctl->active_reset = AMD_RESET_METHOD_NONE;
        reset_ctl->get_reset_handler = aldebaran_get_reset_handler;

        INIT_LIST_HEAD(&reset_ctl->reset_handlers);
        INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);
        /* Only mode2 is handled through reset control now */
        amdgpu_reset_add_handler(reset_ctl, &aldebaran_mode2_handler);

        adev->reset_cntl = reset_ctl;

        return 0;
}

int aldebaran_reset_fini(struct amdgpu_device *adev)
{
        kfree(adev->reset_cntl);
        adev->reset_cntl = NULL;

        return 0;
}