err.c 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * This file implements the error recovery as a core part of PCIe error
  4. * reporting. When a PCIe error is delivered, an error message will be
  5. * collected and printed to console, then, an error recovery procedure
  6. * will be executed by following the PCI error recovery rules.
  7. *
  8. * Copyright (C) 2006 Intel Corp.
  9. * Tom Long Nguyen ([email protected])
  10. * Zhang Yanmin ([email protected])
  11. */
  12. #define dev_fmt(fmt) "AER: " fmt
  13. #include <linux/pci.h>
  14. #include <linux/module.h>
  15. #include <linux/kernel.h>
  16. #include <linux/errno.h>
  17. #include <linux/aer.h>
  18. #include "portdrv.h"
  19. #include "../pci.h"
  20. static pci_ers_result_t merge_result(enum pci_ers_result orig,
  21. enum pci_ers_result new)
  22. {
  23. if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
  24. return PCI_ERS_RESULT_NO_AER_DRIVER;
  25. if (new == PCI_ERS_RESULT_NONE)
  26. return orig;
  27. switch (orig) {
  28. case PCI_ERS_RESULT_CAN_RECOVER:
  29. case PCI_ERS_RESULT_RECOVERED:
  30. orig = new;
  31. break;
  32. case PCI_ERS_RESULT_DISCONNECT:
  33. if (new == PCI_ERS_RESULT_NEED_RESET)
  34. orig = PCI_ERS_RESULT_NEED_RESET;
  35. break;
  36. default:
  37. break;
  38. }
  39. return orig;
  40. }
  41. static int report_error_detected(struct pci_dev *dev,
  42. pci_channel_state_t state,
  43. enum pci_ers_result *result)
  44. {
  45. struct pci_driver *pdrv;
  46. pci_ers_result_t vote;
  47. const struct pci_error_handlers *err_handler;
  48. device_lock(&dev->dev);
  49. pdrv = dev->driver;
  50. if (pci_dev_is_disconnected(dev)) {
  51. vote = PCI_ERS_RESULT_DISCONNECT;
  52. } else if (!pci_dev_set_io_state(dev, state)) {
  53. pci_info(dev, "can't recover (state transition %u -> %u invalid)\n",
  54. dev->error_state, state);
  55. vote = PCI_ERS_RESULT_NONE;
  56. } else if (!pdrv || !pdrv->err_handler ||
  57. !pdrv->err_handler->error_detected) {
  58. /*
  59. * If any device in the subtree does not have an error_detected
  60. * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
  61. * error callbacks of "any" device in the subtree, and will
  62. * exit in the disconnected error state.
  63. */
  64. if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
  65. vote = PCI_ERS_RESULT_NO_AER_DRIVER;
  66. pci_info(dev, "can't recover (no error_detected callback)\n");
  67. } else {
  68. vote = PCI_ERS_RESULT_NONE;
  69. }
  70. } else {
  71. err_handler = pdrv->err_handler;
  72. vote = err_handler->error_detected(dev, state);
  73. }
  74. pci_uevent_ers(dev, vote);
  75. *result = merge_result(*result, vote);
  76. device_unlock(&dev->dev);
  77. return 0;
  78. }
  79. static int report_frozen_detected(struct pci_dev *dev, void *data)
  80. {
  81. return report_error_detected(dev, pci_channel_io_frozen, data);
  82. }
  83. static int report_normal_detected(struct pci_dev *dev, void *data)
  84. {
  85. return report_error_detected(dev, pci_channel_io_normal, data);
  86. }
  87. static int report_mmio_enabled(struct pci_dev *dev, void *data)
  88. {
  89. struct pci_driver *pdrv;
  90. pci_ers_result_t vote, *result = data;
  91. const struct pci_error_handlers *err_handler;
  92. device_lock(&dev->dev);
  93. pdrv = dev->driver;
  94. if (!pdrv ||
  95. !pdrv->err_handler ||
  96. !pdrv->err_handler->mmio_enabled)
  97. goto out;
  98. err_handler = pdrv->err_handler;
  99. vote = err_handler->mmio_enabled(dev);
  100. *result = merge_result(*result, vote);
  101. out:
  102. device_unlock(&dev->dev);
  103. return 0;
  104. }
  105. static int report_slot_reset(struct pci_dev *dev, void *data)
  106. {
  107. struct pci_driver *pdrv;
  108. pci_ers_result_t vote, *result = data;
  109. const struct pci_error_handlers *err_handler;
  110. device_lock(&dev->dev);
  111. pdrv = dev->driver;
  112. if (!pdrv ||
  113. !pdrv->err_handler ||
  114. !pdrv->err_handler->slot_reset)
  115. goto out;
  116. err_handler = pdrv->err_handler;
  117. vote = err_handler->slot_reset(dev);
  118. *result = merge_result(*result, vote);
  119. out:
  120. device_unlock(&dev->dev);
  121. return 0;
  122. }
  123. static int report_resume(struct pci_dev *dev, void *data)
  124. {
  125. struct pci_driver *pdrv;
  126. const struct pci_error_handlers *err_handler;
  127. device_lock(&dev->dev);
  128. pdrv = dev->driver;
  129. if (!pci_dev_set_io_state(dev, pci_channel_io_normal) ||
  130. !pdrv ||
  131. !pdrv->err_handler ||
  132. !pdrv->err_handler->resume)
  133. goto out;
  134. err_handler = pdrv->err_handler;
  135. err_handler->resume(dev);
  136. out:
  137. pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
  138. device_unlock(&dev->dev);
  139. return 0;
  140. }
  141. /**
  142. * pci_walk_bridge - walk bridges potentially AER affected
  143. * @bridge: bridge which may be a Port, an RCEC, or an RCiEP
  144. * @cb: callback to be called for each device found
  145. * @userdata: arbitrary pointer to be passed to callback
  146. *
  147. * If the device provided is a bridge, walk the subordinate bus, including
  148. * any bridged devices on buses under this bus. Call the provided callback
  149. * on each device found.
  150. *
  151. * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
  152. * call the callback on the device itself.
  153. */
  154. static void pci_walk_bridge(struct pci_dev *bridge,
  155. int (*cb)(struct pci_dev *, void *),
  156. void *userdata)
  157. {
  158. if (bridge->subordinate)
  159. pci_walk_bus(bridge->subordinate, cb, userdata);
  160. else
  161. cb(bridge, userdata);
  162. }
  163. pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
  164. pci_channel_state_t state,
  165. pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
  166. {
  167. int type = pci_pcie_type(dev);
  168. struct pci_dev *bridge;
  169. pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
  170. struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
  171. /*
  172. * If the error was detected by a Root Port, Downstream Port, RCEC,
  173. * or RCiEP, recovery runs on the device itself. For Ports, that
  174. * also includes any subordinate devices.
  175. *
  176. * If it was detected by another device (Endpoint, etc), recovery
  177. * runs on the device and anything else under the same Port, i.e.,
  178. * everything under "bridge".
  179. */
  180. if (type == PCI_EXP_TYPE_ROOT_PORT ||
  181. type == PCI_EXP_TYPE_DOWNSTREAM ||
  182. type == PCI_EXP_TYPE_RC_EC ||
  183. type == PCI_EXP_TYPE_RC_END)
  184. bridge = dev;
  185. else
  186. bridge = pci_upstream_bridge(dev);
  187. pci_dbg(bridge, "broadcast error_detected message\n");
  188. if (state == pci_channel_io_frozen) {
  189. pci_walk_bridge(bridge, report_frozen_detected, &status);
  190. if (reset_subordinates(bridge) != PCI_ERS_RESULT_RECOVERED) {
  191. pci_warn(bridge, "subordinate device reset failed\n");
  192. goto failed;
  193. }
  194. } else {
  195. pci_walk_bridge(bridge, report_normal_detected, &status);
  196. }
  197. if (status == PCI_ERS_RESULT_CAN_RECOVER) {
  198. status = PCI_ERS_RESULT_RECOVERED;
  199. pci_dbg(bridge, "broadcast mmio_enabled message\n");
  200. pci_walk_bridge(bridge, report_mmio_enabled, &status);
  201. }
  202. if (status == PCI_ERS_RESULT_NEED_RESET) {
  203. /*
  204. * TODO: Should call platform-specific
  205. * functions to reset slot before calling
  206. * drivers' slot_reset callbacks?
  207. */
  208. status = PCI_ERS_RESULT_RECOVERED;
  209. pci_dbg(bridge, "broadcast slot_reset message\n");
  210. pci_walk_bridge(bridge, report_slot_reset, &status);
  211. }
  212. if (status != PCI_ERS_RESULT_RECOVERED)
  213. goto failed;
  214. pci_dbg(bridge, "broadcast resume message\n");
  215. pci_walk_bridge(bridge, report_resume, &status);
  216. /*
  217. * If we have native control of AER, clear error status in the device
  218. * that detected the error. If the platform retained control of AER,
  219. * it is responsible for clearing this status. In that case, the
  220. * signaling device may not even be visible to the OS.
  221. */
  222. if (host->native_aer || pcie_ports_native) {
  223. pcie_clear_device_status(dev);
  224. pci_aer_clear_nonfatal_status(dev);
  225. }
  226. pci_info(bridge, "device recovery successful\n");
  227. return status;
  228. failed:
  229. pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT);
  230. /* TODO: Should kernel panic here? */
  231. pci_info(bridge, "device recovery failed\n");
  232. return status;
  233. }