  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. *
  4. * Copyright (c) 2005 Linas Vepstas <[email protected]>
  5. */
  6. #include <linux/delay.h>
  7. #include <linux/list.h>
  8. #include <linux/sched.h>
  9. #include <linux/semaphore.h>
  10. #include <linux/pci.h>
  11. #include <linux/slab.h>
  12. #include <linux/kthread.h>
  13. #include <asm/eeh_event.h>
  14. #include <asm/ppc-pci.h>
  15. /** Overview:
  16. * EEH error states may be detected within exception handlers;
  17. * however, the recovery processing needs to occur asynchronously
  18. * in a normal kernel context and not an interrupt context.
  19. * This pair of routines creates an event and queues it onto a
  20. * work-queue, where a worker thread can drive recovery.
  21. */
/* Protects eeh_eventlist against concurrent producers and the consumer */
static DEFINE_SPINLOCK(eeh_eventlist_lock);
/* Signalled once per queued event; the event handler thread waits on it */
static DECLARE_COMPLETION(eeh_eventlist_event);
/* Pending struct eeh_event entries awaiting processing */
static LIST_HEAD(eeh_eventlist);
  25. /**
  26. * eeh_event_handler - Dispatch EEH events.
  27. * @dummy - unused
  28. *
  29. * The detection of a frozen slot can occur inside an interrupt,
  30. * where it can be hard to do anything about it. The goal of this
  31. * routine is to pull these detection events out of the context
  32. * of the interrupt handler, and re-dispatch them for processing
  33. * at a later time in a normal context.
  34. */
  35. static int eeh_event_handler(void * dummy)
  36. {
  37. unsigned long flags;
  38. struct eeh_event *event;
  39. while (!kthread_should_stop()) {
  40. if (wait_for_completion_interruptible(&eeh_eventlist_event))
  41. break;
  42. /* Fetch EEH event from the queue */
  43. spin_lock_irqsave(&eeh_eventlist_lock, flags);
  44. event = NULL;
  45. if (!list_empty(&eeh_eventlist)) {
  46. event = list_entry(eeh_eventlist.next,
  47. struct eeh_event, list);
  48. list_del(&event->list);
  49. }
  50. spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
  51. if (!event)
  52. continue;
  53. /* We might have event without binding PE */
  54. if (event->pe)
  55. eeh_handle_normal_event(event->pe);
  56. else
  57. eeh_handle_special_event();
  58. kfree(event);
  59. }
  60. return 0;
  61. }
  62. /**
  63. * eeh_event_init - Start kernel thread to handle EEH events
  64. *
  65. * This routine is called to start the kernel thread for processing
  66. * EEH event.
  67. */
  68. int eeh_event_init(void)
  69. {
  70. struct task_struct *t;
  71. int ret = 0;
  72. t = kthread_run(eeh_event_handler, NULL, "eehd");
  73. if (IS_ERR(t)) {
  74. ret = PTR_ERR(t);
  75. pr_err("%s: Failed to start EEH daemon (%d)\n",
  76. __func__, ret);
  77. return ret;
  78. }
  79. return 0;
  80. }
/**
 * __eeh_send_failure_event - Generate a PCI error event
 * @pe: EEH PE, or NULL to queue an event with no PE binding
 *
 * This routine can be called within an interrupt context;
 * the actual event will be delivered in a normal context
 * (from the eehd kernel thread started by eeh_event_init()).
 *
 * Returns 0 on success, -ENOMEM if the event could not be allocated.
 */
int __eeh_send_failure_event(struct eeh_pe *pe)
{
	unsigned long flags;
	struct eeh_event *event;

	/* GFP_ATOMIC: this path may run in interrupt context */
	event = kzalloc(sizeof(*event), GFP_ATOMIC);
	if (!event) {
		pr_err("EEH: out of memory, event not handled\n");
		return -ENOMEM;
	}
	event->pe = pe;

	/*
	 * Mark the PE as recovering before inserting it in the queue.
	 * This prevents the PE from being free()ed by a hotplug driver
	 * while the PE is sitting in the event queue.
	 */
	if (pe) {
#ifdef CONFIG_STACKTRACE
		/*
		 * Save the current stack trace so we can dump it from the
		 * event handler thread.
		 */
		pe->trace_entries = stack_trace_save(pe->stack_trace,
					 ARRAY_SIZE(pe->stack_trace), 0);
#endif /* CONFIG_STACKTRACE */

		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
	}

	/* We may or may not be called in an interrupt context */
	spin_lock_irqsave(&eeh_eventlist_lock, flags);
	list_add(&event->list, &eeh_eventlist);
	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

	/* Wake the EEH daemon so it can process the queued event */
	complete(&eeh_eventlist_event);

	return 0;
}
  123. int eeh_send_failure_event(struct eeh_pe *pe)
  124. {
  125. /*
  126. * If we've manually suppressed recovery events via debugfs
  127. * then just drop it on the floor.
  128. */
  129. if (eeh_debugfs_no_recover) {
  130. pr_err("EEH: Event dropped due to no_recover setting\n");
  131. return 0;
  132. }
  133. return __eeh_send_failure_event(pe);
  134. }
  135. /**
  136. * eeh_remove_event - Remove EEH event from the queue
  137. * @pe: Event binding to the PE
  138. * @force: Event will be removed unconditionally
  139. *
  140. * On PowerNV platform, we might have subsequent coming events
  141. * is part of the former one. For that case, those subsequent
  142. * coming events are totally duplicated and unnecessary, thus
  143. * they should be removed.
  144. */
  145. void eeh_remove_event(struct eeh_pe *pe, bool force)
  146. {
  147. unsigned long flags;
  148. struct eeh_event *event, *tmp;
  149. /*
  150. * If we have NULL PE passed in, we have dead IOC
  151. * or we're sure we can report all existing errors
  152. * by the caller.
  153. *
  154. * With "force", the event with associated PE that
  155. * have been isolated, the event won't be removed
  156. * to avoid event lost.
  157. */
  158. spin_lock_irqsave(&eeh_eventlist_lock, flags);
  159. list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
  160. if (!force && event->pe &&
  161. (event->pe->state & EEH_PE_ISOLATED))
  162. continue;
  163. if (!pe) {
  164. list_del(&event->list);
  165. kfree(event);
  166. } else if (pe->type & EEH_PE_PHB) {
  167. if (event->pe && event->pe->phb == pe->phb) {
  168. list_del(&event->list);
  169. kfree(event);
  170. }
  171. } else if (event->pe == pe) {
  172. list_del(&event->list);
  173. kfree(event);
  174. }
  175. }
  176. spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
  177. }