nmi.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 1991, 1992 Linus Torvalds
  4. * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  5. * Copyright (C) 2011 Don Zickus Red Hat, Inc.
  6. *
  7. * Pentium III FXSR, SSE support
  8. * Gareth Hughes <[email protected]>, May 2000
  9. */
  10. /*
  11. * Handle hardware traps and faults.
  12. */
  13. #include <linux/spinlock.h>
  14. #include <linux/kprobes.h>
  15. #include <linux/kdebug.h>
  16. #include <linux/sched/debug.h>
  17. #include <linux/nmi.h>
  18. #include <linux/debugfs.h>
  19. #include <linux/delay.h>
  20. #include <linux/hardirq.h>
  21. #include <linux/ratelimit.h>
  22. #include <linux/slab.h>
  23. #include <linux/export.h>
  24. #include <linux/atomic.h>
  25. #include <linux/sched/clock.h>
  26. #include <asm/cpu_entry_area.h>
  27. #include <asm/traps.h>
  28. #include <asm/mach_traps.h>
  29. #include <asm/nmi.h>
  30. #include <asm/x86_init.h>
  31. #include <asm/reboot.h>
  32. #include <asm/cache.h>
  33. #include <asm/nospec-branch.h>
  34. #include <asm/sev.h>
  35. #define CREATE_TRACE_POINTS
  36. #include <trace/events/nmi.h>
  37. struct nmi_desc {
  38. raw_spinlock_t lock;
  39. struct list_head head;
  40. };
  41. static struct nmi_desc nmi_desc[NMI_MAX] =
  42. {
  43. {
  44. .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
  45. .head = LIST_HEAD_INIT(nmi_desc[0].head),
  46. },
  47. {
  48. .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
  49. .head = LIST_HEAD_INIT(nmi_desc[1].head),
  50. },
  51. {
  52. .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
  53. .head = LIST_HEAD_INIT(nmi_desc[2].head),
  54. },
  55. {
  56. .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
  57. .head = LIST_HEAD_INIT(nmi_desc[3].head),
  58. },
  59. };
  60. struct nmi_stats {
  61. unsigned int normal;
  62. unsigned int unknown;
  63. unsigned int external;
  64. unsigned int swallow;
  65. };
  66. static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
  67. static int ignore_nmis __read_mostly;
  68. int unknown_nmi_panic;
  69. /*
  70. * Prevent NMI reason port (0x61) being accessed simultaneously, can
  71. * only be used in NMI handler.
  72. */
  73. static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
  74. static int __init setup_unknown_nmi_panic(char *str)
  75. {
  76. unknown_nmi_panic = 1;
  77. return 1;
  78. }
  79. __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
  80. #define nmi_to_desc(type) (&nmi_desc[type])
  81. static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
  82. static int __init nmi_warning_debugfs(void)
  83. {
  84. debugfs_create_u64("nmi_longest_ns", 0644,
  85. arch_debugfs_dir, &nmi_longest_ns);
  86. return 0;
  87. }
  88. fs_initcall(nmi_warning_debugfs);
  89. static void nmi_check_duration(struct nmiaction *action, u64 duration)
  90. {
  91. int remainder_ns, decimal_msecs;
  92. if (duration < nmi_longest_ns || duration < action->max_duration)
  93. return;
  94. action->max_duration = duration;
  95. remainder_ns = do_div(duration, (1000 * 1000));
  96. decimal_msecs = remainder_ns / 1000;
  97. printk_ratelimited(KERN_INFO
  98. "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
  99. action->handler, duration, decimal_msecs);
  100. }
  101. static int nmi_handle(unsigned int type, struct pt_regs *regs)
  102. {
  103. struct nmi_desc *desc = nmi_to_desc(type);
  104. struct nmiaction *a;
  105. int handled=0;
  106. rcu_read_lock();
  107. /*
  108. * NMIs are edge-triggered, which means if you have enough
  109. * of them concurrently, you can lose some because only one
  110. * can be latched at any given time. Walk the whole list
  111. * to handle those situations.
  112. */
  113. list_for_each_entry_rcu(a, &desc->head, list) {
  114. int thishandled;
  115. u64 delta;
  116. delta = sched_clock();
  117. thishandled = a->handler(type, regs);
  118. handled += thishandled;
  119. delta = sched_clock() - delta;
  120. trace_nmi_handler(a->handler, (int)delta, thishandled);
  121. nmi_check_duration(a, delta);
  122. }
  123. rcu_read_unlock();
  124. /* return total number of NMI events handled */
  125. return handled;
  126. }
  127. NOKPROBE_SYMBOL(nmi_handle);
  128. int __register_nmi_handler(unsigned int type, struct nmiaction *action)
  129. {
  130. struct nmi_desc *desc = nmi_to_desc(type);
  131. unsigned long flags;
  132. if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
  133. return -EINVAL;
  134. raw_spin_lock_irqsave(&desc->lock, flags);
  135. /*
  136. * Indicate if there are multiple registrations on the
  137. * internal NMI handler call chains (SERR and IO_CHECK).
  138. */
  139. WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
  140. WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
  141. /*
  142. * some handlers need to be executed first otherwise a fake
  143. * event confuses some handlers (kdump uses this flag)
  144. */
  145. if (action->flags & NMI_FLAG_FIRST)
  146. list_add_rcu(&action->list, &desc->head);
  147. else
  148. list_add_tail_rcu(&action->list, &desc->head);
  149. raw_spin_unlock_irqrestore(&desc->lock, flags);
  150. return 0;
  151. }
  152. EXPORT_SYMBOL(__register_nmi_handler);
  153. void unregister_nmi_handler(unsigned int type, const char *name)
  154. {
  155. struct nmi_desc *desc = nmi_to_desc(type);
  156. struct nmiaction *n, *found = NULL;
  157. unsigned long flags;
  158. raw_spin_lock_irqsave(&desc->lock, flags);
  159. list_for_each_entry_rcu(n, &desc->head, list) {
  160. /*
  161. * the name passed in to describe the nmi handler
  162. * is used as the lookup key
  163. */
  164. if (!strcmp(n->name, name)) {
  165. WARN(in_nmi(),
  166. "Trying to free NMI (%s) from NMI context!\n", n->name);
  167. list_del_rcu(&n->list);
  168. found = n;
  169. break;
  170. }
  171. }
  172. raw_spin_unlock_irqrestore(&desc->lock, flags);
  173. if (found) {
  174. synchronize_rcu();
  175. INIT_LIST_HEAD(&found->list);
  176. }
  177. }
  178. EXPORT_SYMBOL_GPL(unregister_nmi_handler);
  179. static void
  180. pci_serr_error(unsigned char reason, struct pt_regs *regs)
  181. {
  182. /* check to see if anyone registered against these types of errors */
  183. if (nmi_handle(NMI_SERR, regs))
  184. return;
  185. pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
  186. reason, smp_processor_id());
  187. if (panic_on_unrecovered_nmi)
  188. nmi_panic(regs, "NMI: Not continuing");
  189. pr_emerg("Dazed and confused, but trying to continue\n");
  190. /* Clear and disable the PCI SERR error line. */
  191. reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
  192. outb(reason, NMI_REASON_PORT);
  193. }
  194. NOKPROBE_SYMBOL(pci_serr_error);
  195. static void
  196. io_check_error(unsigned char reason, struct pt_regs *regs)
  197. {
  198. unsigned long i;
  199. /* check to see if anyone registered against these types of errors */
  200. if (nmi_handle(NMI_IO_CHECK, regs))
  201. return;
  202. pr_emerg(
  203. "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
  204. reason, smp_processor_id());
  205. show_regs(regs);
  206. if (panic_on_io_nmi) {
  207. nmi_panic(regs, "NMI IOCK error: Not continuing");
  208. /*
  209. * If we end up here, it means we have received an NMI while
  210. * processing panic(). Simply return without delaying and
  211. * re-enabling NMIs.
  212. */
  213. return;
  214. }
  215. /* Re-enable the IOCK line, wait for a few seconds */
  216. reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
  217. outb(reason, NMI_REASON_PORT);
  218. i = 20000;
  219. while (--i) {
  220. touch_nmi_watchdog();
  221. udelay(100);
  222. }
  223. reason &= ~NMI_REASON_CLEAR_IOCHK;
  224. outb(reason, NMI_REASON_PORT);
  225. }
  226. NOKPROBE_SYMBOL(io_check_error);
  227. static void
  228. unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
  229. {
  230. int handled;
  231. /*
  232. * Use 'false' as back-to-back NMIs are dealt with one level up.
  233. * Of course this makes having multiple 'unknown' handlers useless
  234. * as only the first one is ever run (unless it can actually determine
  235. * if it caused the NMI)
  236. */
  237. handled = nmi_handle(NMI_UNKNOWN, regs);
  238. if (handled) {
  239. __this_cpu_add(nmi_stats.unknown, handled);
  240. return;
  241. }
  242. __this_cpu_add(nmi_stats.unknown, 1);
  243. pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
  244. reason, smp_processor_id());
  245. if (unknown_nmi_panic || panic_on_unrecovered_nmi)
  246. nmi_panic(regs, "NMI: Not continuing");
  247. pr_emerg("Dazed and confused, but trying to continue\n");
  248. }
  249. NOKPROBE_SYMBOL(unknown_nmi_error);
  250. static DEFINE_PER_CPU(bool, swallow_nmi);
  251. static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
  252. static noinstr void default_do_nmi(struct pt_regs *regs)
  253. {
  254. unsigned char reason = 0;
  255. int handled;
  256. bool b2b = false;
  257. /*
  258. * CPU-specific NMI must be processed before non-CPU-specific
  259. * NMI, otherwise we may lose it, because the CPU-specific
  260. * NMI can not be detected/processed on other CPUs.
  261. */
  262. /*
  263. * Back-to-back NMIs are interesting because they can either
  264. * be two NMI or more than two NMIs (any thing over two is dropped
  265. * due to NMI being edge-triggered). If this is the second half
  266. * of the back-to-back NMI, assume we dropped things and process
  267. * more handlers. Otherwise reset the 'swallow' NMI behaviour
  268. */
  269. if (regs->ip == __this_cpu_read(last_nmi_rip))
  270. b2b = true;
  271. else
  272. __this_cpu_write(swallow_nmi, false);
  273. __this_cpu_write(last_nmi_rip, regs->ip);
  274. instrumentation_begin();
  275. handled = nmi_handle(NMI_LOCAL, regs);
  276. __this_cpu_add(nmi_stats.normal, handled);
  277. if (handled) {
  278. /*
  279. * There are cases when a NMI handler handles multiple
  280. * events in the current NMI. One of these events may
  281. * be queued for in the next NMI. Because the event is
  282. * already handled, the next NMI will result in an unknown
  283. * NMI. Instead lets flag this for a potential NMI to
  284. * swallow.
  285. */
  286. if (handled > 1)
  287. __this_cpu_write(swallow_nmi, true);
  288. goto out;
  289. }
  290. /*
  291. * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
  292. *
  293. * Another CPU may be processing panic routines while holding
  294. * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
  295. * and if so, call its callback directly. If there is no CPU preparing
  296. * crash dump, we simply loop here.
  297. */
  298. while (!raw_spin_trylock(&nmi_reason_lock)) {
  299. run_crash_ipi_callback(regs);
  300. cpu_relax();
  301. }
  302. reason = x86_platform.get_nmi_reason();
  303. if (reason & NMI_REASON_MASK) {
  304. if (reason & NMI_REASON_SERR)
  305. pci_serr_error(reason, regs);
  306. else if (reason & NMI_REASON_IOCHK)
  307. io_check_error(reason, regs);
  308. #ifdef CONFIG_X86_32
  309. /*
  310. * Reassert NMI in case it became active
  311. * meanwhile as it's edge-triggered:
  312. */
  313. reassert_nmi();
  314. #endif
  315. __this_cpu_add(nmi_stats.external, 1);
  316. raw_spin_unlock(&nmi_reason_lock);
  317. goto out;
  318. }
  319. raw_spin_unlock(&nmi_reason_lock);
  320. /*
  321. * Only one NMI can be latched at a time. To handle
  322. * this we may process multiple nmi handlers at once to
  323. * cover the case where an NMI is dropped. The downside
  324. * to this approach is we may process an NMI prematurely,
  325. * while its real NMI is sitting latched. This will cause
  326. * an unknown NMI on the next run of the NMI processing.
  327. *
  328. * We tried to flag that condition above, by setting the
  329. * swallow_nmi flag when we process more than one event.
  330. * This condition is also only present on the second half
  331. * of a back-to-back NMI, so we flag that condition too.
  332. *
  333. * If both are true, we assume we already processed this
  334. * NMI previously and we swallow it. Otherwise we reset
  335. * the logic.
  336. *
  337. * There are scenarios where we may accidentally swallow
  338. * a 'real' unknown NMI. For example, while processing
  339. * a perf NMI another perf NMI comes in along with a
  340. * 'real' unknown NMI. These two NMIs get combined into
  341. * one (as described above). When the next NMI gets
  342. * processed, it will be flagged by perf as handled, but
  343. * no one will know that there was a 'real' unknown NMI sent
  344. * also. As a result it gets swallowed. Or if the first
  345. * perf NMI returns two events handled then the second
  346. * NMI will get eaten by the logic below, again losing a
  347. * 'real' unknown NMI. But this is the best we can do
  348. * for now.
  349. */
  350. if (b2b && __this_cpu_read(swallow_nmi))
  351. __this_cpu_add(nmi_stats.swallow, 1);
  352. else
  353. unknown_nmi_error(reason, regs);
  354. out:
  355. instrumentation_end();
  356. }
  357. /*
  358. * NMIs can page fault or hit breakpoints which will cause it to lose
  359. * its NMI context with the CPU when the breakpoint or page fault does an IRET.
  360. *
  361. * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
  362. * NMI processing. On x86_64, the asm glue protects us from nested NMIs
  363. * if the outer NMI came from kernel mode, but we can still nest if the
  364. * outer NMI came from user mode.
  365. *
  366. * To handle these nested NMIs, we have three states:
  367. *
  368. * 1) not running
  369. * 2) executing
  370. * 3) latched
  371. *
  372. * When no NMI is in progress, it is in the "not running" state.
  373. * When an NMI comes in, it goes into the "executing" state.
  374. * Normally, if another NMI is triggered, it does not interrupt
  375. * the running NMI and the HW will simply latch it so that when
  376. * the first NMI finishes, it will restart the second NMI.
  377. * (Note, the latch is binary, thus multiple NMIs triggering,
  378. * when one is running, are ignored. Only one NMI is restarted.)
  379. *
  380. * If an NMI executes an iret, another NMI can preempt it. We do not
  381. * want to allow this new NMI to run, but we want to execute it when the
  382. * first one finishes. We set the state to "latched", and the exit of
  383. * the first NMI will perform a dec_return, if the result is zero
  384. * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
  385. * dec_return would have set the state to NMI_EXECUTING (what we want it
  386. * to be when we are running). In this case, we simply jump back to
  387. * rerun the NMI handler again, and restart the 'latched' NMI.
  388. *
  389. * No trap (breakpoint or page fault) should be hit before nmi_restart,
  390. * thus there is no race between the first check of state for NOT_RUNNING
  391. * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
  392. * at this point.
  393. *
  394. * In case the NMI takes a page fault, we need to save off the CR2
  395. * because the NMI could have preempted another page fault and corrupt
  396. * the CR2 that is about to be read. As nested NMIs must be restarted
  397. * and they can not take breakpoints or page faults, the update of the
  398. * CR2 must be done before converting the nmi state back to NOT_RUNNING.
  399. * Otherwise, there would be a race of another nested NMI coming in
  400. * after setting state to NOT_RUNNING but before updating the nmi_cr2.
  401. */
  402. enum nmi_states {
  403. NMI_NOT_RUNNING = 0,
  404. NMI_EXECUTING,
  405. NMI_LATCHED,
  406. };
  407. static DEFINE_PER_CPU(enum nmi_states, nmi_state);
  408. static DEFINE_PER_CPU(unsigned long, nmi_cr2);
  409. static DEFINE_PER_CPU(unsigned long, nmi_dr7);
  410. DEFINE_IDTENTRY_RAW(exc_nmi)
  411. {
  412. irqentry_state_t irq_state;
  413. /*
  414. * Re-enable NMIs right here when running as an SEV-ES guest. This might
  415. * cause nested NMIs, but those can be handled safely.
  416. */
  417. sev_es_nmi_complete();
  418. if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
  419. return;
  420. if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
  421. this_cpu_write(nmi_state, NMI_LATCHED);
  422. return;
  423. }
  424. this_cpu_write(nmi_state, NMI_EXECUTING);
  425. this_cpu_write(nmi_cr2, read_cr2());
  426. nmi_restart:
  427. /*
  428. * Needs to happen before DR7 is accessed, because the hypervisor can
  429. * intercept DR7 reads/writes, turning those into #VC exceptions.
  430. */
  431. sev_es_ist_enter(regs);
  432. this_cpu_write(nmi_dr7, local_db_save());
  433. irq_state = irqentry_nmi_enter(regs);
  434. inc_irq_stat(__nmi_count);
  435. if (!ignore_nmis)
  436. default_do_nmi(regs);
  437. irqentry_nmi_exit(regs, irq_state);
  438. local_db_restore(this_cpu_read(nmi_dr7));
  439. sev_es_ist_exit();
  440. if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
  441. write_cr2(this_cpu_read(nmi_cr2));
  442. if (this_cpu_dec_return(nmi_state))
  443. goto nmi_restart;
  444. if (user_mode(regs))
  445. mds_user_clear_cpu_buffers();
  446. }
  447. #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
  448. DEFINE_IDTENTRY_RAW(exc_nmi_noist)
  449. {
  450. exc_nmi(regs);
  451. }
  452. #endif
  453. #if IS_MODULE(CONFIG_KVM_INTEL)
  454. EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
  455. #endif
  456. void stop_nmi(void)
  457. {
  458. ignore_nmis++;
  459. }
  460. void restart_nmi(void)
  461. {
  462. ignore_nmis--;
  463. }
  464. /* reset the back-to-back NMI logic */
  465. void local_touch_nmi(void)
  466. {
  467. __this_cpu_write(last_nmi_rip, 0);
  468. }
  469. EXPORT_SYMBOL_GPL(local_touch_nmi);