common.c

// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
	arch_enter_from_user_mode(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(__ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}
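
/*
 * Emit the audit record for syscall entry. Only the syscall number and the
 * first four arguments are recorded; audit_context() gates the argument
 * fetch so the common case stays cheap.
 */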
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}
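
/*
 * Slow path for syscall entry work: syscall user dispatch, ptrace, seccomp,
 * tracepoints and audit. Returns the (possibly changed) syscall number, or
 * -1 when the syscall should be skipped.
 */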
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}
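
/*
 * Fast path wrapper: only drop into syscall_trace_enter() when any
 * SYSCALL_WORK_ENTER bit is set in thread_info::syscall_work.
 */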
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	__enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}
/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare();
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
	__exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
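
/*
 * Process pending TIF work (reschedule, uprobes, live patching, signals,
 * notify-resume) with interrupts enabled, then recheck with interrupts
 * disabled until no work bits remain.
 */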
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}
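
/*
 * Last preparation step before returning to user space. Runs with
 * interrupts disabled and leaves them disabled.
 */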
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work;

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	ti_work = read_thread_flags();
	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}
/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). The syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}
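
/*
 * One-time syscall exit work: audit, the sys_exit tracepoint and the
 * ptrace exit/single-step report.
 */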
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}
/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	__exit_to_user_mode();
}
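
/*
 * Establish the correct lockdep, tracing and RCU/context-tracking state
 * for an interrupt or exception entry. The returned state tells
 * irqentry_exit() whether ct_irq_exit() is required.
 */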
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task invoke ct_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking ct_irq_enter(). If that nested interrupt is
	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke ct_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		ct_irq_enter();
		instrumentation_begin();
		kmsan_unpoison_entry_regs(regs);
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}
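
/* Reschedule on interrupt exit if preemption is allowed and needed. */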
void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}

#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif
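
/*
 * Counterpart of irqentry_enter(): restore lockdep, tracing and RCU state
 * depending on whether the interrupt hit user mode, kernel mode with
 * interrupts enabled, or kernel mode with interrupts disabled.
 */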
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			ct_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			ct_irq_exit();
	}
}
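
/*
 * NMI entry: NMIs can hit any context, so the current lockdep hardirq state
 * is saved in the returned state and restored by irqentry_nmi_exit().
 */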
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	ct_nmi_enter();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}
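
/* Undo irqentry_nmi_enter() and restore the saved lockdep state. */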
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	ct_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}