common.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * common.c - C code for kernel entry and exit
  4. * Copyright (c) 2015 Andrew Lutomirski
  5. *
  6. * Based on asm and ptrace code by many authors. The code here originated
  7. * in ptrace.c and signal.c.
  8. */
  9. #include <linux/kernel.h>
  10. #include <linux/sched.h>
  11. #include <linux/sched/task_stack.h>
  12. #include <linux/entry-common.h>
  13. #include <linux/mm.h>
  14. #include <linux/smp.h>
  15. #include <linux/errno.h>
  16. #include <linux/ptrace.h>
  17. #include <linux/export.h>
  18. #include <linux/nospec.h>
  19. #include <linux/syscalls.h>
  20. #include <linux/uaccess.h>
  21. #ifdef CONFIG_XEN_PV
  22. #include <xen/xen-ops.h>
  23. #include <xen/events.h>
  24. #endif
  25. #include <asm/apic.h>
  26. #include <asm/desc.h>
  27. #include <asm/traps.h>
  28. #include <asm/vdso.h>
  29. #include <asm/cpufeature.h>
  30. #include <asm/fpu/api.h>
  31. #include <asm/nospec-branch.h>
  32. #include <asm/io_bitmap.h>
  33. #include <asm/syscall.h>
  34. #include <asm/irq_stack.h>
  35. #ifdef CONFIG_X86_64
  36. static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
  37. {
  38. /*
  39. * Convert negative numbers to very high and thus out of range
  40. * numbers for comparisons.
  41. */
  42. unsigned int unr = nr;
  43. if (likely(unr < NR_syscalls)) {
  44. unr = array_index_nospec(unr, NR_syscalls);
  45. regs->ax = sys_call_table[unr](regs);
  46. return true;
  47. }
  48. return false;
  49. }
  50. static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
  51. {
  52. /*
  53. * Adjust the starting offset of the table, and convert numbers
  54. * < __X32_SYSCALL_BIT to very high and thus out of range
  55. * numbers for comparisons.
  56. */
  57. unsigned int xnr = nr - __X32_SYSCALL_BIT;
  58. if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
  59. xnr = array_index_nospec(xnr, X32_NR_syscalls);
  60. regs->ax = x32_sys_call_table[xnr](regs);
  61. return true;
  62. }
  63. return false;
  64. }
/*
 * C entry point for native 64-bit system calls, with the user register
 * state in @regs and the syscall number in @nr.
 *
 * noinstr: instrumentable code is only legal between
 * instrumentation_begin() and instrumentation_end(), i.e. after
 * syscall_enter_from_user_mode() has established kernel context.
 */
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	/*
	 * Entry work may rewrite the syscall number (returned value);
	 * -1 presumably means "skip the syscall" — see the nr != -1
	 * check below.
	 */
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	/* Try the native table first, then the x32 table. */
	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
  77. #endif
  78. #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Common prologue for all 32-bit syscall entry paths: flag the task as
 * executing a compat syscall (on kernels with IA32 emulation) and return
 * the syscall number from orig_ax, truncated to a signed 32-bit value.
 */
static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (int)regs->orig_ax;
}
#ifdef CONFIG_IA32_EMULATION
/*
 * Whether the IA32 (32-bit compat) entry points are enabled. Read-only
 * after init; presumably cleared during early boot when 32-bit emulation
 * is disabled — the writer is not visible in this file, confirm against
 * the boot code.
 */
bool __ia32_enabled __ro_after_init = true;
#endif
  88. /*
  89. * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
  90. */
  91. static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
  92. {
  93. /*
  94. * Convert negative numbers to very high and thus out of range
  95. * numbers for comparisons.
  96. */
  97. unsigned int unr = nr;
  98. if (likely(unr < IA32_NR_syscalls)) {
  99. unr = array_index_nospec(unr, IA32_NR_syscalls);
  100. regs->ax = ia32_sys_call_table[unr](regs);
  101. } else if (nr != -1) {
  102. regs->ax = __ia32_sys_ni_syscall(regs);
  103. }
  104. }
  105. #ifdef CONFIG_IA32_EMULATION
/*
 * Distinguish a software INT $0x80 from a hardware/VMM-injected external
 * interrupt on vector 0x80 by inspecting the local APIC's In-Service
 * Register.
 */
static __always_inline bool int80_is_external(void)
{
	/*
	 * The APIC ISR is a bit array spread over 32-bit registers placed
	 * 0x10 bytes apart; compute the register offset and bit mask
	 * covering vector 0x80.
	 */
	const unsigned int offs = (0x80 / 32) * 0x10;
	const u32 bit = BIT(0x80 % 32);

	/* The local APIC on XENPV guests is fake */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/*
	 * If vector 0x80 is set in the APIC ISR then this is an external
	 * interrupt. Either from broken hardware or injected by a VMM.
	 *
	 * Note: In guest mode this is only valid for secure guests where
	 * the secure module fully controls the vAPIC exposed to the guest.
	 */
	return apic_read(APIC_ISR + offs) & bit;
}
/**
 * int80_emulation - 32-bit legacy syscall entry
 *
 * This entry point can be used by 32-bit and 64-bit programs to perform
 * 32-bit system calls. Instances of INT $0x80 can be found inline in
 * various programs and libraries. It is also used by the vDSO's
 * __kernel_vsyscall fallback for hardware that doesn't support a faster
 * entry method. Restarted 32-bit system calls also fall back to INT
 * $0x80 regardless of what instruction was originally used to do the
 * system call.
 *
 * This is considered a slow path. It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * The arguments for the INT $0x80 based syscall are on stack in the
 * pt_regs structure:
 * eax: system call number
 * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
 */
DEFINE_IDTENTRY_RAW(int80_emulation)
{
	int nr;

	/* Kernel does not use INT $0x80! */
	if (unlikely(!user_mode(regs))) {
		/*
		 * Establish just enough context (irqentry + instrumentation)
		 * for the panic machinery to run.
		 */
		irqentry_enter(regs);
		instrumentation_begin();
		panic("Unexpected external interrupt 0x80\n");
	}

	/*
	 * Establish kernel context for instrumentation, including for
	 * int80_is_external() below which calls into the APIC driver.
	 * Identical for soft and external interrupts.
	 */
	enter_from_user_mode(regs);

	instrumentation_begin();
	add_random_kstack_offset();

	/* Validate that this is a soft interrupt to the extent possible */
	if (unlikely(int80_is_external()))
		panic("Unexpected external interrupt 0x80\n");

	/*
	 * The low level idtentry code pushed -1 into regs::orig_ax
	 * and regs::ax contains the syscall number.
	 *
	 * User tracing code (ptrace or signal handlers) might assume
	 * that the regs::orig_ax contains a 32-bit number on invoking
	 * a 32-bit syscall.
	 *
	 * Establish the syscall convention by saving the 32bit truncated
	 * syscall number in regs::orig_ax and by invalidating regs::ax.
	 */
	regs->orig_ax = regs->ax & GENMASK(31, 0);
	regs->ax = -ENOSYS;

	nr = syscall_32_enter(regs);

	local_irq_enable();
	nr = syscall_enter_from_user_mode_work(regs, nr);
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
  181. #else /* CONFIG_IA32_EMULATION */
/* Handles int $0x80 on a 32bit kernel */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	/* Sets TS_COMPAT (no-op here) and truncates orig_ax to int. */
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();

	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it. This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
  198. #endif /* !CONFIG_IA32_EMULATION */
/*
 * Common body of the SYSENTER/SYSCALL32 fast paths.
 *
 * Returns true when the syscall completed and a fast return (SYSEXIT /
 * SYSRETL) may be attempted, false when the user stack word holding EBP
 * could not be read — in that case the exit path has already run and the
 * caller must return with IRET.
 */
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();

	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		/*
		 * NOTE(review): irqentry_exit_to_user_mode() appears to
		 * require IRQs off, hence the explicit disable here —
		 * confirm against kernel/entry/common.c.
		 */
		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention. Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * The regs->ip == landing_pad check guards against anything (e.g.
	 * entry work) having moved IP since the fixup above.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	/* From here on this is identical to the SYSCALL32 fast path. */
	return do_fast_syscall_32(regs);
}
  295. #endif
/*
 * Catch-all handler wired into the syscall tables for unimplemented
 * system call numbers: always reports -ENOSYS.
 */
SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}
  300. #ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
/* Per-CPU flag: true while this CPU is inside a preemptible hypercall. */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	/* Read before clearing: returns whether the flag was set. */
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}
/* Counterpart of get_and_clear_inhcall(): re-arm the per-CPU flag. */
static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
/*
 * With full kernel preemption enabled the voluntary-preemption
 * bookkeeping above is unnecessary: stub it out.
 */
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif
/*
 * Body of the Xen PV event channel upcall: account the interrupt and
 * dispatch pending events, with irq_regs pointing at @regs for the
 * duration.
 */
static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);
	xen_evtchn_do_upcall();

	set_irq_regs(old_regs);
}
/*
 * noinstr entry point for the Xen PV event channel upcall. Runs the
 * handler on the irq stack if appropriate, then — when the upcall
 * interrupted a preemptible hypercall — offers a voluntary reschedule
 * before returning to it.
 */
__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	instrumentation_begin();
	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		/*
		 * Interrupted a preemptible hypercall: reschedule if
		 * needed, then restore the flag and return to the
		 * hypercall without the usual irqentry_exit() — note the
		 * asymmetric instrumentation_end() placement, which must
		 * precede restore_inhcall() on this branch.
		 */
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		instrumentation_end();
		irqentry_exit(regs, state);
	}
}
  355. #endif /* CONFIG_XEN_PV */