process.c

// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <linux/acpi.h>
#include <linux/elf-randomize.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/api.h>
#include <asm/fpu/sched.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
#include <asm/io_bitmap.h>
#include <asm/proto.h>
#include <asm/frame.h>
#include <asm/unwind.h>
#include <asm/tdx.h>

#include "process.h"
/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
        .x86_tss = {
                /*
                 * .sp0 is only used when entering ring 0 from a lower
                 * privilege level. Since the init task never runs anything
                 * but ring 0 code, there is no need for a valid value here.
                 * Poison it.
                 */
                .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_32
                .sp1 = TOP_OF_INIT_STACK,

                .ss0 = __KERNEL_DS,
                .ss1 = __KERNEL_CS,
#endif
                .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
        },
};
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
        dst->thread.vm86 = NULL;
#endif
        /* Drop the copied pointer to current's fpstate */
        dst->thread.fpu.fpstate = NULL;

        return 0;
}

#ifdef CONFIG_X86_64
void arch_release_task_struct(struct task_struct *tsk)
{
        if (fpu_state_size_dynamic())
                fpstate_free(&tsk->thread.fpu);
}
#endif

/*
 * Free thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
        struct thread_struct *t = &tsk->thread;
        struct fpu *fpu = &t->fpu;

        if (test_thread_flag(TIF_IO_BITMAP))
                io_bitmap_exit(tsk);

        free_vm86(t);

        fpu__drop(fpu);
}
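
/*
 * Install the TLS passed via clone(): a user_desc descriptor for a 32-bit
 * syscall, or the FS base for a 64-bit child.
 */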
static int set_new_tls(struct task_struct *p, unsigned long tls)
{
        struct user_desc __user *utls = (struct user_desc __user *)tls;

        if (in_ia32_syscall())
                return do_set_thread_area(p, -1, utls, 0);
        else
                return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
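
/*
 * Set up the new task's kernel stack frame (fork_frame) so that it resumes
 * in ret_from_fork(), copy the parent's segment and FS/GS base state, and
 * clone the FPU and PKRU state. Kernel threads and user tasks started at
 * args->fn get a synthetic frame via kthread_frame_init() instead of a
 * copy of the parent's registers.
 */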
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
        unsigned long clone_flags = args->flags;
        unsigned long sp = args->stack;
        unsigned long tls = args->tls;
        struct inactive_task_frame *frame;
        struct fork_frame *fork_frame;
        struct pt_regs *childregs;
        int ret = 0;

        childregs = task_pt_regs(p);
        fork_frame = container_of(childregs, struct fork_frame, regs);
        frame = &fork_frame->frame;

        frame->bp = encode_frame_pointer(childregs);
        frame->ret_addr = (unsigned long) ret_from_fork;
        p->thread.sp = (unsigned long) fork_frame;
        p->thread.io_bitmap = NULL;
        p->thread.iopl_warn = 0;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

#ifdef CONFIG_X86_64
        current_save_fsgs();
        p->thread.fsindex = current->thread.fsindex;
        p->thread.fsbase = current->thread.fsbase;
        p->thread.gsindex = current->thread.gsindex;
        p->thread.gsbase = current->thread.gsbase;

        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
#else
        p->thread.sp0 = (unsigned long) (childregs + 1);
        savesegment(gs, p->thread.gs);
        /*
         * Clear all status flags including IF and set fixed bit. 64bit
         * does not have this initialization as the frame does not contain
         * flags. The flags consistency (especially vs. AC) is there
         * ensured via objtool, which lacks 32bit support.
         */
        frame->flags = X86_EFLAGS_FIXED;
#endif

        fpu_clone(p, clone_flags, args->fn);

        /* Kernel thread ? */
        if (unlikely(p->flags & PF_KTHREAD)) {
                p->thread.pkru = pkru_get_init_value();
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, args->fn, args->fn_arg);
                return 0;
        }

        /*
         * Clone current's PKRU value from hardware. tsk->thread.pkru
         * is only valid when scheduled out.
         */
        p->thread.pkru = read_pkru();

        frame->bx = 0;
        *childregs = *current_pt_regs();
        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;

        if (unlikely(args->fn)) {
                /*
                 * A user space thread, but it doesn't return to
                 * ret_after_fork().
                 *
                 * In order to indicate that to tools like gdb,
                 * we reset the stack and instruction pointers.
                 *
                 * It does the same kernel frame setup to return to a kernel
                 * function that a kernel thread does.
                 */
                childregs->sp = 0;
                childregs->ip = 0;
                kthread_frame_init(frame, args->fn, args->fn_arg);
                return 0;
        }

        /* Set a new TLS for the child thread? */
        if (clone_flags & CLONE_SETTLS)
                ret = set_new_tls(p, tls);

        if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
                io_bitmap_share(p);

        return ret;
}
static void pkru_flush_thread(void)
{
        /*
         * If PKRU is enabled the default PKRU value has to be loaded into
         * the hardware right here (similar to context switch).
         */
        pkru_write_default();
}

void flush_thread(void)
{
        struct task_struct *tsk = current;

        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

        fpu_flush_thread();
        pkru_flush_thread();
}

void disable_TSC(void)
{
        preempt_disable();
        if (!test_and_set_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                cr4_set_bits(X86_CR4_TSD);
        preempt_enable();
}

static void enable_TSC(void)
{
        preempt_disable();
        if (test_and_clear_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                cr4_clear_bits(X86_CR4_TSD);
        preempt_enable();
}
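
/*
 * prctl(PR_GET_TSC/PR_SET_TSC) helpers: report or change whether RDTSC
 * raises SIGSEGV for the current task, tracked via TIF_NOTSC.
 */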
int get_tsc_mode(unsigned long adr)
{
        unsigned int val;

        if (test_thread_flag(TIF_NOTSC))
                val = PR_TSC_SIGSEGV;
        else
                val = PR_TSC_ENABLE;

        return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
        if (val == PR_TSC_SIGSEGV)
                disable_TSC();
        else if (val == PR_TSC_ENABLE)
                enable_TSC();
        else
                return -EINVAL;

        return 0;
}

DEFINE_PER_CPU(u64, msr_misc_features_shadow);
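
/*
 * Update the per-CPU shadow copy of MSR_MISC_FEATURES_ENABLES and write
 * the CPUID faulting enable bit to the hardware MSR.
 */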
static void set_cpuid_faulting(bool on)
{
        u64 msrval;

        msrval = this_cpu_read(msr_misc_features_shadow);
        msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
        msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
        this_cpu_write(msr_misc_features_shadow, msrval);
        wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

static void disable_cpuid(void)
{
        preempt_disable();
        if (!test_and_set_thread_flag(TIF_NOCPUID)) {
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOCPUID in the current running context.
                 */
                set_cpuid_faulting(true);
        }
        preempt_enable();
}

static void enable_cpuid(void)
{
        preempt_disable();
        if (test_and_clear_thread_flag(TIF_NOCPUID)) {
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOCPUID in the current running context.
                 */
                set_cpuid_faulting(false);
        }
        preempt_enable();
}

static int get_cpuid_mode(void)
{
        return !test_thread_flag(TIF_NOCPUID);
}

static int set_cpuid_mode(unsigned long cpuid_enabled)
{
        if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
                return -ENODEV;

        if (cpuid_enabled)
                enable_cpuid();
        else
                disable_cpuid();

        return 0;
}
/*
 * Called immediately after a successful exec.
 */
void arch_setup_new_exec(void)
{
        /* If cpuid was previously disabled for this task, re-enable it. */
        if (test_thread_flag(TIF_NOCPUID))
                enable_cpuid();

        /*
         * Don't inherit TIF_SSBD across exec boundary when
         * PR_SPEC_DISABLE_NOEXEC is used.
         */
        if (test_thread_flag(TIF_SSBD) &&
            task_spec_ssb_noexec(current)) {
                clear_thread_flag(TIF_SSBD);
                task_clear_spec_ssb_disable(current);
                task_clear_spec_ssb_noexec(current);
                speculation_ctrl_update(read_thread_flags());
        }
}
#ifdef CONFIG_X86_IOPL_IOPERM
static inline void switch_to_bitmap(unsigned long tifp)
{
        /*
         * Invalidate I/O bitmap if the previous task used it. This prevents
         * any possible leakage of an active I/O bitmap.
         *
         * If the next task has an I/O bitmap it will handle it on exit to
         * user mode.
         */
        if (tifp & _TIF_IO_BITMAP)
                tss_invalidate_io_bitmap();
}

static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
{
        /*
         * Copy at least the byte range of the incoming task's bitmap which
         * covers the permitted I/O ports.
         *
         * If the previous task which used an I/O bitmap had more bits
         * permitted, then the copy needs to cover those as well so they
         * get turned off.
         */
        memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
               max(tss->io_bitmap.prev_max, iobm->max));

        /*
         * Store the new max and the sequence number of this bitmap
         * and a pointer to the bitmap itself.
         */
        tss->io_bitmap.prev_max = iobm->max;
        tss->io_bitmap.prev_sequence = iobm->sequence;
}

/**
 * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
 */
void native_tss_update_io_bitmap(void)
{
        struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
        struct thread_struct *t = &current->thread;
        u16 *base = &tss->x86_tss.io_bitmap_base;

        if (!test_thread_flag(TIF_IO_BITMAP)) {
                native_tss_invalidate_io_bitmap();
                return;
        }

        if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
                *base = IO_BITMAP_OFFSET_VALID_ALL;
        } else {
                struct io_bitmap *iobm = t->io_bitmap;

                /*
                 * Only copy bitmap data when the sequence number differs. The
                 * update time is accounted to the incoming task.
                 */
                if (tss->io_bitmap.prev_sequence != iobm->sequence)
                        tss_copy_io_bitmap(tss, iobm);

                /* Enable the bitmap */
                *base = IO_BITMAP_OFFSET_VALID_MAP;
        }

        /*
         * Make sure that the TSS limit is covering the IO bitmap. It might have
         * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
         * access from user space to trigger a #GP because the bitmap is outside
         * the TSS limit.
         */
        refresh_tss_limit();
}
#else /* CONFIG_X86_IOPL_IOPERM */
static inline void switch_to_bitmap(unsigned long tifp) { }
#endif
#ifdef CONFIG_SMP

struct ssb_state {
        struct ssb_state        *shared_state;
        raw_spinlock_t          lock;
        unsigned int            disable_state;
        unsigned long           local_state;
};

#define LSTATE_SSB      0

static DEFINE_PER_CPU(struct ssb_state, ssb_state);

void speculative_store_bypass_ht_init(void)
{
        struct ssb_state *st = this_cpu_ptr(&ssb_state);
        unsigned int this_cpu = smp_processor_id();
        unsigned int cpu;

        st->local_state = 0;

        /*
         * Shared state setup happens once on the first bringup
         * of the CPU. It's not destroyed on CPU hotunplug.
         */
        if (st->shared_state)
                return;

        raw_spin_lock_init(&st->lock);

        /*
         * Go over HT siblings and check whether one of them has set up the
         * shared state pointer already.
         */
        for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
                if (cpu == this_cpu)
                        continue;

                if (!per_cpu(ssb_state, cpu).shared_state)
                        continue;

                /* Link it to the state of the sibling: */
                st->shared_state = per_cpu(ssb_state, cpu).shared_state;
                return;
        }

        /*
         * First HT sibling to come up on the core. Link shared state of
         * the first HT sibling to itself. The siblings on the same core
         * which come up later will see the shared state pointer and link
         * themselves to the state of this CPU.
         */
        st->shared_state = st;
}
/*
 * Logic is: The first HT sibling to enable SSBD enables it for both siblings
 * in the core, and the last sibling to disable it disables it for the whole
 * core. This is how MSR_SPEC_CTRL works in "hardware":
 *
 *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
 */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
        struct ssb_state *st = this_cpu_ptr(&ssb_state);
        u64 msr = x86_amd_ls_cfg_base;

        if (!static_cpu_has(X86_FEATURE_ZEN)) {
                msr |= ssbd_tif_to_amd_ls_cfg(tifn);
                wrmsrl(MSR_AMD64_LS_CFG, msr);
                return;
        }

        if (tifn & _TIF_SSBD) {
                /*
                 * Since this can race with prctl(), block reentry on the
                 * same CPU.
                 */
                if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
                        return;

                msr |= x86_amd_ls_cfg_ssbd_mask;

                raw_spin_lock(&st->shared_state->lock);
                /* First sibling enables SSBD: */
                if (!st->shared_state->disable_state)
                        wrmsrl(MSR_AMD64_LS_CFG, msr);
                st->shared_state->disable_state++;
                raw_spin_unlock(&st->shared_state->lock);
        } else {
                if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
                        return;

                raw_spin_lock(&st->shared_state->lock);
                st->shared_state->disable_state--;
                if (!st->shared_state->disable_state)
                        wrmsrl(MSR_AMD64_LS_CFG, msr);
                raw_spin_unlock(&st->shared_state->lock);
        }
}
#else
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
        u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);

        wrmsrl(MSR_AMD64_LS_CFG, msr);
}
#endif

static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
        /*
         * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
         * so ssbd_tif_to_spec_ctrl() just works.
         */
        wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
/*
 * Update the MSRs managing speculation control, during context switch.
 *
 * tifp: Previous task's thread flags
 * tifn: Next task's thread flags
 */
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
                                                      unsigned long tifn)
{
        unsigned long tif_diff = tifp ^ tifn;
        u64 msr = x86_spec_ctrl_base;
        bool updmsr = false;

        lockdep_assert_irqs_disabled();

        /* Handle change of TIF_SSBD depending on the mitigation method. */
        if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
                if (tif_diff & _TIF_SSBD)
                        amd_set_ssb_virt_state(tifn);
        } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
                if (tif_diff & _TIF_SSBD)
                        amd_set_core_ssb_state(tifn);
        } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
                   static_cpu_has(X86_FEATURE_AMD_SSBD)) {
                updmsr |= !!(tif_diff & _TIF_SSBD);
                msr |= ssbd_tif_to_spec_ctrl(tifn);
        }

        /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
        if (IS_ENABLED(CONFIG_SMP) &&
            static_branch_unlikely(&switch_to_cond_stibp)) {
                updmsr |= !!(tif_diff & _TIF_SPEC_IB);
                msr |= stibp_tif_to_spec_ctrl(tifn);
        }

        if (updmsr)
                update_spec_ctrl_cond(msr);
}
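
/*
 * Resync TIF_SSBD and TIF_SPEC_IB of @tsk with its prctl/seccomp-controlled
 * speculation state when a forced update was requested.
 */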
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
        if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
                if (task_spec_ssb_disable(tsk))
                        set_tsk_thread_flag(tsk, TIF_SSBD);
                else
                        clear_tsk_thread_flag(tsk, TIF_SSBD);

                if (task_spec_ib_disable(tsk))
                        set_tsk_thread_flag(tsk, TIF_SPEC_IB);
                else
                        clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
        }
        /* Return the updated threadinfo flags. */
        return read_task_thread_flags(tsk);
}
void speculation_ctrl_update(unsigned long tif)
{
        unsigned long flags;

        /* Forced update. Make sure all relevant TIF flags are different */
        local_irq_save(flags);
        __speculation_ctrl_update(~tif, tif);
        local_irq_restore(flags);
}

/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
        preempt_disable();
        speculation_ctrl_update(speculation_ctrl_update_tif(current));
        preempt_enable();
}
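
/*
 * Toggle the given CR4 bits and keep the cached copy in cpu_tlbstate in
 * sync. The caller must have interrupts disabled.
 */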
static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
        unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);

        newval = cr4 ^ mask;
        if (newval != cr4) {
                this_cpu_write(cpu_tlbstate.cr4, newval);
                __write_cr4(newval);
        }
}
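
/*
 * Slow path of the context switch: handle the less common TIF-dependent
 * work (I/O bitmap invalidation, user-return notifiers, blockstep (BTF),
 * TSC/CPUID faulting and speculation control MSR updates) when the
 * previous or next task has one of the relevant flags set.
 */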
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
        unsigned long tifp, tifn;

        tifn = read_task_thread_flags(next_p);
        tifp = read_task_thread_flags(prev_p);

        switch_to_bitmap(tifp);

        propagate_user_return_notify(prev_p, next_p);

        if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
            arch_has_block_step()) {
                unsigned long debugctl, msk;

                rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
                debugctl &= ~DEBUGCTLMSR_BTF;
                msk = tifn & _TIF_BLOCKSTEP;
                debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
                wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
        }

        if ((tifp ^ tifn) & _TIF_NOTSC)
                cr4_toggle_bits_irqsoff(X86_CR4_TSD);

        if ((tifp ^ tifn) & _TIF_NOCPUID)
                set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));

        if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
                __speculation_ctrl_update(tifp, tifn);
        } else {
                speculation_ctrl_update_tif(prev_p);
                tifn = speculation_ctrl_update_tif(next_p);

                /* Enforce MSR update to ensure consistent state */
                __speculation_ctrl_update(~tifn, tifn);
        }
}
/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
        tsc_verify_tsc_adjust(false);
        local_touch_nmi();
}

void arch_cpu_idle_dead(void)
{
        play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
        x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
        raw_safe_halt();
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
        bool ret = !!x86_idle;

        x86_idle = default_idle;

        return ret;
}
#endif
struct cpumask cpus_stop_mask;

void __noreturn stop_this_cpu(void *dummy)
{
        struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
        unsigned int cpu = smp_processor_id();

        local_irq_disable();

        /*
         * Remove this CPU from the online mask and disable it
         * unconditionally. This might be redundant in case that the reboot
         * vector was handled late and stop_other_cpus() sent an NMI.
         *
         * According to SDM and APM NMIs can be accepted even after soft
         * disabling the local APIC.
         */
        set_cpu_online(cpu, false);
        disable_local_APIC();
        mcheck_cpu_clear(c);

        /*
         * Use wbinvd on processors that support SME. This provides support
         * for performing a successful kexec when going from SME inactive
         * to SME active (or vice-versa). The cache must be cleared so that
         * if there are entries with the same physical address, both with and
         * without the encryption bit, they don't race each other when flushed
         * and potentially end up with the wrong entry being committed to
         * memory.
         *
         * Test the CPUID bit directly because the machine might've cleared
         * X86_FEATURE_SME due to cmdline options.
         */
        if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
                native_wbinvd();

        /*
         * This brings a cache line back and dirties it, but
         * native_stop_other_cpus() will overwrite cpus_stop_mask after it
         * observed that all CPUs reported stop. This write will invalidate
         * the related cache line on this CPU.
         */
        cpumask_clear_cpu(cpu, &cpus_stop_mask);

        for (;;) {
                /*
                 * Use native_halt() so that memory contents don't change
                 * (stack usage and variables) after possibly issuing the
                 * native_wbinvd() above.
                 */
                native_halt();
        }
}
/*
 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
 * states (local apic timer and TSC stop).
 *
 * XXX this function is completely buggered vs RCU and tracing.
 */
static void amd_e400_idle(void)
{
        /*
         * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
         * gets set after static_cpu_has() places have been converted via
         * alternatives.
         */
        if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
                default_idle();
                return;
        }

        tick_broadcast_enter();

        default_idle();

        /*
         * The switch back from broadcast mode needs to be called with
         * interrupts disabled.
         */
        raw_local_irq_disable();
        tick_broadcast_exit();
        raw_local_irq_enable();
}
/*
 * Prefer MWAIT over HALT if MWAIT is supported, the MWAIT CPUID leaf
 * exists and, whenever MONITOR/MWAIT extensions are present, there is at
 * least one C1 substate.
 *
 * Do not prefer MWAIT if the MONITOR instruction has a bug or if
 * idle=nomwait was passed on the kernel command line.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
        u32 eax, ebx, ecx, edx;

        /* User has disallowed the use of MWAIT. Fallback to HALT */
        if (boot_option_idle_override == IDLE_NOMWAIT)
                return 0;

        /* MWAIT is not supported on this platform. Fallback to HALT */
        if (!cpu_has(c, X86_FEATURE_MWAIT))
                return 0;

        /* Monitor has a bug. Fallback to HALT */
        if (boot_cpu_has_bug(X86_BUG_MONITOR))
                return 0;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        /*
         * If MWAIT extensions are not available, it is safe to use MWAIT
         * with EAX=0, ECX=0.
         */
        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
                return 1;

        /*
         * If MWAIT extensions are available, there should be at least one
         * MWAIT C1 substate present.
         */
        return (edx & MWAIT_C1_SUBSTATE_MASK);
}
/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
        if (!current_set_polling_and_test()) {
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
                        mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
                        mb(); /* quirk */
                }

                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        raw_local_irq_enable();
        } else {
                raw_local_irq_enable();
        }
        __current_clr_polling();
}
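
/*
 * Pick the idle routine for this system: the AMD E400 aware routine, MWAIT,
 * the TDX aware halt, or plain HLT, unless an "idle=" override or an
 * earlier selection already set x86_idle.
 */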
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
        if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
                pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
        if (x86_idle || boot_option_idle_override == IDLE_POLL)
                return;

        if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
        } else if (prefer_mwait_c1_over_halt(c)) {
                pr_info("using mwait in idle threads\n");
                x86_idle = mwait_idle;
        } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
                pr_info("using TDX aware idle routine\n");
                x86_idle = tdx_safe_halt;
        } else
                x86_idle = default_idle;
}

void amd_e400_c1e_apic_setup(void)
{
        if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
                pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
                local_irq_disable();
                tick_broadcast_force();
                local_irq_enable();
        }
}
void __init arch_post_acpi_subsys_init(void)
{
        u32 lo, hi;

        if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
                return;

        /*
         * AMD E400 detection needs to happen after ACPI has been enabled. If
         * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
         * MSR_K8_INT_PENDING_MSG.
         */
        rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
        if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
                return;

        boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                mark_tsc_unstable("TSC halt in AMD C1E");
        pr_info("System has AMD C1E enabled\n");
}

static int __init idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll")) {
                pr_info("using polling idle threads\n");
                boot_option_idle_override = IDLE_POLL;
                cpu_idle_poll_ctrl(true);
        } else if (!strcmp(str, "halt")) {
                /*
                 * When the boot option of idle=halt is added, halt is
                 * forced to be used for CPU idle. In such case CPU C2/C3
                 * won't be used again.
                 * To continue to load the CPU idle driver, don't touch
                 * the boot_option_idle_override.
                 */
                x86_idle = default_idle;
                boot_option_idle_override = IDLE_HALT;
        } else if (!strcmp(str, "nomwait")) {
                /*
                 * If the boot option of "idle=nomwait" is added,
                 * it means that mwait will be disabled for CPU C1/C2/C3
                 * states.
                 */
                boot_option_idle_override = IDLE_NOMWAIT;
        } else
                return -1;

        return 0;
}
early_param("idle", idle_setup);
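
/*
 * Stack and heap (brk) randomization helpers for new processes: the stack
 * gets a random, 16-byte aligned offset of up to 8 kB (unless the task
 * disabled VA randomization) and the brk start is randomized within 32 MB.
 */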
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= prandom_u32_max(8192);
        return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        return randomize_page(mm->brk, 0x02000000);
}
/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long __get_wchan(struct task_struct *p)
{
        struct unwind_state state;
        unsigned long addr = 0;

        if (!try_get_task_stack(p))
                return 0;

        for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
             unwind_next_frame(&state)) {
                addr = unwind_get_return_address(&state);
                if (!addr)
                        break;
                if (in_sched_functions(addr))
                        continue;
                break;
        }

        put_task_stack(p);

        return addr;
}
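
/*
 * Handle the arch_prctl() options common to 32-bit and 64-bit tasks:
 * CPUID faulting control and the dynamic XSTATE permission requests.
 */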
long do_arch_prctl_common(int option, unsigned long arg2)
{
        switch (option) {
        case ARCH_GET_CPUID:
                return get_cpuid_mode();
        case ARCH_SET_CPUID:
                return set_cpuid_mode(arg2);
        case ARCH_GET_XCOMP_SUPP:
        case ARCH_GET_XCOMP_PERM:
        case ARCH_REQ_XCOMP_PERM:
        case ARCH_GET_XCOMP_GUEST_PERM:
        case ARCH_REQ_XCOMP_GUEST_PERM:
                return fpu_xstate_prctl(option, arg2);
        }

        return -EINVAL;
}