Merge branch 'x86/urgent' into x86/mm, to pick up dependent fix
Signed-off-by: Ingo Molnar <mingo@kernel.org>
@@ -55,10 +55,10 @@ struct addr_marker {
 enum address_markers_idx {
 	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
-	LOW_KERNEL_NR,
-#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	LDT_NR,
 #endif
+	LOW_KERNEL_NR,
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
@@ -66,9 +66,6 @@ enum address_markers_idx {
 	KASAN_SHADOW_END_NR,
 #endif
 	CPU_ENTRY_AREA_NR,
-#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
-	LDT_NR,
-#endif
 #ifdef CONFIG_X86_ESPFIX64
 	ESPFIX_START_NR,
 #endif
@@ -512,11 +509,11 @@ static inline bool is_hypervisor_range(int idx)
 {
 #ifdef CONFIG_X86_64
 	/*
-	 * ffff800000000000 - ffff87ffffffffff is reserved for
-	 * the hypervisor.
+	 * A hole in the beginning of kernel address space reserved
+	 * for a hypervisor.
 	 */
-	return	(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
-		(idx <  pgd_index(__PAGE_OFFSET));
+	return	(idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
+		(idx <  pgd_index(GUARD_HOLE_END_ADDR));
 #else
 	return false;
 #endif
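
For readers following the is_hypervisor_range() change above: the check now names the reserved hole explicitly instead of deriving it from __PAGE_OFFSET. Below is a minimal standalone sketch of the same PGD-index comparison, assuming the 4-level paging layout (hole at ffff800000000000 - ffff87ffffffffff, 512 GiB per PGD slot); the constants and helpers are stand-ins for the kernel's GUARD_HOLE_* macros and pgd_index(), not the kernel code itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PGDIR_SHIFT		39
#define PTRS_PER_PGD		512
#define GUARD_HOLE_BASE_ADDR	0xffff800000000000ULL
#define GUARD_HOLE_END_ADDR	0xffff880000000000ULL

/* 4-level pgd_index(): which of the 512 PGD slots maps this address. */
static unsigned int pgd_index(uint64_t address)
{
	return (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

/* Mirrors the new check: does this PGD slot fall inside the guard hole? */
static bool is_hypervisor_range(unsigned int idx)
{
	return idx >= pgd_index(GUARD_HOLE_BASE_ADDR) &&
	       idx <  pgd_index(GUARD_HOLE_END_ADDR);
}

int main(void)
{
	/* PGD slots 256..271 cover the 8 TiB guard hole. */
	for (unsigned int idx = 250; idx < 280; idx++)
		if (is_hypervisor_range(idx))
			printf("pgd index %u is in the guard hole\n", idx);
	return 0;
}
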
@@ -285,20 +285,16 @@ static void cpa_flush_all(unsigned long cache)
 	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
-static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
+static bool __inv_flush_all(int cache)
 {
 	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
-	WARN_ON(PAGE_ALIGN(start) != start);
-
 	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 		cpa_flush_all(cache);
 		return true;
 	}
 
-	flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
-
-	return !cache;
+	return false;
 }
 
 static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -306,7 +302,14 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	unsigned int i, level;
 	unsigned long addr;
 
-	if (__cpa_flush_range(start, numpages, cache))
+	WARN_ON(PAGE_ALIGN(start) != start);
+
+	if (__inv_flush_all(cache))
 		return;
 
+	flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
+
+	if (!cache)
+		return;
+
 	/*
@@ -332,7 +335,12 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start,
 {
 	unsigned int i, level;
 
-	if (__cpa_flush_range(baddr, numpages, cache))
+	if (__inv_flush_all(cache))
 		return;
 
+	flush_tlb_all();
+
+	if (!cache)
+		return;
+
 	/*
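
The hunks above split the old __cpa_flush_range() helper: __inv_flush_all() now only decides whether a global cache+TLB flush already covered the request, and each caller issues its own TLB flush (ranged in cpa_flush_range(), flush_tlb_all() in cpa_flush_array()). A minimal standalone sketch of the resulting control flow, with printf stubs standing in for the kernel flush primitives and a has_clflush flag standing in for static_cpu_has(X86_FEATURE_CLFLUSH):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static bool has_clflush = true;	/* stand-in for static_cpu_has(X86_FEATURE_CLFLUSH) */

/* Stubs that just report what the kernel primitives would do. */
static void cpa_flush_all(int cache)
{
	printf("global cache+TLB flush on all CPUs (cache=%d)\n", cache);
}

static void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	printf("flush TLB for %#lx-%#lx\n", start, end);
}

/* Decide whether a global flush already covered everything. */
static bool __inv_flush_all(int cache)
{
	if (cache && !has_clflush) {
		cpa_flush_all(cache);
		return true;
	}
	return false;
}

/* The caller now owns its TLB flush and the per-page cache flush. */
static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
	if (__inv_flush_all(cache))
		return;

	flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);

	if (!cache)
		return;

	printf("clflush each of the %d pages\n", numpages);
}

int main(void)
{
	cpa_flush_range(0x1000UL, 4, 1);	/* fake address, 4 pages, cached */
	return 0;
}
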
@@ -519,8 +519,13 @@ static u64 sanitize_phys(u64 address)
 	 * for a "decoy" virtual address (bit 63 clear) passed to
 	 * set_memory_X(). __pa() on a "decoy" address results in a
 	 * physical address with bit 63 set.
+	 *
+	 * Decoy addresses are not present for 32-bit builds, see
+	 * set_mce_nospec().
 	 */
-	return address & __PHYSICAL_MASK;
+	if (IS_ENABLED(CONFIG_X86_64))
+		return address & __PHYSICAL_MASK;
+	return address;
 }
 
 /*
@@ -546,7 +551,11 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
 
 	start = sanitize_phys(start);
 	end = sanitize_phys(end);
-	BUG_ON(start >= end); /* end is exclusive */
+	if (start >= end) {
+		WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
+				start, end - 1, cattr_name(req_type));
+		return -EINVAL;
+	}
 
 	if (!pat_enabled()) {
 		/* This is identical to page table setting without PAT */
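
The sanitize_phys() hunk above restricts the bit-63 "decoy" masking to 64-bit builds, since 32-bit builds never see decoy addresses, and the reserve_memtype() hunk downgrades the reversed-range BUG_ON() to a WARN plus -EINVAL. A standalone sketch of the masking shape; the 46-bit mask below is only an illustrative stand-in for __PHYSICAL_MASK:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for __PHYSICAL_MASK (46 physical address bits). */
#define PHYSICAL_MASK ((1ULL << 46) - 1)

/*
 * Mirrors the new sanitize_phys() shape: only the 64-bit path sees "decoy"
 * addresses with bit 63 set, so only there is the mask applied.
 */
static uint64_t sanitize_phys(uint64_t address, bool is_64bit)
{
	if (is_64bit)
		return address & PHYSICAL_MASK;
	return address;
}

int main(void)
{
	uint64_t decoy = (1ULL << 63) | 0x1234000ULL;	/* bit 63 marks a decoy */

	printf("decoy %#llx -> real alias %#llx\n",
	       (unsigned long long)decoy,
	       (unsigned long long)sanitize_phys(decoy, true));
	printf("32-bit path leaves %#llx untouched\n",
	       (unsigned long long)sanitize_phys(0x1234000ULL, false));
	return 0;
}
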
@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -30,6 +29,12 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB	0x1UL
+
 /*
  * We get here when we do something requiring a TLB invalidation
  * but could not go invalidate all of the contexts. We do the
@@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
 {
+	unsigned long next_tif = task_thread_info(next)->flags;
+	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+	return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
+{
+	if (!next || !next->mm)
+		return;
+
 	/*
-	 * Check if the current (previous) task has access to the memory
-	 * of the @tsk (next) task. If access is denied, make sure to
-	 * issue a IBPB to stop user->user Spectre-v2 attacks.
-	 *
-	 * Note: __ptrace_may_access() returns 0 or -ERRNO.
+	 * Both, the conditional and the always IBPB mode use the mm
+	 * pointer to avoid the IBPB when switching between tasks of the
+	 * same process. Using the mm pointer instead of mm->context.ctx_id
+	 * opens a hypothetical hole vs. mm_struct reuse, which is more or
+	 * less impossible to control by an attacker. Aside of that it
+	 * would only affect the first schedule so the theoretically
+	 * exposed data is not really interesting.
 	 */
-	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
-		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+	if (static_branch_likely(&switch_mm_cond_ibpb)) {
+		unsigned long prev_mm, next_mm;
+
+		/*
+		 * This is a bit more complex than the always mode because
+		 * it has to handle two cases:
+		 *
+		 * 1) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB not set.
+		 *
+		 * 2) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB not set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB set.
+		 *
+		 * This could be done by unconditionally issuing IBPB when
+		 * a task which has TIF_SPEC_IB set is either scheduled in
+		 * or out. Though that results in two flushes when:
+		 *
+		 * - the same user space task is scheduled out and later
+		 *   scheduled in again and only a kernel thread ran in
+		 *   between.
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in after a kernel thread ran in between
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in immediately.
+		 *
+		 * Optimize this with reasonably small overhead for the
+		 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+		 * pointer of the incoming task which is stored in
+		 * cpu_tlbstate.last_user_mm_ibpb for comparison.
+		 */
+		next_mm = mm_mangle_tif_spec_ib(next);
+		prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+		/*
+		 * Issue IBPB only if the mm's are different and one or
+		 * both have the IBPB bit set.
+		 */
+		if (next_mm != prev_mm &&
+		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+			indirect_branch_prediction_barrier();
+
+		this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+	}
+
+	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+		/*
+		 * Only flush when switching to a user space task with a
+		 * different context than the user space task which ran
+		 * last on this CPU.
+		 */
+		if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+			indirect_branch_prediction_barrier();
+			this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+		}
+	}
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		new_asid = prev_asid;
 		need_flush = true;
 	} else {
-		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
 		/*
 		 * Avoid user/user BTB poisoning by flushing the branch
 		 * predictor when switching between processes. This stops
 		 * one process from doing Spectre-v2 attacks on another.
-		 *
-		 * As an optimization, flush indirect branches only when
-		 * switching into a processes that can't be ptrace by the
-		 * current one (as in such case, attacker has much more
-		 * convenient way how to tamper with the next process than
-		 * branch buffer poisoning).
 		 */
-		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-				ibpb_needed(tsk, last_ctx_id))
-			indirect_branch_prediction_barrier();
+		cond_ibpb(tsk);
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
-	/*
-	 * Record last user mm's context id, so we can avoid
-	 * flushing branch buffer with IBPB if we switch back
-	 * to the same user.
-	 */
-	if (next != &init_mm)
-		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
 	/* Make sure we write CR3 before loaded_mm. */
 	barrier();
 
@@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void)
 	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
-	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
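
The hunks above replace the ptrace-based ibpb_needed() test with cond_ibpb(): the incoming task's TIF_SPEC_IB bit is folded into bit 0 of its mm pointer (always clear for a real, aligned pointer), so a single comparison against the per-CPU last_user_mm_ibpb value catches both "different mm" and "same mm but the IB state changed". A minimal standalone sketch of that decision in conditional mode; the plain globals, simplified signature, and fake mm values are stand-ins for the kernel's per-CPU state and static keys:

#include <stdbool.h>
#include <stdio.h>

#define LAST_USER_MM_IBPB	0x1UL

/* Stand-in for the per-CPU cpu_tlbstate.last_user_mm_ibpb slot, initialized
 * the way initialize_tlbstate_and_flush() does above. */
static unsigned long last_user_mm_ibpb = LAST_USER_MM_IBPB;

/* Fold the incoming task's IB flag into bit 0 of its (aligned) mm pointer. */
static unsigned long mm_mangle_tif_spec_ib(unsigned long mm, bool tif_spec_ib)
{
	return mm | (tif_spec_ib ? LAST_USER_MM_IBPB : 0);
}

/* Returns true when this switch would issue an IBPB in conditional mode. */
static bool cond_ibpb(unsigned long next_mm_ptr, bool next_tif_spec_ib)
{
	unsigned long next_mm = mm_mangle_tif_spec_ib(next_mm_ptr, next_tif_spec_ib);
	unsigned long prev_mm = last_user_mm_ibpb;
	bool barrier = next_mm != prev_mm &&
		       ((next_mm | prev_mm) & LAST_USER_MM_IBPB);

	last_user_mm_ibpb = next_mm;
	return barrier;
}

int main(void)
{
	unsigned long mm_a = 0x1000, mm_b = 0x2000;	/* fake aligned mm pointers */

	printf("-> A, IB set  : IBPB=%d\n", cond_ibpb(mm_a, true));	/* 1: new mm with IB */
	printf("-> A, IB set  : IBPB=%d\n", cond_ibpb(mm_a, true));	/* 0: same mm, same state */
	printf("-> B, IB clear: IBPB=%d\n", cond_ibpb(mm_b, false));	/* 1: previous mm had IB */
	printf("-> A, IB clear: IBPB=%d\n", cond_ibpb(mm_a, false));	/* 0: neither side has IB */
	return 0;
}
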