Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 protection key support from Ingo Molnar:
 "This tree adds support for a new memory protection hardware feature
  that is available in upcoming Intel CPUs: 'protection keys' (pkeys).

  There's a background article at LWN.net:

      https://lwn.net/Articles/643797/

  The gist is that protection keys allow the encoding of
  user-controllable permission masks in the pte.  So instead of having a
  fixed protection mask in the pte (which needs a system call to change
  and works on a per page basis), the user can map a (handful of)
  protection mask variants and can change the masks runtime relatively
  cheaply, without having to change every single page in the affected
  virtual memory range.

  This allows the dynamic switching of the protection bits of large
  amounts of virtual memory, via user-space instructions.  It also
  allows more precise control of MMU permission bits: for example the
  executable bit is separate from the read bit (see more about that
  below).

  This tree adds the MM infrastructure and low level x86 glue needed for
  that, plus it adds a high level API to make use of protection keys -
  if a user-space application calls:

        mmap(..., PROT_EXEC);

  or

        mprotect(ptr, sz, PROT_EXEC);

  (note PROT_EXEC-only, without PROT_READ/WRITE), the kernel will notice
  this special case, and will set a special protection key on this
  memory range.  It also sets the appropriate bits in the Protection
  Keys User Rights (PKRU) register so that the memory becomes unreadable
  and unwritable.

  So using protection keys the kernel is able to implement 'true'
  PROT_EXEC on x86 CPUs: without protection keys PROT_EXEC implies
  PROT_READ as well.  Unreadable executable mappings have security
  advantages: they cannot be read via information leaks to figure out
  ASLR details, nor can they be scanned for ROP gadgets - and they
  cannot be used by exploits for data purposes either.

  We know about no user-space code that relies on pure PROT_EXEC
  mappings today, but binary loaders could start making use of this new
  feature to map binaries and libraries in a more secure fashion.

  There is other pending pkeys work that offers more high level system
  call APIs to manage protection keys - but those are not part of this
  pull request.

  Right now there's a Kconfig that controls this feature
  (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) that is default enabled
  (like most x86 CPU feature enablement code that has no runtime
  overhead), but it's not user-configurable at the moment.  If there's
  any serious problem with this then we can make it configurable and/or
  flip the default"

* 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
  x86/mm/pkeys: Fix mismerge of protection keys CPUID bits
  mm/pkeys: Fix siginfo ABI breakage caused by new u64 field
  x86/mm/pkeys: Fix access_error() denial of writes to write-only VMA
  mm/core, x86/mm/pkeys: Add execute-only protection keys support
  x86/mm/pkeys: Create an x86 arch_calc_vm_prot_bits() for VMA flags
  x86/mm/pkeys: Allow kernel to modify user pkey rights register
  x86/fpu: Allow setting of XSAVE state
  x86/mm: Factor out LDT init from context init
  mm/core, x86/mm/pkeys: Add arch_validate_pkey()
  mm/core, arch, powerpc: Pass a protection key in to calc_vm_flag_bits()
  x86/mm/pkeys: Actually enable Memory Protection Keys in the CPU
  x86/mm/pkeys: Add Kconfig prompt to existing config option
  x86/mm/pkeys: Dump pkey from VMA in /proc/pid/smaps
  x86/mm/pkeys: Dump PKRU with other kernel registers
  mm/core, x86/mm/pkeys: Differentiate instruction fetches
  x86/mm/pkeys: Optimize fault handling in access_error()
  mm/core: Do not enforce PKEY permissions on remote mm access
  um, pkeys: Add UML arch_*_access_permitted() methods
  mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys
  x86/mm/gup: Simplify get_user_pages() PTE bit handling
  ...
This commit is contained in:
Linus Torvalds
2016-03-20 19:08:56 -07:00
کامیت 643ad15d47
85فایلهای تغییر یافته به همراه1406 افزوده شده و 241 حذف شده

مشاهده پرونده

@@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o

مشاهده پرونده

@@ -15,12 +15,14 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -33,6 +35,7 @@
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
@@ -41,6 +44,7 @@ enum x86_pf_error_code {
PF_USER = 1 << 2,
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
PF_PK = 1 << 5,
};
/*
@@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
/*
* A protection key fault means that the PKRU value did not allow
* access to some PTE. Userspace can figure out what PKRU was
* from the XSAVE state, and this function fills out a field in
* siginfo so userspace can discover which protection key was set
* on the PTE.
*
* If we get here, we know that the hardware signaled a PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
*
* 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
* 2. T1 : set PKRU to deny access to pkey=4, touches page
* 3. T1 : faults...
* 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
* 5. T1 : enters fault handler, takes mmap_sem, etc...
* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
* faulted on a pte with its pkey=4.
*/
static void fill_sig_info_pkey(int si_code, siginfo_t *info,
struct vm_area_struct *vma)
{
/* This is effectively an #ifdef */
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return;
/* Fault not from Protection Keys: nothing to do */
if (si_code != SEGV_PKUERR)
return;
/*
* force_sig_info_fault() is called from a number of
* contexts, some of which have a VMA and some of which
* do not. The PF_PK handing happens after we have a
* valid VMA, so we should never reach this without a
* valid VMA.
*/
if (!vma) {
WARN_ONCE(1, "PKU fault with no VMA passed in");
info->si_pkey = 0;
return;
}
/*
* si_pkey should be thought of as a strong hint, but not
* absolutely guranteed to be 100% accurate because of
* the race explained above.
*/
info->si_pkey = vma_pkey(vma);
}
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
struct task_struct *tsk, int fault)
struct task_struct *tsk, struct vm_area_struct *vma,
int fault)
{
unsigned lsb = 0;
siginfo_t info;
@@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
fill_sig_info_pkey(si_code, &info, vma);
force_sig_info(si_signo, &info, tsk);
}
@@ -661,6 +718,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
/* No context means no VMA to pass down */
struct vm_area_struct *vma = NULL;
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address, tsk, 0);
force_sig_info_fault(signal, si_code, address,
tsk, vma, 0);
}
/*
@@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
unsigned long address, struct vm_area_struct *vma,
int si_code)
{
struct task_struct *tsk = current;
@@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
return;
}
@@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
unsigned long address, struct vm_area_struct *vma)
{
__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int si_code)
unsigned long address, struct vm_area_struct *vma, int si_code)
{
struct mm_struct *mm = current->mm;
@@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
*/
up_read(&mm->mmap_sem);
__bad_area_nosemaphore(regs, error_code, address, si_code);
__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
__bad_area(regs, error_code, address, SEGV_MAPERR);
__bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
}
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
struct vm_area_struct *vma)
{
/* This code is always called on the current mm */
bool foreign = false;
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
if (error_code & PF_PK)
return true;
/* this checks permission keys on the VMA: */
if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
(error_code & PF_INSTR), foreign))
return true;
return false;
}
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
unsigned long address, struct vm_area_struct *vma)
{
__bad_area(regs, error_code, address, SEGV_ACCERR);
/*
* This OSPKE check is not strictly necessary at runtime.
* But, doing it this way allows compiler optimizations
* if pkeys are compiled out.
*/
if (bad_area_access_from_pkeys(error_code, vma))
__bad_area(regs, error_code, address, vma, SEGV_PKUERR);
else
__bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
struct vm_area_struct *vma, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
@@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
code = BUS_MCEERR_AR;
}
#endif
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
unsigned long address, struct vm_area_struct *vma,
unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
do_sigbus(regs, error_code, address, vma, fault);
else if (fault & VM_FAULT_SIGSEGV)
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, vma);
else
BUG();
}
@@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set PF_PK.
*/
if ((error_code & PF_PK))
return 1;
return 1;
}
@@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1;
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a PF_PK as soon as we fill in a
* page.
*/
if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
(error_code & PF_INSTR), foreign))
return 1;
if (error_code & PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
@@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address);
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
@@ -1232,7 +1338,7 @@ retry:
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address);
bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1270,7 +1376,7 @@ good_area:
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
mm_fault_error(regs, error_code, address, vma, fault);
return;
}

مشاهده پرونده

@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/memremap.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
@@ -74,6 +75,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
/*
* 'pteval' can come from a pte, pmd or pud. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
*/
static inline int pte_allows_gup(unsigned long pteval, int write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
if (write)
need_pte_bits |= _PAGE_RW;
if ((pteval & need_pte_bits) != need_pte_bits)
return 0;
/* Check memory protection keys permissions. */
if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
return 0;
return 1;
}
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
unsigned long mask;
int nr_start = *nr;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
@@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_unmap(ptep);
return 0;
}
} else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
} else if (!pte_allows_gup(pte_val(pte), write) ||
pte_special(pte)) {
pte_unmap(ptep);
return 0;
}
@@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pmd_flags(pmd) & mask) != mask)
if (!pte_allows_gup(pmd_val(pmd), write))
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
@@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pud_flags(pud) & mask) != mask)
if (!pte_allows_gup(pud_val(pud), write))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
@@ -422,7 +433,7 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(current, mm, start,
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
write, 0, pages);

مشاهده پرونده

@@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write)
int nr_pages = 1;
int force = 0;
gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
nr_pages, write, force, NULL, NULL);
gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
force, NULL, NULL);
/*
* get_user_pages() returns number of pages gotten.
* 0 means we failed to fault in and get anything,

101
arch/x86/mm/pkeys.c Normal file
مشاهده پرونده

@@ -0,0 +1,101 @@
/*
* Intel Memory Protection Keys management
* Copyright (c) 2015, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/mm_types.h> /* mm_struct, vma, etc... */
#include <linux/pkeys.h> /* PKEY_* */
#include <uapi/asm-generic/mman-common.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/fpu/internal.h> /* fpregs_active() */
int __execute_only_pkey(struct mm_struct *mm)
{
int ret;
/*
* We do not want to go through the relatively costly
* dance to set PKRU if we do not need to. Check it
* first and assume that if the execute-only pkey is
* write-disabled that we do not have to set it
* ourselves. We need preempt off so that nobody
* can make fpregs inactive.
*/
preempt_disable();
if (fpregs_active() &&
!__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
preempt_enable();
return PKEY_DEDICATED_EXECUTE_ONLY;
}
preempt_enable();
ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
PKEY_DISABLE_ACCESS);
/*
* If the PKRU-set operation failed somehow, just return
* 0 and effectively disable execute-only support.
*/
if (ret)
return 0;
return PKEY_DEDICATED_EXECUTE_ONLY;
}
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
return false;
if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
return false;
return true;
}
/*
* This is only called for *plain* mprotect calls.
*/
int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
{
/*
* Is this an mprotect_pkey() call? If so, never
* override the value that came from the user.
*/
if (pkey != -1)
return pkey;
/*
* Look for a protection-key-drive execute-only mapping
* which is now being given permissions that are not
* execute-only. Move it back to the default pkey.
*/
if (vma_is_pkey_exec_only(vma) &&
(prot & (PROT_READ|PROT_WRITE))) {
return 0;
}
/*
* The mapping is execute-only. Go try to get the
* execute-only protection key. If we fail to do that,
* fall through as if we do not have execute-only
* support.
*/
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(vma->vm_mm);
if (pkey > 0)
return pkey;
}
/*
* This is a vanilla, non-pkey mprotect (or we failed to
* setup execute-only), inherit the pkey from the VMA we
* are working on.
*/
return vma_pkey(vma);
}