i386: move mm

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Thomas Gleixner
2007-10-11 11:16:47 +02:00
parent 96ae6ea0be
commit ad757b6aa5
16 changed files with 4 additions and 4 deletions

5
arch/x86/mm/Makefile Normal file
View File

@@ -0,0 +1,5 @@
ifeq ($(CONFIG_X86_32),y)
include ${srctree}/arch/x86/mm/Makefile_32
else
include ${srctree}/arch/x86_64/mm/Makefile_64
endif

10
arch/x86/mm/Makefile_32 Normal file
View File

@@ -0,0 +1,10 @@
#
# Makefile for the linux i386-specific parts of the memory manager.
#
obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
obj-$(CONFIG_NUMA) += discontig_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o

View File

@@ -0,0 +1,100 @@
/*
* arch/i386/mm/boot_ioremap.c
*
* Re-map functions for early boot-time before paging_init() when the
* boot-time pagetables are still in use
*
* Written by Dave Hansen <haveblue@us.ibm.com>
*/
/*
* We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
* keeps that from happenning. If anyone has a better way, I'm listening.
*
* boot_pte_t is defined only if this all works correctly
*/
#undef CONFIG_X86_PAE
#undef CONFIG_PARAVIRT
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/init.h>
#include <linux/stddef.h>
/*
* I'm cheating here. It is known that the two boot PTE pages are
* allocated next to each other. I'm pretending that they're just
* one big array.
*/
#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
static unsigned long boot_pte_index(unsigned long vaddr)
{
return __pa(vaddr) >> PAGE_SHIFT;
}
static inline boot_pte_t* boot_vaddr_to_pte(void *address)
{
boot_pte_t* boot_pg = (boot_pte_t*)pg0;
return &boot_pg[boot_pte_index((unsigned long)address)];
}
/*
* This is only for a caller who is clever enough to page-align
* phys_addr and virtual_source, and who also has a preference
* about which virtual address from which to steal ptes
*/
static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
void* virtual_source)
{
boot_pte_t* pte;
int i;
char *vaddr = virtual_source;
pte = boot_vaddr_to_pte(virtual_source);
for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
__flush_tlb_one(&vaddr[i*PAGE_SIZE]);
}
}
/* the virtual space we're going to remap comes from this array */
#define BOOT_IOREMAP_PAGES 4
#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
__attribute__ ((aligned (PAGE_SIZE)));
/*
* This only applies to things which need to ioremap before paging_init()
* bt_ioremap() and plain ioremap() are both useless at this point.
*
* When used, we're still using the boot-time pagetables, which only
* have 2 PTE pages mapping the first 8MB
*
* There is no unmap. The boot-time PTE pages aren't used after boot.
* If you really want the space back, just remap it yourself.
* boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
*/
__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
{
unsigned long last_addr, offset;
unsigned int nrpages;
last_addr = phys_addr + size - 1;
/* page align the requested address */
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr) - phys_addr;
nrpages = size >> PAGE_SHIFT;
if (nrpages > BOOT_IOREMAP_PAGES)
return NULL;
__boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
return &boot_ioremap_space[offset];
}

431
arch/x86/mm/discontig_32.c Normal file
View File

@@ -0,0 +1,431 @@
/*
* Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
* August 2002: added remote node KVA remap - Martin J. Bligh
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/kexec.h>
#include <linux/pfn.h>
#include <linux/swap.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
#include <bios_ebda.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
bootmem_data_t node0_bdata;
/*
* numa interface - we expect the numa architecture specific code to have
* populated the following initialisation.
*
* 1) node_online_map - the map of all nodes configured (online) in the system
* 2) node_start_pfn - the starting page frame number for a node
* 3) node_end_pfn - the ending page fram number for a node
*/
unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
#ifdef CONFIG_DISCONTIGMEM
/*
* 4) physnode_map - the mapping between a pfn and owning node
* physnode_map keeps track of the physical memory layout of a generic
* numa node on a 256Mb break (each element of the array will
* represent 256Mb of memory and will be marked by the node id. so,
* if the first gig is on node 0, and the second gig is on node 1
* physnode_map will contain:
*
* physnode_map[0-3] = 0;
* physnode_map[4-7] = 1;
* physnode_map[8- ] = -1;
*/
s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
void memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
printk("%ld ", pfn);
}
printk("\n");
}
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long nr_pages = end_pfn - start_pfn;
if (!nr_pages)
return 0;
return (nr_pages + 1) * sizeof(struct page);
}
#endif
extern unsigned long find_max_low_pfn(void);
extern void add_one_highpage_init(struct page *, int, int);
extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
unsigned long node_remap_start_pfn[MAX_NUMNODES];
unsigned long node_remap_size[MAX_NUMNODES];
unsigned long node_remap_offset[MAX_NUMNODES];
void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
void *node_remap_end_vaddr[MAX_NUMNODES];
void *node_remap_alloc_vaddr[MAX_NUMNODES];
static unsigned long kva_start_pfn;
static unsigned long kva_pages;
/*
* FLAT - support for basic PC memory model with discontig enabled, essentially
* a single node with all available processors in it with a flat
* memory map.
*/
int __init get_memcfg_numa_flat(void)
{
printk("NUMA - single node, flat memory mode\n");
/* Run the memory configuration and find the top of memory. */
find_max_pfn();
node_start_pfn[0] = 0;
node_end_pfn[0] = max_pfn;
memory_present(0, 0, max_pfn);
/* Indicate there is one node available. */
nodes_clear(node_online_map);
node_set_online(0);
return 1;
}
/*
* Find the highest page frame number we have available for the node
*/
static void __init find_max_pfn_node(int nid)
{
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
/*
* if a user has given mem=XXXX, then we need to make sure
* that the node _starts_ before that, too, not just ends
*/
if (node_start_pfn[nid] > max_pfn)
node_start_pfn[nid] = max_pfn;
BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
}
/*
* Allocate memory for the pg_data_t for this node via a crude pre-bootmem
* method. For node zero take this from the bottom of memory, for
* subsequent nodes place them at node_remap_start_vaddr which contains
* node local data in physically node local memory. See setup_memory()
* for details.
*/
static void __init allocate_pgdat(int nid)
{
if (nid && node_has_online_mem(nid))
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
else {
NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
min_low_pfn += PFN_UP(sizeof(pg_data_t));
}
}
void *alloc_remap(int nid, unsigned long size)
{
void *allocation = node_remap_alloc_vaddr[nid];
size = ALIGN(size, L1_CACHE_BYTES);
if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
return 0;
node_remap_alloc_vaddr[nid] += size;
memset(allocation, 0, size);
return allocation;
}
void __init remap_numa_kva(void)
{
void *vaddr;
unsigned long pfn;
int node;
for_each_online_node(node) {
for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
set_pmd_pfn((ulong) vaddr,
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
}
}
static unsigned long calculate_numa_remap_pages(void)
{
int nid;
unsigned long size, reserve_pages = 0;
unsigned long pfn;
for_each_online_node(nid) {
unsigned old_end_pfn = node_end_pfn[nid];
/*
* The acpi/srat node info can show hot-add memroy zones
* where memory could be added but not currently present.
*/
if (node_start_pfn[nid] > max_pfn)
continue;
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
/* ensure the remap includes space for the pgdat. */
size = node_remap_size[nid] + sizeof(pg_data_t);
/* convert size to large (pmd size) pages, rounding up */
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
/* now the roundup is correct, convert to PAGE_SIZE pages */
size = size * PTRS_PER_PTE;
/*
* Validate the region we are allocating only contains valid
* pages.
*/
for (pfn = node_end_pfn[nid] - size;
pfn < node_end_pfn[nid]; pfn++)
if (!page_is_ram(pfn))
break;
if (pfn != node_end_pfn[nid])
size = 0;
printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
size, nid);
node_remap_size[nid] = size;
node_remap_offset[nid] = reserve_pages;
reserve_pages += size;
printk("Shrinking node %d from %ld pages to %ld pages\n",
nid, node_end_pfn[nid], node_end_pfn[nid] - size);
if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
/*
* Align node_end_pfn[] and node_remap_start_pfn[] to
* pmd boundary. remap_numa_kva will barf otherwise.
*/
printk("Shrinking node %d further by %ld pages for proper alignment\n",
nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
}
node_end_pfn[nid] -= size;
node_remap_start_pfn[nid] = node_end_pfn[nid];
shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
}
printk("Reserving total of %ld pages for numa KVA remap\n",
reserve_pages);
return reserve_pages;
}
extern void setup_bootmem_allocator(void);
unsigned long __init setup_memory(void)
{
int nid;
unsigned long system_start_pfn, system_max_low_pfn;
/*
* When mapping a NUMA machine we allocate the node_mem_map arrays
* from node local memory. They are then mapped directly into KVA
* between zone normal and vmalloc space. Calculate the size of
* this space and use it to adjust the boundry between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
find_max_pfn();
get_memcfg_numa();
kva_pages = calculate_numa_remap_pages();
/* partially used pages are not usable - thus round upwards */
system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
kva_start_pfn = find_max_low_pfn() - kva_pages;
#ifdef CONFIG_BLK_DEV_INITRD
/* Numa kva area is below the initrd */
if (LOADER_TYPE && INITRD_START)
kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages;
#endif
kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
system_max_low_pfn = max_low_pfn = find_max_low_pfn();
printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
kva_start_pfn, max_low_pfn);
printk("max_pfn = %ld\n", max_pfn);
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > system_max_low_pfn)
highstart_pfn = system_max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
num_physpages = system_max_low_pfn;
high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(system_max_low_pfn));
printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
min_low_pfn, max_low_pfn, highstart_pfn);
printk("Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
for_each_online_node(nid) {
node_remap_start_vaddr[nid] = pfn_to_kaddr(
kva_start_pfn + node_remap_offset[nid]);
/* Init the node remap allocator */
node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
(node_remap_size[nid] * PAGE_SIZE);
node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
ALIGN(sizeof(pg_data_t), PAGE_SIZE);
allocate_pgdat(nid);
printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
(ulong) node_remap_start_vaddr[nid],
(ulong) pfn_to_kaddr(highstart_pfn
+ node_remap_offset[nid] + node_remap_size[nid]));
}
printk("High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
for_each_online_node(nid)
find_max_pfn_node(nid);
memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
NODE_DATA(0)->bdata = &node0_bdata;
setup_bootmem_allocator();
return max_low_pfn;
}
void __init numa_kva_reserve(void)
{
reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
}
void __init zone_sizes_init(void)
{
int nid;
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] =
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif
/* If SRAT has not registered memory, register it now */
if (find_max_pfn_with_active_regions() == 0) {
for_each_online_node(nid) {
if (node_has_online_mem(nid))
add_active_range(nid, node_start_pfn[nid],
node_end_pfn[nid]);
}
}
free_area_init_nodes(max_zone_pfns);
return;
}
void __init set_highmem_pages_init(int bad_ppro)
{
#ifdef CONFIG_HIGHMEM
struct zone *zone;
struct page *page;
for_each_zone(zone) {
unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
if (!is_highmem(zone))
continue;
zone_start_pfn = zone->zone_start_pfn;
zone_end_pfn = zone_start_pfn + zone->spanned_pages;
printk("Initializing %s for node %d (%08lx:%08lx)\n",
zone->name, zone_to_nid(zone),
zone_start_pfn, zone_end_pfn);
for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
if (!pfn_valid(node_pfn))
continue;
page = pfn_to_page(node_pfn);
add_one_highpage_init(page, node_pfn, bad_ppro);
}
}
totalram_pages += totalhigh_pages;
#endif
}
#ifdef CONFIG_MEMORY_HOTPLUG
int paddr_to_nid(u64 addr)
{
int nid;
unsigned long pfn = PFN_DOWN(addr);
for_each_node(nid)
if (node_start_pfn[nid] <= pfn &&
pfn < node_end_pfn[nid])
return nid;
return -1;
}
/*
* This function is used to ask node id BEFORE memmap and mem_section's
* initialization (pfn_to_nid() can't be used yet).
* If _PXM is not defined on ACPI's DSDT, node id must be found by this.
*/
int memory_add_physaddr_to_nid(u64 addr)
{
int nid = paddr_to_nid(addr);
return (nid >= 0) ? nid : 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

35
arch/x86/mm/extable_32.c Normal file
View File

@@ -0,0 +1,35 @@
/*
* linux/arch/i386/mm/extable.c
*/
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/uaccess.h>
int fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *fixup;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
{
extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
extern u32 pnp_bios_is_utter_crap;
pnp_bios_is_utter_crap = 1;
printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
__asm__ volatile(
"movl %0, %%esp\n\t"
"jmp *%1\n\t"
: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
panic("do_trap: can't hit this");
}
#endif
fixup = search_exception_tables(regs->eip);
if (fixup) {
regs->eip = fixup->fixup;
return 1;
}
return 0;
}

657
arch/x86/mm/fault_32.c Normal file
View File

@@ -0,0 +1,657 @@
/*
* linux/arch/i386/mm/fault.c
*
* Copyright (C) 1995 Linus Torvalds
*/
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/bootmem.h> /* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
extern void die(const char *,struct pt_regs *,long);
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
int register_page_fault_notifier(struct notifier_block *nb)
{
vmalloc_sync_all();
return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);
int unregister_page_fault_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
static inline int notify_page_fault(struct pt_regs *regs, long err)
{
struct die_args args = {
.regs = regs,
.str = "page fault",
.err = err,
.trapnr = 14,
.signr = SIGSEGV
};
return atomic_notifier_call_chain(&notify_page_fault_chain,
DIE_PAGE_FAULT, &args);
}
/*
* Return EIP plus the CS segment base. The segment limit is also
* adjusted, clamped to the kernel/user address space (whichever is
* appropriate), and returned in *eip_limit.
*
* The segment is checked, because it might have been changed by another
* task between the original faulting instruction and here.
*
* If CS is no longer a valid code segment, or if EIP is beyond the
* limit, or if it is a kernel address when CS is not a kernel segment,
* then the returned value will be greater than *eip_limit.
*
* This is slow, but is very rarely executed.
*/
static inline unsigned long get_segment_eip(struct pt_regs *regs,
unsigned long *eip_limit)
{
unsigned long eip = regs->eip;
unsigned seg = regs->xcs & 0xffff;
u32 seg_ar, seg_limit, base, *desc;
/* Unlikely, but must come before segment checks. */
if (unlikely(regs->eflags & VM_MASK)) {
base = seg << 4;
*eip_limit = base + 0xffff;
return base + (eip & 0xffff);
}
/* The standard kernel/user address space limit. */
*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
/* By far the most common cases. */
if (likely(SEGMENT_IS_FLAT_CODE(seg)))
return eip;
/* Check the segment exists, is within the current LDT/GDT size,
that kernel/user (ring 0..3) has the appropriate privilege,
that it's a code segment, and get the limit. */
__asm__ ("larl %3,%0; lsll %3,%1"
: "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
if ((~seg_ar & 0x9800) || eip > seg_limit) {
*eip_limit = 0;
return 1; /* So that returned eip > *eip_limit. */
}
/* Get the GDT/LDT descriptor base.
When you look for races in this code remember that
LDT and other horrors are only used in user space. */
if (seg & (1<<2)) {
/* Must lock the LDT while reading it. */
down(&current->mm->context.sem);
desc = current->mm->context.ldt;
desc = (void *)desc + (seg & ~7);
} else {
/* Must disable preemption while reading the GDT. */
desc = (u32 *)get_cpu_gdt_table(get_cpu());
desc = (void *)desc + (seg & ~7);
}
/* Decode the code segment base from the descriptor */
base = get_desc_base((unsigned long *)desc);
if (seg & (1<<2)) {
up(&current->mm->context.sem);
} else
put_cpu();
/* Adjust EIP and segment limit, and clamp at the kernel limit.
It's legitimate for segments to wrap at 0xffffffff. */
seg_limit += base;
if (seg_limit < *eip_limit && seg_limit >= base)
*eip_limit = seg_limit;
return eip + base;
}
/*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
* Check that here and ignore it.
*/
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
unsigned long limit;
unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
int scan_more = 1;
int prefetch = 0;
int i;
for (i = 0; scan_more && i < 15; i++) {
unsigned char opcode;
unsigned char instr_hi;
unsigned char instr_lo;
if (instr > (unsigned char *)limit)
break;
if (probe_kernel_address(instr, opcode))
break;
instr_hi = opcode & 0xf0;
instr_lo = opcode & 0x0f;
instr++;
switch (instr_hi) {
case 0x20:
case 0x30:
/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
scan_more = ((instr_lo & 7) == 0x6);
break;
case 0x60:
/* 0x64 thru 0x67 are valid prefixes in all modes. */
scan_more = (instr_lo & 0xC) == 0x4;
break;
case 0xF0:
/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
scan_more = !instr_lo || (instr_lo>>1) == 1;
break;
case 0x00:
/* Prefetch instruction is 0x0F0D or 0x0F18 */
scan_more = 0;
if (instr > (unsigned char *)limit)
break;
if (probe_kernel_address(instr, opcode))
break;
prefetch = (instr_lo == 0xF) &&
(opcode == 0x0D || opcode == 0x18);
break;
default:
scan_more = 0;
break;
}
}
return prefetch;
}
static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
unsigned long error_code)
{
if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 >= 6)) {
/* Catch an obscure case of prefetch inside an NX page. */
if (nx_enabled && (error_code & 16))
return 0;
return __is_prefetch(regs, addr);
}
return 0;
}
static noinline void force_sig_info_fault(int si_signo, int si_code,
unsigned long address, struct task_struct *tsk)
{
siginfo_t info;
info.si_signo = si_signo;
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void __user *)address;
force_sig_info(si_signo, &info, tsk);
}
fastcall void do_invalid_op(struct pt_regs *, unsigned long);
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
unsigned index = pgd_index(address);
pgd_t *pgd_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pgd += index;
pgd_k = init_mm.pgd + index;
if (!pgd_present(*pgd_k))
return NULL;
/*
* set_pgd(pgd, *pgd_k); here would be useless on PAE
* and redundant with the set_pmd() on non-PAE. As would
* set_pud.
*/
pud = pud_offset(pgd, address);
pud_k = pud_offset(pgd_k, address);
if (!pud_present(*pud_k))
return NULL;
pmd = pmd_offset(pud, address);
pmd_k = pmd_offset(pud_k, address);
if (!pmd_present(*pmd_k))
return NULL;
if (!pmd_present(*pmd)) {
set_pmd(pmd, *pmd_k);
arch_flush_lazy_mmu_mode();
} else
BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
return pmd_k;
}
/*
* Handle a fault on the vmalloc or module mapping area
*
* This assumes no large pages in there.
*/
static inline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
*
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
pgd_paddr = read_cr3();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
return -1;
return 0;
}
int show_unhandled_signals = 1;
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*
* error_code:
* bit 0 == 0 means no page found, 1 means protection fault
* bit 1 == 0 means read, 1 means write
* bit 2 == 0 means kernel, 1 means user-mode
* bit 3 == 1 means use of reserved bit detected
* bit 4 == 1 means fault was an instruction fetch
*/
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
unsigned long error_code)
{
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct * vma;
unsigned long address;
int write, si_code;
int fault;
/* get the address */
address = read_cr2();
tsk = current;
si_code = SEGV_MAPERR;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
if (unlikely(address >= TASK_SIZE)) {
if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
return;
if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
*/
goto bad_area_nosemaphore;
}
if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
fault has been handled. */
if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
local_irq_enable();
mm = tsk->mm;
/*
* If we're in an interrupt, have no user context or are running in an
* atomic region then we must not take the fault..
*/
if (in_atomic() || !mm)
goto bad_area_nosemaphore;
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunatly, in the case of an
* erroneous fault occurring in a code path which already holds mmap_sem
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
* exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibilty of a deadlock.
* Attempt to lock the address space, if we cannot we then validate the
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
if ((error_code & 4) == 0 &&
!search_exception_tables(regs->eip))
goto bad_area_nosemaphore;
down_read(&mm->mmap_sem);
}
vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
goto good_area;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
if (error_code & 4) {
/*
* Accessing the stack below %esp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535,$31" pushes
* 32 pointers and then decrements %esp by 65535.)
*/
if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
goto bad_area;
}
if (expand_stack(vma, address))
goto bad_area;
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area:
si_code = SEGV_ACCERR;
write = 0;
switch (error_code & 3) {
default: /* 3: write, present */
/* fall through */
case 2: /* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
write++;
break;
case 1: /* read, present */
goto bad_area;
case 0: /* read, not present */
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
survive:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
fault = handle_mm_fault(mm, vma, address, write);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
else if (fault & VM_FAULT_SIGBUS)
goto do_sigbus;
BUG();
}
if (fault & VM_FAULT_MAJOR)
tsk->maj_flt++;
else
tsk->min_flt++;
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
if (regs->eflags & VM_MASK) {
unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
tsk->thread.screen_bitmap |= 1 << bit;
}
up_read(&mm->mmap_sem);
return;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
bad_area:
up_read(&mm->mmap_sem);
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
if (error_code & 4) {
/*
* It's possible to have interrupts off here.
*/
local_irq_enable();
/*
* Valid to do another page fault here because this one came
* from user space.
*/
if (is_prefetch(regs, address, error_code))
return;
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
printk("%s%s[%d]: segfault at %08lx eip %08lx "
"esp %08lx error %lx\n",
tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, tsk->pid, address, regs->eip,
regs->esp, error_code);
}
tsk->thread.cr2 = address;
/* Kernel addresses are always protection faults */
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGSEGV, si_code, address, tsk);
return;
}
#ifdef CONFIG_X86_F00F_BUG
/*
* Pentium F0 0F C7 C8 bug workaround.
*/
if (boot_cpu_data.f00f_bug) {
unsigned long nr;
nr = (address - idt_descr.address) >> 3;
if (nr == 6) {
do_invalid_op(regs, 0);
return;
}
}
#endif
no_context:
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs))
return;
/*
* Valid to do another page fault here, because if this fault
* had been triggered by is_prefetch fixup_exception would have
* handled it.
*/
if (is_prefetch(regs, address, error_code))
return;
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
*/
bust_spinlocks(1);
if (oops_may_print()) {
__typeof__(pte_val(__pte(0))) page;
#ifdef CONFIG_X86_PAE
if (error_code & 16) {
pte_t *pte = lookup_address(address);
if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
printk(KERN_CRIT "kernel tried to execute "
"NX-protected page - exploit attempt? "
"(uid: %d)\n", current->uid);
}
#endif
if (address < PAGE_SIZE)
printk(KERN_ALERT "BUG: unable to handle kernel NULL "
"pointer dereference");
else
printk(KERN_ALERT "BUG: unable to handle kernel paging"
" request");
printk(" at virtual address %08lx\n",address);
printk(KERN_ALERT " printing eip:\n");
printk("%08lx\n", regs->eip);
page = read_cr3();
page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
printk(KERN_ALERT "*pdpt = %016Lx\n", page);
if ((page >> PAGE_SHIFT) < max_low_pfn
&& page & _PAGE_PRESENT) {
page &= PAGE_MASK;
page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
& (PTRS_PER_PMD - 1)];
printk(KERN_ALERT "*pde = %016Lx\n", page);
page &= ~_PAGE_NX;
}
#else
printk(KERN_ALERT "*pde = %08lx\n", page);
#endif
/*
* We must not directly access the pte in the highpte
* case if the page table is located in highmem.
* And let's rather not kmap-atomic the pte, just in case
* it's allocated already.
*/
if ((page >> PAGE_SHIFT) < max_low_pfn
&& (page & _PAGE_PRESENT)) {
page &= PAGE_MASK;
page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
& (PTRS_PER_PTE - 1)];
printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
}
}
tsk->thread.cr2 = address;
tsk->thread.trap_no = 14;
tsk->thread.error_code = error_code;
die("Oops", regs, error_code);
bust_spinlocks(0);
do_exit(SIGKILL);
/*
* We ran out of memory, or some other thing happened to us that made
* us unable to handle the page fault gracefully.
*/
out_of_memory:
up_read(&mm->mmap_sem);
if (is_init(tsk)) {
yield();
down_read(&mm->mmap_sem);
goto survive;
}
printk("VM: killing process %s\n", tsk->comm);
if (error_code & 4)
do_exit(SIGKILL);
goto no_context;
do_sigbus:
up_read(&mm->mmap_sem);
/* Kernel mode? Handle exceptions or die */
if (!(error_code & 4))
goto no_context;
/* User space => ok to do another page fault */
if (is_prefetch(regs, address, error_code))
return;
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}
void vmalloc_sync_all(void)
{
/*
* Note that races in the updates of insync and start aren't
* problematic: insync can only get set bits added, and updates to
* start are only improving performance (without affecting correctness
* if undone).
*/
static DECLARE_BITMAP(insync, PTRS_PER_PGD);
static unsigned long start = TASK_SIZE;
unsigned long address;
if (SHARED_KERNEL_PMD)
return;
BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
if (!test_bit(pgd_index(address), insync)) {
unsigned long flags;
struct page *page;
spin_lock_irqsave(&pgd_lock, flags);
for (page = pgd_list; page; page =
(struct page *)page->index)
if (!vmalloc_sync_one(page_address(page),
address)) {
BUG_ON(page != pgd_list);
break;
}
spin_unlock_irqrestore(&pgd_lock, flags);
if (!page)
set_bit(pgd_index(address), insync);
}
if (address == start && test_bit(pgd_index(address), insync))
start = address + PGDIR_SIZE;
}
}

113
arch/x86/mm/highmem_32.c Normal file
View File

@@ -0,0 +1,113 @@
#include <linux/highmem.h>
#include <linux/module.h>
void *kmap(struct page *page)
{
might_sleep();
if (!PageHighMem(page))
return page_address(page);
return kmap_high(page);
}
void kunmap(struct page *page)
{
if (in_interrupt())
BUG();
if (!PageHighMem(page))
return;
kunmap_high(page);
}
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
* invalidation when the kmap pool wraps.
*
* However when holding an atomic kmap is is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
void *kmap_atomic(struct page *page, enum km_type type)
{
return kmap_atomic_prot(page, type, kmap_prot);
}
void kunmap_atomic(void *kvaddr, enum km_type type)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
/*
* Force other mappings to Oops if they'll try to access this pte
* without first remap it. Keeping stale mappings around is a bad idea
* also, in case the page changes cacheability attributes or becomes
* a protected page in a hypervisor.
*/
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
kpte_clear_flush(kmap_pte-idx, vaddr);
else {
#ifdef CONFIG_DEBUG_HIGHMEM
BUG_ON(vaddr < PAGE_OFFSET);
BUG_ON(vaddr >= (unsigned long)high_memory);
#endif
}
arch_flush_lazy_mmu_mode();
pagefault_enable();
}
/* This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
arch_flush_lazy_mmu_mode();
return (void*) vaddr;
}
struct page *kmap_atomic_to_page(void *ptr)
{
unsigned long idx, vaddr = (unsigned long)ptr;
pte_t *pte;
if (vaddr < FIXADDR_START)
return virt_to_page(ptr);
idx = virt_to_fix(vaddr);
pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
return pte_page(*pte);
}
EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
EXPORT_SYMBOL(kmap_atomic_to_page);

391
arch/x86/mm/hugetlbpage.c Normal file
View File

@@ -0,0 +1,391 @@
/*
* IA-32 Huge TLB Page Support for Kernel.
*
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
*/
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
{
unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
svma->vm_start;
unsigned long sbase = saddr & PUD_MASK;
unsigned long s_end = sbase + PUD_SIZE;
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vma->vm_flags != svma->vm_flags ||
sbase < svma->vm_start || svma->vm_end < s_end)
return 0;
return saddr;
}
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
/*
* check on proper vm_flags and page table alignment
*/
if (vma->vm_flags & VM_MAYSHARE &&
vma->vm_start <= base && end <= vma->vm_end)
return 1;
return 0;
}
/*
* search for a shareable pmd page for hugetlb.
*/
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
struct prio_tree_iter iter;
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
if (!vma_shareable(vma, addr))
return;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
get_page(virt_to_page(spte));
break;
}
}
}
if (!spte)
goto out;
spin_lock(&mm->page_table_lock);
if (pud_none(*pud))
pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
else
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
spin_unlock(&mapping->i_mmap_lock);
}
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* called with vma->vm_mm->page_table_lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
pud_t *pud = pud_offset(pgd, *addr);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
pud_clear(pud);
put_page(virt_to_page(ptep));
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
if (pud) {
if (pud_none(*pud))
huge_pmd_share(mm, addr, pud);
pte = (pte_t *) pmd_alloc(mm, pud, addr);
}
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
return pte;
}
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
pgd = pgd_offset(mm, addr);
if (pgd_present(*pgd)) {
pud = pud_offset(pgd, addr);
if (pud_present(*pud))
pmd = pmd_offset(pud, addr);
}
return (pte_t *) pmd;
}
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
unsigned long start = address;
int length = 1;
int nr;
struct page *page;
struct vm_area_struct *vma;
vma = find_vma(mm, addr);
if (!vma || !is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
pte = huge_pte_offset(mm, address);
/* hugetlb should be locked, and hence, prefaulted */
WARN_ON(!pte || pte_none(*pte));
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
WARN_ON(!PageCompound(page));
return page;
}
int pmd_huge(pmd_t pmd)
{
return 0;
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
return NULL;
}
#else
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
return ERR_PTR(-EINVAL);
}
int pmd_huge(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_PSE);
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
struct page *page;
page = pte_page(*(pte_t *)pmd);
if (page)
page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
return page;
}
#endif
/* x86_64 also uses this file */
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long start_addr;
if (len > mm->cached_hole_size) {
start_addr = mm->free_area_cache;
} else {
start_addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
}
full_search:
addr = ALIGN(start_addr, HPAGE_SIZE);
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
* Start a new search - just in case we missed
* some holes.
*/
if (start_addr != TASK_UNMAPPED_BASE) {
start_addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
if (!vma || addr + len <= vma->vm_start) {
mm->free_area_cache = addr + len;
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
mm->cached_hole_size = vma->vm_start - addr;
addr = ALIGN(vma->vm_end, HPAGE_SIZE);
}
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long addr0, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev_vma;
unsigned long base = mm->mmap_base, addr = addr0;
unsigned long largest_hole = mm->cached_hole_size;
int first_time = 1;
/* don't allow allocations above current base */
if (mm->free_area_cache > base)
mm->free_area_cache = base;
if (len <= largest_hole) {
largest_hole = 0;
mm->free_area_cache = base;
}
try_again:
/* make sure it can fit in the remaining address space */
if (mm->free_area_cache < len)
goto fail;
/* either no address requested or cant fit in requested address hole */
addr = (mm->free_area_cache - len) & HPAGE_MASK;
do {
/*
* Lookup failure means no vma is above this address,
* i.e. return with success:
*/
if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
return addr;
/*
* new region fits between prev_vma->vm_end and
* vma->vm_start, use it:
*/
if (addr + len <= vma->vm_start &&
(!prev_vma || (addr >= prev_vma->vm_end))) {
/* remember the address as a hint for next time */
mm->cached_hole_size = largest_hole;
return (mm->free_area_cache = addr);
} else {
/* pull free_area_cache down to the first hole */
if (mm->free_area_cache == vma->vm_end) {
mm->free_area_cache = vma->vm_start;
mm->cached_hole_size = largest_hole;
}
}
/* remember the largest hole we saw so far */
if (addr + largest_hole < vma->vm_start)
largest_hole = vma->vm_start - addr;
/* try just below the current vma->vm_start */
addr = (vma->vm_start - len) & HPAGE_MASK;
} while (len <= vma->vm_start);
fail:
/*
* if hint left us with no space for the requested
* mapping then try again:
*/
if (first_time) {
mm->free_area_cache = base;
largest_hole = 0;
first_time = 0;
goto try_again;
}
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
len, pgoff, flags);
/*
* Restore the topdown base:
*/
mm->free_area_cache = base;
mm->cached_hole_size = ~0UL;
return addr;
}
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
if (len & ~HPAGE_MASK)
return -EINVAL;
if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED) {
if (prepare_hugepage_range(addr, len))
return -EINVAL;
return addr;
}
if (addr) {
addr = ALIGN(addr, HPAGE_SIZE);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

858
arch/x86/mm/init_32.c Normal file
View File

@@ -0,0 +1,858 @@
/*
* linux/arch/i386/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
*
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/efi.h>
#include <linux/memory_hotplug.h>
#include <linux/initrd.h>
#include <linux/cpumask.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
static int noinline do_test_wp_bit(void);
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
* in non-PAE compilation mode, since the middle layer is folded.
*/
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
pud_t *pud;
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
if (pmd_table != pmd_offset(pud, 0))
BUG();
}
#endif
pud = pud_offset(pgd, 0);
pmd_table = pmd_offset(pud, 0);
return pmd_table;
}
/*
* Create a page table and place a pointer to it in a middle page
* directory entry.
*/
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
return pte_offset_kernel(pmd, 0);
}
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
* the given range.
*/
/*
* NOTE: The pagetables are allocated contiguous on the physical space
* so we can cache the place of the first one and move around without
* checking the pgd every time.
*/
static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
{
pgd_t *pgd;
pmd_t *pmd;
int pgd_idx, pmd_idx;
unsigned long vaddr;
vaddr = start;
pgd_idx = pgd_index(vaddr);
pmd_idx = pmd_index(vaddr);
pgd = pgd_base + pgd_idx;
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
one_page_table_init(pmd);
vaddr += PMD_SIZE;
}
pmd_idx = 0;
}
}
static inline int is_kernel_text(unsigned long addr)
{
if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
/*
* This maps the physical memory to kernel virtual address space, a total
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET.
*/
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
int pgd_idx, pmd_idx, pte_ofs;
pgd_idx = pgd_index(PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
pfn = 0;
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
if (pfn >= max_low_pfn)
continue;
for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
/* Map with big pages if possible, otherwise create normal page tables. */
if (cpu_has_pse) {
unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
if (is_kernel_text(address) || is_kernel_text(address2))
set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
else
set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
pfn += PTRS_PER_PTE;
} else {
pte = one_page_table_init(pmd);
for (pte_ofs = 0;
pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
if (is_kernel_text(address))
set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
else
set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
}
}
}
}
}
static inline int page_kills_ppro(unsigned long pagenr)
{
if (pagenr >= 0x70000 && pagenr <= 0x7003F)
return 1;
return 0;
}
int page_is_ram(unsigned long pagenr)
{
int i;
unsigned long addr, end;
if (efi_enabled) {
efi_memory_desc_t *md;
void *p;
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if (!is_available_memory(md))
continue;
addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
if ((pagenr >= addr) && (pagenr < end))
return 1;
}
return 0;
}
for (i = 0; i < e820.nr_map; i++) {
if (e820.map[i].type != E820_RAM) /* not usable memory */
continue;
/*
* !!!FIXME!!! Some BIOSen report areas as RAM that
* are not. Notably the 640->1Mb area. We need a sanity
* check here.
*/
addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
if ((pagenr >= addr) && (pagenr < end))
return 1;
}
return 0;
}
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
#define kmap_get_fixmap_pte(vaddr) \
pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
static void __init kmap_init(void)
{
unsigned long kmap_vstart;
/* cache the first kmap pte */
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
kmap_prot = PAGE_KERNEL;
}
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned long vaddr;
vaddr = PKMAP_BASE;
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
pkmap_page_table = pte;
}
static void __meminit free_new_highpage(struct page *page)
{
init_page_count(page);
__free_page(page);
totalhigh_pages++;
}
void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
ClearPageReserved(page);
free_new_highpage(page);
} else
SetPageReserved(page);
}
static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
{
free_new_highpage(page);
totalram_pages++;
#ifdef CONFIG_FLATMEM
max_mapnr = max(pfn, max_mapnr);
#endif
num_physpages++;
return 0;
}
/*
* Not currently handling the NUMA case.
* Assuming single node and all memory that
* has been added dynamically that would be
* onlined here is in HIGHMEM
*/
void __meminit online_page(struct page *page)
{
ClearPageReserved(page);
add_one_highpage_hotplug(page, page_to_pfn(page));
}
#ifdef CONFIG_NUMA
extern void set_highmem_pages_init(int);
#else
static void __init set_highmem_pages_init(int bad_ppro)
{
int pfn;
for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
totalram_pages += totalhigh_pages;
}
#endif /* CONFIG_FLATMEM */
#else
#define kmap_init() do { } while (0)
#define permanent_kmaps_init(pgd_base) do { } while (0)
#define set_highmem_pages_init(bad_ppro) do { } while (0)
#endif /* CONFIG_HIGHMEM */
unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
EXPORT_SYMBOL(__PAGE_KERNEL);
unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
#ifdef CONFIG_NUMA
extern void __init remap_numa_kva(void);
#else
#define remap_numa_kva() do {} while (0)
#endif
void __init native_pagetable_setup_start(pgd_t *base)
{
#ifdef CONFIG_X86_PAE
int i;
/*
* Init entries of the first-level page table to the
* zero page, if they haven't already been set up.
*
* In a normal native boot, we'll be running on a
* pagetable rooted in swapper_pg_dir, but not in PAE
* mode, so this will end up clobbering the mappings
* for the lower 24Mbytes of the address space,
* without affecting the kernel address space.
*/
for (i = 0; i < USER_PTRS_PER_PGD; i++)
set_pgd(&base[i],
__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
/* Make sure kernel address space is empty so that a pagetable
will be allocated for it. */
memset(&base[USER_PTRS_PER_PGD], 0,
KERNEL_PGD_PTRS * sizeof(pgd_t));
#else
paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
#endif
}
void __init native_pagetable_setup_done(pgd_t *base)
{
#ifdef CONFIG_X86_PAE
/*
* Add low memory identity-mappings - SMP needs it when
* starting up on an AP from real-mode. In the non-PAE
* case we already have these mappings through head.S.
* All user-space mappings are explicitly cleared after
* SMP startup.
*/
set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
#endif
}
/*
* Build a proper pagetable for the kernel mappings. Up until this
* point, we've been running on some set of pagetables constructed by
* the boot process.
*
* If we're booting on native hardware, this will be a pagetable
* constructed in arch/i386/kernel/head.S, and not running in PAE mode
* (even if we'll end up running in PAE). The root of the pagetable
* will be swapper_pg_dir.
*
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
* or may not be based in swapper_pg_dir. In any case,
* paravirt_pagetable_setup_start() will set up swapper_pg_dir
* appropriately for the rest of the initialization to work.
*
* In general, pagetable_init() assumes that the pagetable may already
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
static void __init pagetable_init (void)
{
unsigned long vaddr, end;
pgd_t *pgd_base = swapper_pg_dir;
paravirt_pagetable_setup_start(pgd_base);
/* Enable PSE if available */
if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
set_in_cr4(X86_CR4_PGE);
__PAGE_KERNEL |= _PAGE_GLOBAL;
__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
}
kernel_physical_mapping_init(pgd_base);
remap_numa_kva();
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
permanent_kmaps_init(pgd_base);
paravirt_pagetable_setup_done(pgd_base);
}
#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
/*
* Swap suspend & friends need this for resume because things like the intel-agp
* driver might have split up a kernel 4MB mapping.
*/
char __nosavedata swsusp_pg_dir[PAGE_SIZE]
__attribute__ ((aligned (PAGE_SIZE)));
static inline void save_pg_dir(void)
{
memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
}
#else
static inline void save_pg_dir(void)
{
}
#endif
void zap_low_mappings (void)
{
int i;
save_pg_dir();
/*
* Zap initial low-memory mappings.
*
* Note that "pgd_clear()" doesn't do it for
* us, because pgd_clear() is a no-op on i386.
*/
for (i = 0; i < USER_PTRS_PER_PGD; i++)
#ifdef CONFIG_X86_PAE
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
flush_tlb_all();
}
int nx_enabled = 0;
#ifdef CONFIG_X86_PAE
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/*
* noexec = on|off
*
* Control non executable mappings.
*
* on Enable
* off Disable
*/
static int __init noexec_setup(char *str)
{
if (!str || !strcmp(str, "on")) {
if (cpu_has_nx) {
__supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
}
} else if (!strcmp(str,"off")) {
disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
} else
return -EINVAL;
return 0;
}
early_param("noexec", noexec_setup);
static void __init set_nx(void)
{
unsigned int v[4], l, h;
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
if ((v[3] & (1 << 20)) && !disable_nx) {
rdmsr(MSR_EFER, l, h);
l |= EFER_NX;
wrmsr(MSR_EFER, l, h);
nx_enabled = 1;
__supported_pte_mask |= _PAGE_NX;
}
}
}
/*
* Enables/disables executability of a given kernel page and
* returns the previous setting.
*/
int __init set_kernel_exec(unsigned long vaddr, int enable)
{
pte_t *pte;
int ret = 1;
if (!nx_enabled)
goto out;
pte = lookup_address(vaddr);
BUG_ON(!pte);
if (!pte_exec_kernel(*pte))
ret = 0;
if (enable)
pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
else
pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
pte_update_defer(&init_mm, vaddr, pte);
__flush_tlb_all();
out:
return ret;
}
#endif
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
*
* This routines also unmaps the page at virtual kernel address 0, so
* that we can trap those pesky NULL-reference errors in the kernel.
*/
void __init paging_init(void)
{
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk("NX (Execute Disable) protection: active\n");
#endif
pagetable_init();
load_cr3(swapper_pg_dir);
#ifdef CONFIG_X86_PAE
/*
* We will bail out later - printk doesn't work right now so
* the user would just see a hanging kernel.
*/
if (cpu_has_pae)
set_in_cr4(X86_CR4_PAE);
#endif
__flush_tlb_all();
kmap_init();
}
/*
* Test if the WP bit works in supervisor mode. It isn't supported on 386's
* and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
* used to involve black magic jumps to work around some nasty CPU bugs,
* but fortunately the switch to using exceptions got rid of all that.
*/
static void __init test_wp_bit(void)
{
printk("Checking if this processor honours the WP bit even in supervisor mode... ");
/* Any page-aligned address will do, the test is non-destructive */
__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
boot_cpu_data.wp_works_ok = do_test_wp_bit();
clear_fixmap(FIX_WP_TEST);
if (!boot_cpu_data.wp_works_ok) {
printk("No.\n");
#ifdef CONFIG_X86_WP_WORKS_OK
panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
#endif
} else {
printk("Ok.\n");
}
}
static struct kcore_list kcore_mem, kcore_vmalloc;
void __init mem_init(void)
{
extern int ppro_with_ram_bug(void);
int codesize, reservedpages, datasize, initsize;
int tmp;
int bad_ppro;
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
bad_ppro = ppro_with_ram_bug();
#ifdef CONFIG_HIGHMEM
/* check that fixmap and pkmap do not overlap */
if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
BUG();
}
#endif
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
/*
* Only count reserved RAM pages
*/
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
set_highmem_pages_init(bad_ppro);
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
VMALLOC_END-VMALLOC_START);
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10,
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);
#if 1 /* double-sanity-check paranoia */
printk("virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif
VMALLOC_START, VMALLOC_END,
(VMALLOC_END - VMALLOC_START) >> 20,
(unsigned long)__va(0), (unsigned long)high_memory,
((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
(unsigned long)&__init_begin, (unsigned long)&__init_end,
((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
(unsigned long)&_etext, (unsigned long)&_edata,
((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
BUG_ON(VMALLOC_START > VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
#endif /* double-sanity-check paranoia */
#ifdef CONFIG_X86_PAE
if (!cpu_has_pae)
panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
/*
* Subtle. SMP is doing it's boot stuff late (because it has to
* fork idle threads) - but it also needs low mappings for the
* protected-mode entry to work. We zap these entries only after
* the WP-bit has been tested.
*/
#ifndef CONFIG_SMP
zap_low_mappings();
#endif
}
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata = NODE_DATA(nid);
struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
return __add_pages(zone, start_pfn, nr_pages);
}
int remove_memory(u64 start, u64 size)
{
return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif
struct kmem_cache *pmd_cache;
void __init pgtable_cache_init(void)
{
size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
if (PTRS_PER_PMD > 1) {
pmd_cache = kmem_cache_create("pmd",
PTRS_PER_PMD*sizeof(pmd_t),
PTRS_PER_PMD*sizeof(pmd_t),
SLAB_PANIC,
pmd_ctor);
if (!SHARED_KERNEL_PMD) {
/* If we're in PAE mode and have a non-shared
kernel pmd, then the pgd size must be a
page size. This is because the pgd_list
links through the page structure, so there
can only be one pgd per page for this to
work. */
pgd_size = PAGE_SIZE;
}
}
}
/*
* This function cannot be __init, since exceptions don't work in that
* section. Put this after the callers, so that it cannot be inlined.
*/
static int noinline do_test_wp_bit(void)
{
char tmp_reg;
int flag;
__asm__ __volatile__(
" movb %0,%1 \n"
"1: movb %1,%0 \n"
" xorl %2,%2 \n"
"2: \n"
".section __ex_table,\"a\"\n"
" .align 4 \n"
" .long 1b,2b \n"
".previous \n"
:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
"=q" (tmp_reg),
"=r" (flag)
:"2" (1)
:"memory");
return flag;
}
#ifdef CONFIG_DEBUG_RODATA
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
#ifndef CONFIG_KPROBES
#ifdef CONFIG_HOTPLUG_CPU
/* It must still be possible to apply SMP alternatives. */
if (num_possible_cpus() <= 1)
#endif
{
change_page_attr(virt_to_page(start),
size >> PAGE_SHIFT, PAGE_KERNEL_RX);
printk("Write protecting the kernel text: %luk\n", size >> 10);
}
#endif
start += size;
size = (unsigned long)__end_rodata - start;
change_page_attr(virt_to_page(start),
size >> PAGE_SHIFT, PAGE_KERNEL_RO);
printk("Write protecting the kernel read-only data: %luk\n",
size >> 10);
/*
* change_page_attr() requires a global_flush_tlb() call after it.
* We do this after the printk so that if something went wrong in the
* change, the printk gets out at least to give a better debug hint
* of who is the culprit.
*/
global_flush_tlb();
}
#endif
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr;
for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
free_page(addr);
totalram_pages++;
}
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
}
void free_initmem(void)
{
free_init_pages("unused kernel memory",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
free_init_pages("initrd memory", start, end);
}
#endif

274
arch/x86/mm/ioremap_32.c Normal file
View File

@@ -0,0 +1,274 @@
/*
* arch/i386/mm/ioremap.c
*
* Re-map IO memory to kernel address space so that we can access it.
* This is needed for high PCI addresses that aren't mapped in the
* 640k-1MB IO memory area on PC's
*
* (C) Copyright 1995 1996 Linus Torvalds
*/
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/io.h>
#include <asm/fixmap.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#define ISA_START_ADDRESS 0xa0000
#define ISA_END_ADDRESS 0x100000
/*
* Generic mapping function (not visible outside):
*/
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
* directly.
*
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
{
void __iomem * addr;
struct vm_struct * area;
unsigned long offset, last_addr;
pgprot_t prot;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
return (void __iomem *) phys_to_virt(phys_addr);
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
if (phys_addr <= virt_to_phys(high_memory - 1)) {
char *t_addr, *t_end;
struct page *page;
t_addr = __va(phys_addr);
t_end = t_addr + (size - 1);
for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
if(!PageReserved(page))
return NULL;
}
prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
| _PAGE_ACCESSED | flags);
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
/*
* Ok, go for it..
*/
area = get_vm_area(size, VM_IOREMAP | (flags << 20));
if (!area)
return NULL;
area->phys_addr = phys_addr;
addr = (void __iomem *) area->addr;
if (ioremap_page_range((unsigned long) addr,
(unsigned long) addr + size, phys_addr, prot)) {
vunmap((void __force *) addr);
return NULL;
}
return (void __iomem *) (offset + (char __iomem *)addr);
}
EXPORT_SYMBOL(__ioremap);
/**
* ioremap_nocache - map bus memory into CPU space
* @offset: bus address of the memory
* @size: size of the resource to map
*
* ioremap_nocache performs a platform specific sequence of operations to
* make bus memory CPU accessible via the readb/readw/readl/writeb/
* writew/writel functions and the other mmio helpers. The returned
* address is not guaranteed to be usable directly as a virtual
* address.
*
* This version of ioremap ensures that the memory is marked uncachable
* on the CPU as well as honouring existing caching rules from things like
* the PCI bus. Note that there are other caches and buffers on many
* busses. In particular driver authors should read up on PCI writes
*
* It's useful if some control registers are in such an area and
* write combining or read caching is not desirable:
*
* Must be freed with iounmap.
*/
void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
unsigned long last_addr;
void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
if (!p)
return p;
/* Guaranteed to be > phys_addr, as per __ioremap() */
last_addr = phys_addr + size - 1;
if (last_addr < virt_to_phys(high_memory) - 1) {
struct page *ppage = virt_to_page(__va(phys_addr));
unsigned long npages;
phys_addr &= PAGE_MASK;
/* This might overflow and become zero.. */
last_addr = PAGE_ALIGN(last_addr);
/* .. but that's ok, because modulo-2**n arithmetic will make
* the page-aligned "last - first" come out right.
*/
npages = (last_addr - phys_addr) >> PAGE_SHIFT;
if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
iounmap(p);
p = NULL;
}
global_flush_tlb();
}
return p;
}
EXPORT_SYMBOL(ioremap_nocache);
/**
* iounmap - Free a IO remapping
* @addr: virtual address from ioremap_*
*
* Caller must ensure there is only one unmapping for the same pointer.
*/
void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
if ((void __force *)addr <= high_memory)
return;
/*
* __ioremap special-cases the PCI/ISA range by not instantiating a
* vm_area and by simply returning an address into the kernel mapping
* of ISA space. So handle that here.
*/
if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
addr < phys_to_virt(ISA_END_ADDRESS))
return;
addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
read_lock(&vmlist_lock);
for (p = vmlist; p; p = p->next) {
if (p->addr == addr)
break;
}
read_unlock(&vmlist_lock);
if (!p) {
printk("iounmap: bad address %p\n", addr);
dump_stack();
return;
}
/* Reset the direct mapping. Can block */
if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
get_vm_area_size(p) >> PAGE_SHIFT,
PAGE_KERNEL);
global_flush_tlb();
}
/* Finally remove it */
o = remove_vm_area((void *)addr);
BUG_ON(p != o || o == NULL);
kfree(p);
}
EXPORT_SYMBOL(iounmap);
void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
{
unsigned long offset, last_addr;
unsigned int nrpages;
enum fixed_addresses idx;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
return phys_to_virt(phys_addr);
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr) - phys_addr;
/*
* Mappings have to fit in the FIX_BTMAP area.
*/
nrpages = size >> PAGE_SHIFT;
if (nrpages > NR_FIX_BTMAPS)
return NULL;
/*
* Ok, go for it..
*/
idx = FIX_BTMAP_BEGIN;
while (nrpages > 0) {
set_fixmap(idx, phys_addr);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
}
return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
}
void __init bt_iounmap(void *addr, unsigned long size)
{
unsigned long virt_addr;
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
virt_addr = (unsigned long)addr;
if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
return;
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN;
while (nrpages > 0) {
clear_fixmap(idx);
--idx;
--nrpages;
}
}

77
arch/x86/mm/mmap_32.c Normal file
View File

@@ -0,0 +1,77 @@
/*
* linux/arch/i386/mm/mmap.c
*
* flexible mmap layout support
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* Started by Ingo Molnar <mingo@elte.hu>
*/
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched.h>
/*
* Top of mmap area (just below the process stack).
*
* Leave an at least ~128 MB hole.
*/
#define MIN_GAP (128*1024*1024)
#define MAX_GAP (TASK_SIZE/6*5)
static inline unsigned long mmap_base(struct mm_struct *mm)
{
unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
unsigned long random_factor = 0;
if (current->flags & PF_RANDOMIZE)
random_factor = get_random_int() % (1024*1024);
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
if (sysctl_legacy_va_layout ||
(current->personality & ADDR_COMPAT_LAYOUT) ||
current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(mm);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
mm->unmap_area = arch_unmap_area_topdown;
}
}

278
arch/x86/mm/pageattr_32.c Normal file
View File

@@ -0,0 +1,278 @@
/*
* Copyright 2002 Andi Kleen, SuSE Labs.
* Thanks to Ben LaHaise for precious feedback.
*/
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
static DEFINE_SPINLOCK(cpa_lock);
static struct list_head df_list = LIST_HEAD_INIT(df_list);
pte_t *lookup_address(unsigned long address)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
if (pgd_none(*pgd))
return NULL;
pud = pud_offset(pgd, address);
if (pud_none(*pud))
return NULL;
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
if (pmd_large(*pmd))
return (pte_t *)pmd;
return pte_offset_kernel(pmd, address);
}
static struct page *split_large_page(unsigned long address, pgprot_t prot,
pgprot_t ref_prot)
{
int i;
unsigned long addr;
struct page *base;
pte_t *pbase;
spin_unlock_irq(&cpa_lock);
base = alloc_pages(GFP_KERNEL, 0);
spin_lock_irq(&cpa_lock);
if (!base)
return NULL;
/*
* page_private is used to track the number of entries in
* the page table page that have non standard attributes.
*/
SetPagePrivate(base);
page_private(base) = 0;
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));
}
return base;
}
static void cache_flush_page(struct page *p)
{
unsigned long adr = (unsigned long)page_address(p);
int i;
for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
asm volatile("clflush (%0)" :: "r" (adr + i));
}
static void flush_kernel_map(void *arg)
{
struct list_head *lh = (struct list_head *)arg;
struct page *p;
/* High level code is not ready for clflush yet */
if (0 && cpu_has_clflush) {
list_for_each_entry (p, lh, lru)
cache_flush_page(p);
} else if (boot_cpu_data.x86_model >= 4)
wbinvd();
/* Flush all to work around Errata in early athlons regarding
* large page flushing.
*/
__flush_tlb_all();
}
static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
struct page *page;
unsigned long flags;
set_pte_atomic(kpte, pte); /* change init_mm */
if (SHARED_KERNEL_PMD)
return;
spin_lock_irqsave(&pgd_lock, flags);
for (page = pgd_list; page; page = (struct page *)page->index) {
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
pud = pud_offset(pgd, address);
pmd = pmd_offset(pud, address);
set_pte_atomic((pte_t *)pmd, pte);
}
spin_unlock_irqrestore(&pgd_lock, flags);
}
/*
* No more special protections in this 2/4MB area - revert to a
* large page again.
*/
static inline void revert_page(struct page *kpte_page, unsigned long address)
{
pgprot_t ref_prot;
pte_t *linear;
ref_prot =
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
linear = (pte_t *)
pmd_offset(pud_offset(pgd_offset_k(address), address), address);
set_pmd_pte(linear, address,
pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
ref_prot));
}
static inline void save_page(struct page *kpte_page)
{
if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
list_add(&kpte_page->lru, &df_list);
}
static int
__change_page_attr(struct page *page, pgprot_t prot)
{
pte_t *kpte;
unsigned long address;
struct page *kpte_page;
BUG_ON(PageHighMem(page));
address = (unsigned long)page_address(page);
kpte = lookup_address(address);
if (!kpte)
return -EINVAL;
kpte_page = virt_to_page(kpte);
BUG_ON(PageLRU(kpte_page));
BUG_ON(PageCompound(kpte_page));
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, prot));
} else {
pgprot_t ref_prot;
struct page *split;
ref_prot =
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
? PAGE_KERNEL_EXEC : PAGE_KERNEL;
split = split_large_page(address, prot, ref_prot);
if (!split)
return -ENOMEM;
set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
kpte_page = split;
}
page_private(kpte_page)++;
} else if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
BUG_ON(page_private(kpte_page) == 0);
page_private(kpte_page)--;
} else
BUG();
/*
* If the pte was reserved, it means it was created at boot
* time (not via split_large_page) and in turn we must not
* replace it with a largepage.
*/
save_page(kpte_page);
if (!PageReserved(kpte_page)) {
if (cpu_has_pse && (page_private(kpte_page) == 0)) {
paravirt_release_pt(page_to_pfn(kpte_page));
revert_page(kpte_page, address);
}
}
return 0;
}
static inline void flush_map(struct list_head *l)
{
on_each_cpu(flush_kernel_map, l, 1, 1);
}
/*
* Change the page attributes of an page in the linear mapping.
*
* This should be used when a page is mapped with a different caching policy
* than write-back somewhere - some CPUs do not like it when mappings with
* different caching policies exist. This changes the page attributes of the
* in kernel linear mapping too.
*
* The caller needs to ensure that there are no conflicting mappings elsewhere.
* This function only deals with the kernel linear map.
*
* Caller must call global_flush_tlb() after this.
*/
int change_page_attr(struct page *page, int numpages, pgprot_t prot)
{
int err = 0;
int i;
unsigned long flags;
spin_lock_irqsave(&cpa_lock, flags);
for (i = 0; i < numpages; i++, page++) {
err = __change_page_attr(page, prot);
if (err)
break;
}
spin_unlock_irqrestore(&cpa_lock, flags);
return err;
}
void global_flush_tlb(void)
{
struct list_head l;
struct page *pg, *next;
BUG_ON(irqs_disabled());
spin_lock_irq(&cpa_lock);
list_replace_init(&df_list, &l);
spin_unlock_irq(&cpa_lock);
flush_map(&l);
list_for_each_entry_safe(pg, next, &l, lru) {
list_del(&pg->lru);
clear_bit(PG_arch_1, &pg->flags);
if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
continue;
ClearPagePrivate(pg);
__free_page(pg);
}
}
#ifdef CONFIG_DEBUG_PAGEALLOC
void kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
return;
if (!enable)
debug_check_no_locks_freed(page_address(page),
numpages * PAGE_SIZE);
/* the return value is ignored - the calls cannot fail,
* large pages are disabled at boot time.
*/
change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
/* we should perform an IPI and flush all tlbs,
* but that can deadlock->flush only current cpu.
*/
__flush_tlb_all();
}
#endif
EXPORT_SYMBOL(change_page_attr);
EXPORT_SYMBOL(global_flush_tlb);

373
arch/x86/mm/pgtable_32.c Normal file
View File

@@ -0,0 +1,373 @@
/*
* linux/arch/i386/mm/pgtable.c
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
void show_mem(void)
{
int total = 0, reserved = 0;
int shared = 0, cached = 0;
int highmem = 0;
struct page *page;
pg_data_t *pgdat;
unsigned long i;
unsigned long flags;
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_online_pgdat(pgdat) {
pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
page = pgdat_page_nr(pgdat, i);
total++;
if (PageHighMem(page))
highmem++;
if (PageReserved(page))
reserved++;
else if (PageSwapCache(page))
cached++;
else if (page_count(page))
shared += page_count(page) - 1;
}
pgdat_resize_unlock(pgdat, &flags);
}
printk(KERN_INFO "%d pages of RAM\n", total);
printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
printk(KERN_INFO "%d reserved pages\n", reserved);
printk(KERN_INFO "%d pages shared\n", shared);
printk(KERN_INFO "%d pages swap cached\n", cached);
printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
printk(KERN_INFO "%lu pages writeback\n",
global_page_state(NR_WRITEBACK));
printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
printk(KERN_INFO "%lu pages slab\n",
global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE));
printk(KERN_INFO "%lu pages pagetables\n",
global_page_state(NR_PAGETABLE));
}
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
BUG();
return;
}
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
BUG();
return;
}
pte = pte_offset_kernel(pmd, vaddr);
if (pgprot_val(flags))
/* <pfn,flags> stored as-is, to permit clearing entries */
set_pte(pte, pfn_pte(pfn, flags));
else
pte_clear(&init_mm, vaddr, pte);
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
/*
* Associate a large virtual page frame with a given physical page frame
* and protection flags for that frame. pfn is for the base of the page,
* vaddr is what the page gets mapped to - both must be properly aligned.
* The pmd must already be instantiated. Assumes PAE mode.
*/
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
return; /* BUG(); */
}
if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
return; /* BUG(); */
}
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
return; /* BUG(); */
}
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
set_pmd(pmd, pfn_pmd(pfn, flags));
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
static int fixmaps;
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
fixmaps++;
}
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
*
* Can be used to relocate the fixmap area and poke a hole in the top
* of kernel address space to make room for a hypervisor.
*/
void reserve_top_address(unsigned long reserve)
{
BUG_ON(fixmaps > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
__VMALLOC_RESERVE += reserve;
}
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
#ifdef CONFIG_HIGHPTE
pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
return pte;
}
void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
{
memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
}
/*
* List of all pgd's needed for non-PAE so it can invalidate entries
* in both cached and uncached pgd's; not needed for PAE since the
* kernel pmd is shared. If PAE were not to share the pmd a similar
* tactic would be needed. This is essentially codepath-based locking
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
* -- wli
*/
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
page->index = (unsigned long)pgd_list;
if (pgd_list)
set_page_private(pgd_list, (unsigned long)&page->index);
pgd_list = page;
set_page_private(page, (unsigned long)&pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *next, **pprev, *page = virt_to_page(pgd);
next = (struct page *)page->index;
pprev = (struct page **)page_private(page);
*pprev = next;
if (next)
set_page_private(next, (unsigned long)pprev);
}
#if (PTRS_PER_PMD == 1)
/* Non-PAE pgd constructor */
static void pgd_ctor(void *pgd)
{
unsigned long flags;
/* !PAE, no pagetable sharing */
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
spin_lock_irqsave(&pgd_lock, flags);
/* must happen under lock */
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
KERNEL_PGD_PTRS);
paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
__pa(swapper_pg_dir) >> PAGE_SHIFT,
USER_PTRS_PER_PGD,
KERNEL_PGD_PTRS);
pgd_list_add(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
#else /* PTRS_PER_PMD > 1 */
/* PAE pgd constructor */
static void pgd_ctor(void *pgd)
{
/* PAE, kernel PMD may be shared */
if (SHARED_KERNEL_PMD) {
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
KERNEL_PGD_PTRS);
} else {
unsigned long flags;
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_add(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
}
#endif /* PTRS_PER_PMD */
static void pgd_dtor(void *pgd)
{
unsigned long flags; /* can be called from interrupt context */
if (SHARED_KERNEL_PMD)
return;
paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
/* If we allocate a pmd for part of the kernel address space, then
make sure its initialized with the appropriate kernel mappings.
Otherwise use a cached zeroed pmd. */
static pmd_t *pmd_cache_alloc(int idx)
{
pmd_t *pmd;
if (idx >= USER_PTRS_PER_PGD) {
pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
if (pmd)
memcpy(pmd,
(void *)pgd_page_vaddr(swapper_pg_dir[idx]),
sizeof(pmd_t) * PTRS_PER_PMD);
} else
pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
return pmd;
}
static void pmd_cache_free(pmd_t *pmd, int idx)
{
if (idx >= USER_PTRS_PER_PGD)
free_page((unsigned long)pmd);
else
kmem_cache_free(pmd_cache, pmd);
}
pgd_t *pgd_alloc(struct mm_struct *mm)
{
int i;
pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
if (PTRS_PER_PMD == 1 || !pgd)
return pgd;
for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
pmd_t *pmd = pmd_cache_alloc(i);
if (!pmd)
goto out_oom;
paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
}
return pgd;
out_oom:
for (i--; i >= 0; i--) {
pgd_t pgdent = pgd[i];
void* pmd = (void *)__va(pgd_val(pgdent)-1);
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
pmd_cache_free(pmd, i);
}
quicklist_free(0, pgd_dtor, pgd);
return NULL;
}
void pgd_free(pgd_t *pgd)
{
int i;
/* in the PAE case user pgd entries are overwritten before usage */
if (PTRS_PER_PMD > 1)
for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
pgd_t pgdent = pgd[i];
void* pmd = (void *)__va(pgd_val(pgdent)-1);
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
pmd_cache_free(pmd, i);
}
/* in the non-PAE case, free_pgtables() clears user pgd entries */
quicklist_free(0, pgd_dtor, pgd);
}
void check_pgt_cache(void)
{
quicklist_trim(0, pgd_dtor, 25, 16);
}