i386: move mm
Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
5
arch/x86/mm/Makefile
Normal file
5
arch/x86/mm/Makefile
Normal file
@@ -0,0 +1,5 @@
|
||||
ifeq ($(CONFIG_X86_32),y)
|
||||
include ${srctree}/arch/x86/mm/Makefile_32
|
||||
else
|
||||
include ${srctree}/arch/x86_64/mm/Makefile_64
|
||||
endif
|
10
arch/x86/mm/Makefile_32
Normal file
10
arch/x86/mm/Makefile_32
Normal file
@@ -0,0 +1,10 @@
|
||||
#
|
||||
# Makefile for the linux i386-specific parts of the memory manager.
|
||||
#
|
||||
|
||||
obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
|
||||
|
||||
obj-$(CONFIG_NUMA) += discontig_32.o
|
||||
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
|
||||
obj-$(CONFIG_HIGHMEM) += highmem_32.o
|
||||
obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
|
100
arch/x86/mm/boot_ioremap_32.c
Normal file
100
arch/x86/mm/boot_ioremap_32.c
Normal file
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* arch/i386/mm/boot_ioremap.c
|
||||
*
|
||||
* Re-map functions for early boot-time before paging_init() when the
|
||||
* boot-time pagetables are still in use
|
||||
*
|
||||
* Written by Dave Hansen <haveblue@us.ibm.com>
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
|
||||
* keeps that from happenning. If anyone has a better way, I'm listening.
|
||||
*
|
||||
* boot_pte_t is defined only if this all works correctly
|
||||
*/
|
||||
|
||||
#undef CONFIG_X86_PAE
|
||||
#undef CONFIG_PARAVIRT
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/stddef.h>
|
||||
|
||||
/*
|
||||
* I'm cheating here. It is known that the two boot PTE pages are
|
||||
* allocated next to each other. I'm pretending that they're just
|
||||
* one big array.
|
||||
*/
|
||||
|
||||
#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
|
||||
|
||||
static unsigned long boot_pte_index(unsigned long vaddr)
|
||||
{
|
||||
return __pa(vaddr) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static inline boot_pte_t* boot_vaddr_to_pte(void *address)
|
||||
{
|
||||
boot_pte_t* boot_pg = (boot_pte_t*)pg0;
|
||||
return &boot_pg[boot_pte_index((unsigned long)address)];
|
||||
}
|
||||
|
||||
/*
|
||||
* This is only for a caller who is clever enough to page-align
|
||||
* phys_addr and virtual_source, and who also has a preference
|
||||
* about which virtual address from which to steal ptes
|
||||
*/
|
||||
static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
|
||||
void* virtual_source)
|
||||
{
|
||||
boot_pte_t* pte;
|
||||
int i;
|
||||
char *vaddr = virtual_source;
|
||||
|
||||
pte = boot_vaddr_to_pte(virtual_source);
|
||||
for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
|
||||
set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
|
||||
__flush_tlb_one(&vaddr[i*PAGE_SIZE]);
|
||||
}
|
||||
}
|
||||
|
||||
/* the virtual space we're going to remap comes from this array */
|
||||
#define BOOT_IOREMAP_PAGES 4
|
||||
#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
|
||||
static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
|
||||
__attribute__ ((aligned (PAGE_SIZE)));
|
||||
|
||||
/*
|
||||
* This only applies to things which need to ioremap before paging_init()
|
||||
* bt_ioremap() and plain ioremap() are both useless at this point.
|
||||
*
|
||||
* When used, we're still using the boot-time pagetables, which only
|
||||
* have 2 PTE pages mapping the first 8MB
|
||||
*
|
||||
* There is no unmap. The boot-time PTE pages aren't used after boot.
|
||||
* If you really want the space back, just remap it yourself.
|
||||
* boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
|
||||
*/
|
||||
__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
|
||||
{
|
||||
unsigned long last_addr, offset;
|
||||
unsigned int nrpages;
|
||||
|
||||
last_addr = phys_addr + size - 1;
|
||||
|
||||
/* page align the requested address */
|
||||
offset = phys_addr & ~PAGE_MASK;
|
||||
phys_addr &= PAGE_MASK;
|
||||
size = PAGE_ALIGN(last_addr) - phys_addr;
|
||||
|
||||
nrpages = size >> PAGE_SHIFT;
|
||||
if (nrpages > BOOT_IOREMAP_PAGES)
|
||||
return NULL;
|
||||
|
||||
__boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
|
||||
|
||||
return &boot_ioremap_space[offset];
|
||||
}
|
431
arch/x86/mm/discontig_32.c
Normal file
431
arch/x86/mm/discontig_32.c
Normal file
@@ -0,0 +1,431 @@
|
||||
/*
|
||||
* Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
|
||||
* August 2002: added remote node KVA remap - Martin J. Bligh
|
||||
*
|
||||
* Copyright (C) 2002, IBM Corp.
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
#include <asm/e820.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/mmzone.h>
|
||||
#include <bios_ebda.h>
|
||||
|
||||
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
||||
EXPORT_SYMBOL(node_data);
|
||||
bootmem_data_t node0_bdata;
|
||||
|
||||
/*
|
||||
* numa interface - we expect the numa architecture specific code to have
|
||||
* populated the following initialisation.
|
||||
*
|
||||
* 1) node_online_map - the map of all nodes configured (online) in the system
|
||||
* 2) node_start_pfn - the starting page frame number for a node
|
||||
* 3) node_end_pfn - the ending page fram number for a node
|
||||
*/
|
||||
unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
|
||||
unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
|
||||
|
||||
|
||||
#ifdef CONFIG_DISCONTIGMEM
|
||||
/*
|
||||
* 4) physnode_map - the mapping between a pfn and owning node
|
||||
* physnode_map keeps track of the physical memory layout of a generic
|
||||
* numa node on a 256Mb break (each element of the array will
|
||||
* represent 256Mb of memory and will be marked by the node id. so,
|
||||
* if the first gig is on node 0, and the second gig is on node 1
|
||||
* physnode_map will contain:
|
||||
*
|
||||
* physnode_map[0-3] = 0;
|
||||
* physnode_map[4-7] = 1;
|
||||
* physnode_map[8- ] = -1;
|
||||
*/
|
||||
s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
|
||||
EXPORT_SYMBOL(physnode_map);
|
||||
|
||||
void memory_present(int nid, unsigned long start, unsigned long end)
|
||||
{
|
||||
unsigned long pfn;
|
||||
|
||||
printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
|
||||
nid, start, end);
|
||||
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
|
||||
printk(KERN_DEBUG " ");
|
||||
for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
|
||||
physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
|
||||
printk("%ld ", pfn);
|
||||
}
|
||||
printk("\n");
|
||||
}
|
||||
|
||||
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
unsigned long nr_pages = end_pfn - start_pfn;
|
||||
|
||||
if (!nr_pages)
|
||||
return 0;
|
||||
|
||||
return (nr_pages + 1) * sizeof(struct page);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern unsigned long find_max_low_pfn(void);
|
||||
extern void add_one_highpage_init(struct page *, int, int);
|
||||
extern unsigned long highend_pfn, highstart_pfn;
|
||||
|
||||
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
|
||||
|
||||
unsigned long node_remap_start_pfn[MAX_NUMNODES];
|
||||
unsigned long node_remap_size[MAX_NUMNODES];
|
||||
unsigned long node_remap_offset[MAX_NUMNODES];
|
||||
void *node_remap_start_vaddr[MAX_NUMNODES];
|
||||
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
|
||||
|
||||
void *node_remap_end_vaddr[MAX_NUMNODES];
|
||||
void *node_remap_alloc_vaddr[MAX_NUMNODES];
|
||||
static unsigned long kva_start_pfn;
|
||||
static unsigned long kva_pages;
|
||||
/*
|
||||
* FLAT - support for basic PC memory model with discontig enabled, essentially
|
||||
* a single node with all available processors in it with a flat
|
||||
* memory map.
|
||||
*/
|
||||
int __init get_memcfg_numa_flat(void)
|
||||
{
|
||||
printk("NUMA - single node, flat memory mode\n");
|
||||
|
||||
/* Run the memory configuration and find the top of memory. */
|
||||
find_max_pfn();
|
||||
node_start_pfn[0] = 0;
|
||||
node_end_pfn[0] = max_pfn;
|
||||
memory_present(0, 0, max_pfn);
|
||||
|
||||
/* Indicate there is one node available. */
|
||||
nodes_clear(node_online_map);
|
||||
node_set_online(0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the highest page frame number we have available for the node
|
||||
*/
|
||||
static void __init find_max_pfn_node(int nid)
|
||||
{
|
||||
if (node_end_pfn[nid] > max_pfn)
|
||||
node_end_pfn[nid] = max_pfn;
|
||||
/*
|
||||
* if a user has given mem=XXXX, then we need to make sure
|
||||
* that the node _starts_ before that, too, not just ends
|
||||
*/
|
||||
if (node_start_pfn[nid] > max_pfn)
|
||||
node_start_pfn[nid] = max_pfn;
|
||||
BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate memory for the pg_data_t for this node via a crude pre-bootmem
|
||||
* method. For node zero take this from the bottom of memory, for
|
||||
* subsequent nodes place them at node_remap_start_vaddr which contains
|
||||
* node local data in physically node local memory. See setup_memory()
|
||||
* for details.
|
||||
*/
|
||||
static void __init allocate_pgdat(int nid)
|
||||
{
|
||||
if (nid && node_has_online_mem(nid))
|
||||
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
|
||||
else {
|
||||
NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
|
||||
min_low_pfn += PFN_UP(sizeof(pg_data_t));
|
||||
}
|
||||
}
|
||||
|
||||
void *alloc_remap(int nid, unsigned long size)
|
||||
{
|
||||
void *allocation = node_remap_alloc_vaddr[nid];
|
||||
|
||||
size = ALIGN(size, L1_CACHE_BYTES);
|
||||
|
||||
if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
|
||||
return 0;
|
||||
|
||||
node_remap_alloc_vaddr[nid] += size;
|
||||
memset(allocation, 0, size);
|
||||
|
||||
return allocation;
|
||||
}
|
||||
|
||||
void __init remap_numa_kva(void)
|
||||
{
|
||||
void *vaddr;
|
||||
unsigned long pfn;
|
||||
int node;
|
||||
|
||||
for_each_online_node(node) {
|
||||
for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
|
||||
vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
|
||||
set_pmd_pfn((ulong) vaddr,
|
||||
node_remap_start_pfn[node] + pfn,
|
||||
PAGE_KERNEL_LARGE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long calculate_numa_remap_pages(void)
|
||||
{
|
||||
int nid;
|
||||
unsigned long size, reserve_pages = 0;
|
||||
unsigned long pfn;
|
||||
|
||||
for_each_online_node(nid) {
|
||||
unsigned old_end_pfn = node_end_pfn[nid];
|
||||
|
||||
/*
|
||||
* The acpi/srat node info can show hot-add memroy zones
|
||||
* where memory could be added but not currently present.
|
||||
*/
|
||||
if (node_start_pfn[nid] > max_pfn)
|
||||
continue;
|
||||
if (node_end_pfn[nid] > max_pfn)
|
||||
node_end_pfn[nid] = max_pfn;
|
||||
|
||||
/* ensure the remap includes space for the pgdat. */
|
||||
size = node_remap_size[nid] + sizeof(pg_data_t);
|
||||
|
||||
/* convert size to large (pmd size) pages, rounding up */
|
||||
size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
|
||||
/* now the roundup is correct, convert to PAGE_SIZE pages */
|
||||
size = size * PTRS_PER_PTE;
|
||||
|
||||
/*
|
||||
* Validate the region we are allocating only contains valid
|
||||
* pages.
|
||||
*/
|
||||
for (pfn = node_end_pfn[nid] - size;
|
||||
pfn < node_end_pfn[nid]; pfn++)
|
||||
if (!page_is_ram(pfn))
|
||||
break;
|
||||
|
||||
if (pfn != node_end_pfn[nid])
|
||||
size = 0;
|
||||
|
||||
printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
|
||||
size, nid);
|
||||
node_remap_size[nid] = size;
|
||||
node_remap_offset[nid] = reserve_pages;
|
||||
reserve_pages += size;
|
||||
printk("Shrinking node %d from %ld pages to %ld pages\n",
|
||||
nid, node_end_pfn[nid], node_end_pfn[nid] - size);
|
||||
|
||||
if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
|
||||
/*
|
||||
* Align node_end_pfn[] and node_remap_start_pfn[] to
|
||||
* pmd boundary. remap_numa_kva will barf otherwise.
|
||||
*/
|
||||
printk("Shrinking node %d further by %ld pages for proper alignment\n",
|
||||
nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
|
||||
size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
|
||||
}
|
||||
|
||||
node_end_pfn[nid] -= size;
|
||||
node_remap_start_pfn[nid] = node_end_pfn[nid];
|
||||
shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
|
||||
}
|
||||
printk("Reserving total of %ld pages for numa KVA remap\n",
|
||||
reserve_pages);
|
||||
return reserve_pages;
|
||||
}
|
||||
|
||||
extern void setup_bootmem_allocator(void);
|
||||
unsigned long __init setup_memory(void)
|
||||
{
|
||||
int nid;
|
||||
unsigned long system_start_pfn, system_max_low_pfn;
|
||||
|
||||
/*
|
||||
* When mapping a NUMA machine we allocate the node_mem_map arrays
|
||||
* from node local memory. They are then mapped directly into KVA
|
||||
* between zone normal and vmalloc space. Calculate the size of
|
||||
* this space and use it to adjust the boundry between ZONE_NORMAL
|
||||
* and ZONE_HIGHMEM.
|
||||
*/
|
||||
find_max_pfn();
|
||||
get_memcfg_numa();
|
||||
|
||||
kva_pages = calculate_numa_remap_pages();
|
||||
|
||||
/* partially used pages are not usable - thus round upwards */
|
||||
system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
|
||||
|
||||
kva_start_pfn = find_max_low_pfn() - kva_pages;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INITRD
|
||||
/* Numa kva area is below the initrd */
|
||||
if (LOADER_TYPE && INITRD_START)
|
||||
kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages;
|
||||
#endif
|
||||
kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
|
||||
|
||||
system_max_low_pfn = max_low_pfn = find_max_low_pfn();
|
||||
printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
|
||||
kva_start_pfn, max_low_pfn);
|
||||
printk("max_pfn = %ld\n", max_pfn);
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
highstart_pfn = highend_pfn = max_pfn;
|
||||
if (max_pfn > system_max_low_pfn)
|
||||
highstart_pfn = system_max_low_pfn;
|
||||
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
|
||||
pages_to_mb(highend_pfn - highstart_pfn));
|
||||
num_physpages = highend_pfn;
|
||||
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
|
||||
#else
|
||||
num_physpages = system_max_low_pfn;
|
||||
high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
|
||||
#endif
|
||||
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
|
||||
pages_to_mb(system_max_low_pfn));
|
||||
printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
|
||||
min_low_pfn, max_low_pfn, highstart_pfn);
|
||||
|
||||
printk("Low memory ends at vaddr %08lx\n",
|
||||
(ulong) pfn_to_kaddr(max_low_pfn));
|
||||
for_each_online_node(nid) {
|
||||
node_remap_start_vaddr[nid] = pfn_to_kaddr(
|
||||
kva_start_pfn + node_remap_offset[nid]);
|
||||
/* Init the node remap allocator */
|
||||
node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
|
||||
(node_remap_size[nid] * PAGE_SIZE);
|
||||
node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
|
||||
ALIGN(sizeof(pg_data_t), PAGE_SIZE);
|
||||
|
||||
allocate_pgdat(nid);
|
||||
printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
|
||||
(ulong) node_remap_start_vaddr[nid],
|
||||
(ulong) pfn_to_kaddr(highstart_pfn
|
||||
+ node_remap_offset[nid] + node_remap_size[nid]));
|
||||
}
|
||||
printk("High memory starts at vaddr %08lx\n",
|
||||
(ulong) pfn_to_kaddr(highstart_pfn));
|
||||
for_each_online_node(nid)
|
||||
find_max_pfn_node(nid);
|
||||
|
||||
memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
|
||||
NODE_DATA(0)->bdata = &node0_bdata;
|
||||
setup_bootmem_allocator();
|
||||
return max_low_pfn;
|
||||
}
|
||||
|
||||
void __init numa_kva_reserve(void)
|
||||
{
|
||||
reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
|
||||
}
|
||||
|
||||
void __init zone_sizes_init(void)
|
||||
{
|
||||
int nid;
|
||||
unsigned long max_zone_pfns[MAX_NR_ZONES];
|
||||
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
|
||||
max_zone_pfns[ZONE_DMA] =
|
||||
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
|
||||
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
|
||||
#endif
|
||||
|
||||
/* If SRAT has not registered memory, register it now */
|
||||
if (find_max_pfn_with_active_regions() == 0) {
|
||||
for_each_online_node(nid) {
|
||||
if (node_has_online_mem(nid))
|
||||
add_active_range(nid, node_start_pfn[nid],
|
||||
node_end_pfn[nid]);
|
||||
}
|
||||
}
|
||||
|
||||
free_area_init_nodes(max_zone_pfns);
|
||||
return;
|
||||
}
|
||||
|
||||
void __init set_highmem_pages_init(int bad_ppro)
|
||||
{
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
struct zone *zone;
|
||||
struct page *page;
|
||||
|
||||
for_each_zone(zone) {
|
||||
unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
|
||||
|
||||
if (!is_highmem(zone))
|
||||
continue;
|
||||
|
||||
zone_start_pfn = zone->zone_start_pfn;
|
||||
zone_end_pfn = zone_start_pfn + zone->spanned_pages;
|
||||
|
||||
printk("Initializing %s for node %d (%08lx:%08lx)\n",
|
||||
zone->name, zone_to_nid(zone),
|
||||
zone_start_pfn, zone_end_pfn);
|
||||
|
||||
for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
|
||||
if (!pfn_valid(node_pfn))
|
||||
continue;
|
||||
page = pfn_to_page(node_pfn);
|
||||
add_one_highpage_init(page, node_pfn, bad_ppro);
|
||||
}
|
||||
}
|
||||
totalram_pages += totalhigh_pages;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
int paddr_to_nid(u64 addr)
|
||||
{
|
||||
int nid;
|
||||
unsigned long pfn = PFN_DOWN(addr);
|
||||
|
||||
for_each_node(nid)
|
||||
if (node_start_pfn[nid] <= pfn &&
|
||||
pfn < node_end_pfn[nid])
|
||||
return nid;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is used to ask node id BEFORE memmap and mem_section's
|
||||
* initialization (pfn_to_nid() can't be used yet).
|
||||
* If _PXM is not defined on ACPI's DSDT, node id must be found by this.
|
||||
*/
|
||||
int memory_add_physaddr_to_nid(u64 addr)
|
||||
{
|
||||
int nid = paddr_to_nid(addr);
|
||||
return (nid >= 0) ? nid : 0;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
||||
#endif
|
35
arch/x86/mm/extable_32.c
Normal file
35
arch/x86/mm/extable_32.c
Normal file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* linux/arch/i386/mm/extable.c
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
int fixup_exception(struct pt_regs *regs)
|
||||
{
|
||||
const struct exception_table_entry *fixup;
|
||||
|
||||
#ifdef CONFIG_PNPBIOS
|
||||
if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
|
||||
{
|
||||
extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
|
||||
extern u32 pnp_bios_is_utter_crap;
|
||||
pnp_bios_is_utter_crap = 1;
|
||||
printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
|
||||
__asm__ volatile(
|
||||
"movl %0, %%esp\n\t"
|
||||
"jmp *%1\n\t"
|
||||
: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
|
||||
panic("do_trap: can't hit this");
|
||||
}
|
||||
#endif
|
||||
|
||||
fixup = search_exception_tables(regs->eip);
|
||||
if (fixup) {
|
||||
regs->eip = fixup->fixup;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
657
arch/x86/mm/fault_32.c
Normal file
657
arch/x86/mm/fault_32.c
Normal file
@@ -0,0 +1,657 @@
|
||||
/*
|
||||
* linux/arch/i386/mm/fault.c
|
||||
*
|
||||
* Copyright (C) 1995 Linus Torvalds
|
||||
*/
|
||||
|
||||
#include <linux/signal.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/tty.h>
|
||||
#include <linux/vt_kern.h> /* For unblank_screen() */
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/bootmem.h> /* for max_low_pfn */
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/kdebug.h>
|
||||
|
||||
#include <asm/system.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/segment.h>
|
||||
|
||||
extern void die(const char *,struct pt_regs *,long);
|
||||
|
||||
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
|
||||
|
||||
int register_page_fault_notifier(struct notifier_block *nb)
|
||||
{
|
||||
vmalloc_sync_all();
|
||||
return atomic_notifier_chain_register(¬ify_page_fault_chain, nb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_page_fault_notifier);
|
||||
|
||||
int unregister_page_fault_notifier(struct notifier_block *nb)
|
||||
{
|
||||
return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
|
||||
|
||||
static inline int notify_page_fault(struct pt_regs *regs, long err)
|
||||
{
|
||||
struct die_args args = {
|
||||
.regs = regs,
|
||||
.str = "page fault",
|
||||
.err = err,
|
||||
.trapnr = 14,
|
||||
.signr = SIGSEGV
|
||||
};
|
||||
return atomic_notifier_call_chain(¬ify_page_fault_chain,
|
||||
DIE_PAGE_FAULT, &args);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return EIP plus the CS segment base. The segment limit is also
|
||||
* adjusted, clamped to the kernel/user address space (whichever is
|
||||
* appropriate), and returned in *eip_limit.
|
||||
*
|
||||
* The segment is checked, because it might have been changed by another
|
||||
* task between the original faulting instruction and here.
|
||||
*
|
||||
* If CS is no longer a valid code segment, or if EIP is beyond the
|
||||
* limit, or if it is a kernel address when CS is not a kernel segment,
|
||||
* then the returned value will be greater than *eip_limit.
|
||||
*
|
||||
* This is slow, but is very rarely executed.
|
||||
*/
|
||||
static inline unsigned long get_segment_eip(struct pt_regs *regs,
|
||||
unsigned long *eip_limit)
|
||||
{
|
||||
unsigned long eip = regs->eip;
|
||||
unsigned seg = regs->xcs & 0xffff;
|
||||
u32 seg_ar, seg_limit, base, *desc;
|
||||
|
||||
/* Unlikely, but must come before segment checks. */
|
||||
if (unlikely(regs->eflags & VM_MASK)) {
|
||||
base = seg << 4;
|
||||
*eip_limit = base + 0xffff;
|
||||
return base + (eip & 0xffff);
|
||||
}
|
||||
|
||||
/* The standard kernel/user address space limit. */
|
||||
*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
|
||||
|
||||
/* By far the most common cases. */
|
||||
if (likely(SEGMENT_IS_FLAT_CODE(seg)))
|
||||
return eip;
|
||||
|
||||
/* Check the segment exists, is within the current LDT/GDT size,
|
||||
that kernel/user (ring 0..3) has the appropriate privilege,
|
||||
that it's a code segment, and get the limit. */
|
||||
__asm__ ("larl %3,%0; lsll %3,%1"
|
||||
: "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
|
||||
if ((~seg_ar & 0x9800) || eip > seg_limit) {
|
||||
*eip_limit = 0;
|
||||
return 1; /* So that returned eip > *eip_limit. */
|
||||
}
|
||||
|
||||
/* Get the GDT/LDT descriptor base.
|
||||
When you look for races in this code remember that
|
||||
LDT and other horrors are only used in user space. */
|
||||
if (seg & (1<<2)) {
|
||||
/* Must lock the LDT while reading it. */
|
||||
down(¤t->mm->context.sem);
|
||||
desc = current->mm->context.ldt;
|
||||
desc = (void *)desc + (seg & ~7);
|
||||
} else {
|
||||
/* Must disable preemption while reading the GDT. */
|
||||
desc = (u32 *)get_cpu_gdt_table(get_cpu());
|
||||
desc = (void *)desc + (seg & ~7);
|
||||
}
|
||||
|
||||
/* Decode the code segment base from the descriptor */
|
||||
base = get_desc_base((unsigned long *)desc);
|
||||
|
||||
if (seg & (1<<2)) {
|
||||
up(¤t->mm->context.sem);
|
||||
} else
|
||||
put_cpu();
|
||||
|
||||
/* Adjust EIP and segment limit, and clamp at the kernel limit.
|
||||
It's legitimate for segments to wrap at 0xffffffff. */
|
||||
seg_limit += base;
|
||||
if (seg_limit < *eip_limit && seg_limit >= base)
|
||||
*eip_limit = seg_limit;
|
||||
return eip + base;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
|
||||
* Check that here and ignore it.
|
||||
*/
|
||||
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
|
||||
{
|
||||
unsigned long limit;
|
||||
unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
|
||||
int scan_more = 1;
|
||||
int prefetch = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; scan_more && i < 15; i++) {
|
||||
unsigned char opcode;
|
||||
unsigned char instr_hi;
|
||||
unsigned char instr_lo;
|
||||
|
||||
if (instr > (unsigned char *)limit)
|
||||
break;
|
||||
if (probe_kernel_address(instr, opcode))
|
||||
break;
|
||||
|
||||
instr_hi = opcode & 0xf0;
|
||||
instr_lo = opcode & 0x0f;
|
||||
instr++;
|
||||
|
||||
switch (instr_hi) {
|
||||
case 0x20:
|
||||
case 0x30:
|
||||
/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
|
||||
scan_more = ((instr_lo & 7) == 0x6);
|
||||
break;
|
||||
|
||||
case 0x60:
|
||||
/* 0x64 thru 0x67 are valid prefixes in all modes. */
|
||||
scan_more = (instr_lo & 0xC) == 0x4;
|
||||
break;
|
||||
case 0xF0:
|
||||
/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
|
||||
scan_more = !instr_lo || (instr_lo>>1) == 1;
|
||||
break;
|
||||
case 0x00:
|
||||
/* Prefetch instruction is 0x0F0D or 0x0F18 */
|
||||
scan_more = 0;
|
||||
if (instr > (unsigned char *)limit)
|
||||
break;
|
||||
if (probe_kernel_address(instr, opcode))
|
||||
break;
|
||||
prefetch = (instr_lo == 0xF) &&
|
||||
(opcode == 0x0D || opcode == 0x18);
|
||||
break;
|
||||
default:
|
||||
scan_more = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return prefetch;
|
||||
}
|
||||
|
||||
static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
|
||||
unsigned long error_code)
|
||||
{
|
||||
if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
|
||||
boot_cpu_data.x86 >= 6)) {
|
||||
/* Catch an obscure case of prefetch inside an NX page. */
|
||||
if (nx_enabled && (error_code & 16))
|
||||
return 0;
|
||||
return __is_prefetch(regs, addr);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static noinline void force_sig_info_fault(int si_signo, int si_code,
|
||||
unsigned long address, struct task_struct *tsk)
|
||||
{
|
||||
siginfo_t info;
|
||||
|
||||
info.si_signo = si_signo;
|
||||
info.si_errno = 0;
|
||||
info.si_code = si_code;
|
||||
info.si_addr = (void __user *)address;
|
||||
force_sig_info(si_signo, &info, tsk);
|
||||
}
|
||||
|
||||
fastcall void do_invalid_op(struct pt_regs *, unsigned long);
|
||||
|
||||
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
|
||||
{
|
||||
unsigned index = pgd_index(address);
|
||||
pgd_t *pgd_k;
|
||||
pud_t *pud, *pud_k;
|
||||
pmd_t *pmd, *pmd_k;
|
||||
|
||||
pgd += index;
|
||||
pgd_k = init_mm.pgd + index;
|
||||
|
||||
if (!pgd_present(*pgd_k))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* set_pgd(pgd, *pgd_k); here would be useless on PAE
|
||||
* and redundant with the set_pmd() on non-PAE. As would
|
||||
* set_pud.
|
||||
*/
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
pud_k = pud_offset(pgd_k, address);
|
||||
if (!pud_present(*pud_k))
|
||||
return NULL;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
pmd_k = pmd_offset(pud_k, address);
|
||||
if (!pmd_present(*pmd_k))
|
||||
return NULL;
|
||||
if (!pmd_present(*pmd)) {
|
||||
set_pmd(pmd, *pmd_k);
|
||||
arch_flush_lazy_mmu_mode();
|
||||
} else
|
||||
BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
|
||||
return pmd_k;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle a fault on the vmalloc or module mapping area
|
||||
*
|
||||
* This assumes no large pages in there.
|
||||
*/
|
||||
static inline int vmalloc_fault(unsigned long address)
|
||||
{
|
||||
unsigned long pgd_paddr;
|
||||
pmd_t *pmd_k;
|
||||
pte_t *pte_k;
|
||||
/*
|
||||
* Synchronize this task's top level page-table
|
||||
* with the 'reference' page table.
|
||||
*
|
||||
* Do _not_ use "current" here. We might be inside
|
||||
* an interrupt in the middle of a task switch..
|
||||
*/
|
||||
pgd_paddr = read_cr3();
|
||||
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
|
||||
if (!pmd_k)
|
||||
return -1;
|
||||
pte_k = pte_offset_kernel(pmd_k, address);
|
||||
if (!pte_present(*pte_k))
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int show_unhandled_signals = 1;
|
||||
|
||||
/*
|
||||
* This routine handles page faults. It determines the address,
|
||||
* and the problem, and then passes it off to one of the appropriate
|
||||
* routines.
|
||||
*
|
||||
* error_code:
|
||||
* bit 0 == 0 means no page found, 1 means protection fault
|
||||
* bit 1 == 0 means read, 1 means write
|
||||
* bit 2 == 0 means kernel, 1 means user-mode
|
||||
* bit 3 == 1 means use of reserved bit detected
|
||||
* bit 4 == 1 means fault was an instruction fetch
|
||||
*/
|
||||
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
|
||||
unsigned long error_code)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm;
|
||||
struct vm_area_struct * vma;
|
||||
unsigned long address;
|
||||
int write, si_code;
|
||||
int fault;
|
||||
|
||||
/* get the address */
|
||||
address = read_cr2();
|
||||
|
||||
tsk = current;
|
||||
|
||||
si_code = SEGV_MAPERR;
|
||||
|
||||
/*
|
||||
* We fault-in kernel-space virtual memory on-demand. The
|
||||
* 'reference' page table is init_mm.pgd.
|
||||
*
|
||||
* NOTE! We MUST NOT take any locks for this case. We may
|
||||
* be in an interrupt or a critical region, and should
|
||||
* only copy the information from the master page table,
|
||||
* nothing more.
|
||||
*
|
||||
* This verifies that the fault happens in kernel space
|
||||
* (error_code & 4) == 0, and that the fault was not a
|
||||
* protection error (error_code & 9) == 0.
|
||||
*/
|
||||
if (unlikely(address >= TASK_SIZE)) {
|
||||
if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
|
||||
return;
|
||||
if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
|
||||
return;
|
||||
/*
|
||||
* Don't take the mm semaphore here. If we fixup a prefetch
|
||||
* fault we could otherwise deadlock.
|
||||
*/
|
||||
goto bad_area_nosemaphore;
|
||||
}
|
||||
|
||||
if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
|
||||
return;
|
||||
|
||||
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
|
||||
fault has been handled. */
|
||||
if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
|
||||
local_irq_enable();
|
||||
|
||||
mm = tsk->mm;
|
||||
|
||||
/*
|
||||
* If we're in an interrupt, have no user context or are running in an
|
||||
* atomic region then we must not take the fault..
|
||||
*/
|
||||
if (in_atomic() || !mm)
|
||||
goto bad_area_nosemaphore;
|
||||
|
||||
/* When running in the kernel we expect faults to occur only to
|
||||
* addresses in user space. All other faults represent errors in the
|
||||
* kernel and should generate an OOPS. Unfortunatly, in the case of an
|
||||
* erroneous fault occurring in a code path which already holds mmap_sem
|
||||
* we will deadlock attempting to validate the fault against the
|
||||
* address space. Luckily the kernel only validly references user
|
||||
* space from well defined areas of code, which are listed in the
|
||||
* exceptions table.
|
||||
*
|
||||
* As the vast majority of faults will be valid we will only perform
|
||||
* the source reference check when there is a possibilty of a deadlock.
|
||||
* Attempt to lock the address space, if we cannot we then validate the
|
||||
* source. If this is invalid we can skip the address space check,
|
||||
* thus avoiding the deadlock.
|
||||
*/
|
||||
if (!down_read_trylock(&mm->mmap_sem)) {
|
||||
if ((error_code & 4) == 0 &&
|
||||
!search_exception_tables(regs->eip))
|
||||
goto bad_area_nosemaphore;
|
||||
down_read(&mm->mmap_sem);
|
||||
}
|
||||
|
||||
vma = find_vma(mm, address);
|
||||
if (!vma)
|
||||
goto bad_area;
|
||||
if (vma->vm_start <= address)
|
||||
goto good_area;
|
||||
if (!(vma->vm_flags & VM_GROWSDOWN))
|
||||
goto bad_area;
|
||||
if (error_code & 4) {
|
||||
/*
|
||||
* Accessing the stack below %esp is always a bug.
|
||||
* The large cushion allows instructions like enter
|
||||
* and pusha to work. ("enter $65535,$31" pushes
|
||||
* 32 pointers and then decrements %esp by 65535.)
|
||||
*/
|
||||
if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
|
||||
goto bad_area;
|
||||
}
|
||||
if (expand_stack(vma, address))
|
||||
goto bad_area;
|
||||
/*
|
||||
* Ok, we have a good vm_area for this memory access, so
|
||||
* we can handle it..
|
||||
*/
|
||||
good_area:
|
||||
si_code = SEGV_ACCERR;
|
||||
write = 0;
|
||||
switch (error_code & 3) {
|
||||
default: /* 3: write, present */
|
||||
/* fall through */
|
||||
case 2: /* write, not present */
|
||||
if (!(vma->vm_flags & VM_WRITE))
|
||||
goto bad_area;
|
||||
write++;
|
||||
break;
|
||||
case 1: /* read, present */
|
||||
goto bad_area;
|
||||
case 0: /* read, not present */
|
||||
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
|
||||
goto bad_area;
|
||||
}
|
||||
|
||||
survive:
|
||||
/*
|
||||
* If for any reason at all we couldn't handle the fault,
|
||||
* make sure we exit gracefully rather than endlessly redo
|
||||
* the fault.
|
||||
*/
|
||||
fault = handle_mm_fault(mm, vma, address, write);
|
||||
if (unlikely(fault & VM_FAULT_ERROR)) {
|
||||
if (fault & VM_FAULT_OOM)
|
||||
goto out_of_memory;
|
||||
else if (fault & VM_FAULT_SIGBUS)
|
||||
goto do_sigbus;
|
||||
BUG();
|
||||
}
|
||||
if (fault & VM_FAULT_MAJOR)
|
||||
tsk->maj_flt++;
|
||||
else
|
||||
tsk->min_flt++;
|
||||
|
||||
/*
|
||||
* Did it hit the DOS screen memory VA from vm86 mode?
|
||||
*/
|
||||
if (regs->eflags & VM_MASK) {
|
||||
unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
|
||||
if (bit < 32)
|
||||
tsk->thread.screen_bitmap |= 1 << bit;
|
||||
}
|
||||
up_read(&mm->mmap_sem);
|
||||
return;
|
||||
|
||||
/*
|
||||
* Something tried to access memory that isn't in our memory map..
|
||||
* Fix it, but check if it's kernel or user first..
|
||||
*/
|
||||
bad_area:
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
bad_area_nosemaphore:
|
||||
/* User mode accesses just cause a SIGSEGV */
|
||||
if (error_code & 4) {
|
||||
/*
|
||||
* It's possible to have interrupts off here.
|
||||
*/
|
||||
local_irq_enable();
|
||||
|
||||
/*
|
||||
* Valid to do another page fault here because this one came
|
||||
* from user space.
|
||||
*/
|
||||
if (is_prefetch(regs, address, error_code))
|
||||
return;
|
||||
|
||||
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
|
||||
printk_ratelimit()) {
|
||||
printk("%s%s[%d]: segfault at %08lx eip %08lx "
|
||||
"esp %08lx error %lx\n",
|
||||
tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
|
||||
tsk->comm, tsk->pid, address, regs->eip,
|
||||
regs->esp, error_code);
|
||||
}
|
||||
tsk->thread.cr2 = address;
|
||||
/* Kernel addresses are always protection faults */
|
||||
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
|
||||
tsk->thread.trap_no = 14;
|
||||
force_sig_info_fault(SIGSEGV, si_code, address, tsk);
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_F00F_BUG
|
||||
/*
|
||||
* Pentium F0 0F C7 C8 bug workaround.
|
||||
*/
|
||||
if (boot_cpu_data.f00f_bug) {
|
||||
unsigned long nr;
|
||||
|
||||
nr = (address - idt_descr.address) >> 3;
|
||||
|
||||
if (nr == 6) {
|
||||
do_invalid_op(regs, 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
no_context:
|
||||
/* Are we prepared to handle this kernel fault? */
|
||||
if (fixup_exception(regs))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Valid to do another page fault here, because if this fault
|
||||
* had been triggered by is_prefetch fixup_exception would have
|
||||
* handled it.
|
||||
*/
|
||||
if (is_prefetch(regs, address, error_code))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Oops. The kernel tried to access some bad page. We'll have to
|
||||
* terminate things with extreme prejudice.
|
||||
*/
|
||||
|
||||
bust_spinlocks(1);
|
||||
|
||||
if (oops_may_print()) {
|
||||
__typeof__(pte_val(__pte(0))) page;
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
if (error_code & 16) {
|
||||
pte_t *pte = lookup_address(address);
|
||||
|
||||
if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
|
||||
printk(KERN_CRIT "kernel tried to execute "
|
||||
"NX-protected page - exploit attempt? "
|
||||
"(uid: %d)\n", current->uid);
|
||||
}
|
||||
#endif
|
||||
if (address < PAGE_SIZE)
|
||||
printk(KERN_ALERT "BUG: unable to handle kernel NULL "
|
||||
"pointer dereference");
|
||||
else
|
||||
printk(KERN_ALERT "BUG: unable to handle kernel paging"
|
||||
" request");
|
||||
printk(" at virtual address %08lx\n",address);
|
||||
printk(KERN_ALERT " printing eip:\n");
|
||||
printk("%08lx\n", regs->eip);
|
||||
|
||||
page = read_cr3();
|
||||
page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
|
||||
#ifdef CONFIG_X86_PAE
|
||||
printk(KERN_ALERT "*pdpt = %016Lx\n", page);
|
||||
if ((page >> PAGE_SHIFT) < max_low_pfn
|
||||
&& page & _PAGE_PRESENT) {
|
||||
page &= PAGE_MASK;
|
||||
page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
|
||||
& (PTRS_PER_PMD - 1)];
|
||||
printk(KERN_ALERT "*pde = %016Lx\n", page);
|
||||
page &= ~_PAGE_NX;
|
||||
}
|
||||
#else
|
||||
printk(KERN_ALERT "*pde = %08lx\n", page);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We must not directly access the pte in the highpte
|
||||
* case if the page table is located in highmem.
|
||||
* And let's rather not kmap-atomic the pte, just in case
|
||||
* it's allocated already.
|
||||
*/
|
||||
if ((page >> PAGE_SHIFT) < max_low_pfn
|
||||
&& (page & _PAGE_PRESENT)) {
|
||||
page &= PAGE_MASK;
|
||||
page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
|
||||
& (PTRS_PER_PTE - 1)];
|
||||
printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
|
||||
}
|
||||
}
|
||||
|
||||
tsk->thread.cr2 = address;
|
||||
tsk->thread.trap_no = 14;
|
||||
tsk->thread.error_code = error_code;
|
||||
die("Oops", regs, error_code);
|
||||
bust_spinlocks(0);
|
||||
do_exit(SIGKILL);
|
||||
|
||||
/*
|
||||
* We ran out of memory, or some other thing happened to us that made
|
||||
* us unable to handle the page fault gracefully.
|
||||
*/
|
||||
out_of_memory:
|
||||
up_read(&mm->mmap_sem);
|
||||
if (is_init(tsk)) {
|
||||
yield();
|
||||
down_read(&mm->mmap_sem);
|
||||
goto survive;
|
||||
}
|
||||
printk("VM: killing process %s\n", tsk->comm);
|
||||
if (error_code & 4)
|
||||
do_exit(SIGKILL);
|
||||
goto no_context;
|
||||
|
||||
do_sigbus:
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
/* Kernel mode? Handle exceptions or die */
|
||||
if (!(error_code & 4))
|
||||
goto no_context;
|
||||
|
||||
/* User space => ok to do another page fault */
|
||||
if (is_prefetch(regs, address, error_code))
|
||||
return;
|
||||
|
||||
tsk->thread.cr2 = address;
|
||||
tsk->thread.error_code = error_code;
|
||||
tsk->thread.trap_no = 14;
|
||||
force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
|
||||
}
|
||||
|
||||
void vmalloc_sync_all(void)
|
||||
{
|
||||
/*
|
||||
* Note that races in the updates of insync and start aren't
|
||||
* problematic: insync can only get set bits added, and updates to
|
||||
* start are only improving performance (without affecting correctness
|
||||
* if undone).
|
||||
*/
|
||||
static DECLARE_BITMAP(insync, PTRS_PER_PGD);
|
||||
static unsigned long start = TASK_SIZE;
|
||||
unsigned long address;
|
||||
|
||||
if (SHARED_KERNEL_PMD)
|
||||
return;
|
||||
|
||||
BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
|
||||
for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
|
||||
if (!test_bit(pgd_index(address), insync)) {
|
||||
unsigned long flags;
|
||||
struct page *page;
|
||||
|
||||
spin_lock_irqsave(&pgd_lock, flags);
|
||||
for (page = pgd_list; page; page =
|
||||
(struct page *)page->index)
|
||||
if (!vmalloc_sync_one(page_address(page),
|
||||
address)) {
|
||||
BUG_ON(page != pgd_list);
|
||||
break;
|
||||
}
|
||||
spin_unlock_irqrestore(&pgd_lock, flags);
|
||||
if (!page)
|
||||
set_bit(pgd_index(address), insync);
|
||||
}
|
||||
if (address == start && test_bit(pgd_index(address), insync))
|
||||
start = address + PGDIR_SIZE;
|
||||
}
|
||||
}
|
113
arch/x86/mm/highmem_32.c
Normal file
113
arch/x86/mm/highmem_32.c
Normal file
@@ -0,0 +1,113 @@
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
void *kmap(struct page *page)
|
||||
{
|
||||
might_sleep();
|
||||
if (!PageHighMem(page))
|
||||
return page_address(page);
|
||||
return kmap_high(page);
|
||||
}
|
||||
|
||||
void kunmap(struct page *page)
|
||||
{
|
||||
if (in_interrupt())
|
||||
BUG();
|
||||
if (!PageHighMem(page))
|
||||
return;
|
||||
kunmap_high(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
|
||||
* no global lock is needed and because the kmap code must perform a global TLB
|
||||
* invalidation when the kmap pool wraps.
|
||||
*
|
||||
* However when holding an atomic kmap is is not legal to sleep, so atomic
|
||||
* kmaps are appropriate for short, tight code paths only.
|
||||
*/
|
||||
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
|
||||
{
|
||||
enum fixed_addresses idx;
|
||||
unsigned long vaddr;
|
||||
|
||||
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
|
||||
pagefault_disable();
|
||||
|
||||
if (!PageHighMem(page))
|
||||
return page_address(page);
|
||||
|
||||
idx = type + KM_TYPE_NR*smp_processor_id();
|
||||
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
|
||||
BUG_ON(!pte_none(*(kmap_pte-idx)));
|
||||
set_pte(kmap_pte-idx, mk_pte(page, prot));
|
||||
arch_flush_lazy_mmu_mode();
|
||||
|
||||
return (void *)vaddr;
|
||||
}
|
||||
|
||||
void *kmap_atomic(struct page *page, enum km_type type)
|
||||
{
|
||||
return kmap_atomic_prot(page, type, kmap_prot);
|
||||
}
|
||||
|
||||
void kunmap_atomic(void *kvaddr, enum km_type type)
|
||||
{
|
||||
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
|
||||
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
|
||||
|
||||
/*
|
||||
* Force other mappings to Oops if they'll try to access this pte
|
||||
* without first remap it. Keeping stale mappings around is a bad idea
|
||||
* also, in case the page changes cacheability attributes or becomes
|
||||
* a protected page in a hypervisor.
|
||||
*/
|
||||
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
|
||||
kpte_clear_flush(kmap_pte-idx, vaddr);
|
||||
else {
|
||||
#ifdef CONFIG_DEBUG_HIGHMEM
|
||||
BUG_ON(vaddr < PAGE_OFFSET);
|
||||
BUG_ON(vaddr >= (unsigned long)high_memory);
|
||||
#endif
|
||||
}
|
||||
|
||||
arch_flush_lazy_mmu_mode();
|
||||
pagefault_enable();
|
||||
}
|
||||
|
||||
/* This is the same as kmap_atomic() but can map memory that doesn't
|
||||
* have a struct page associated with it.
|
||||
*/
|
||||
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
|
||||
{
|
||||
enum fixed_addresses idx;
|
||||
unsigned long vaddr;
|
||||
|
||||
pagefault_disable();
|
||||
|
||||
idx = type + KM_TYPE_NR*smp_processor_id();
|
||||
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
|
||||
set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
|
||||
arch_flush_lazy_mmu_mode();
|
||||
|
||||
return (void*) vaddr;
|
||||
}
|
||||
|
||||
struct page *kmap_atomic_to_page(void *ptr)
|
||||
{
|
||||
unsigned long idx, vaddr = (unsigned long)ptr;
|
||||
pte_t *pte;
|
||||
|
||||
if (vaddr < FIXADDR_START)
|
||||
return virt_to_page(ptr);
|
||||
|
||||
idx = virt_to_fix(vaddr);
|
||||
pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
|
||||
return pte_page(*pte);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(kmap);
|
||||
EXPORT_SYMBOL(kunmap);
|
||||
EXPORT_SYMBOL(kmap_atomic);
|
||||
EXPORT_SYMBOL(kunmap_atomic);
|
||||
EXPORT_SYMBOL(kmap_atomic_to_page);
|
391
arch/x86/mm/hugetlbpage.c
Normal file
391
arch/x86/mm/hugetlbpage.c
Normal file
@@ -0,0 +1,391 @@
|
||||
/*
|
||||
* IA-32 Huge TLB Page Support for Kernel.
|
||||
*
|
||||
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
|
||||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <asm/mman.h>
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
static unsigned long page_table_shareable(struct vm_area_struct *svma,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr, pgoff_t idx)
|
||||
{
|
||||
unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
|
||||
svma->vm_start;
|
||||
unsigned long sbase = saddr & PUD_MASK;
|
||||
unsigned long s_end = sbase + PUD_SIZE;
|
||||
|
||||
/*
|
||||
* match the virtual addresses, permission and the alignment of the
|
||||
* page table page.
|
||||
*/
|
||||
if (pmd_index(addr) != pmd_index(saddr) ||
|
||||
vma->vm_flags != svma->vm_flags ||
|
||||
sbase < svma->vm_start || svma->vm_end < s_end)
|
||||
return 0;
|
||||
|
||||
return saddr;
|
||||
}
|
||||
|
||||
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
unsigned long base = addr & PUD_MASK;
|
||||
unsigned long end = base + PUD_SIZE;
|
||||
|
||||
/*
|
||||
* check on proper vm_flags and page table alignment
|
||||
*/
|
||||
if (vma->vm_flags & VM_MAYSHARE &&
|
||||
vma->vm_start <= base && end <= vma->vm_end)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* search for a shareable pmd page for hugetlb.
|
||||
*/
|
||||
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
|
||||
{
|
||||
struct vm_area_struct *vma = find_vma(mm, addr);
|
||||
struct address_space *mapping = vma->vm_file->f_mapping;
|
||||
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
|
||||
vma->vm_pgoff;
|
||||
struct prio_tree_iter iter;
|
||||
struct vm_area_struct *svma;
|
||||
unsigned long saddr;
|
||||
pte_t *spte = NULL;
|
||||
|
||||
if (!vma_shareable(vma, addr))
|
||||
return;
|
||||
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
|
||||
if (svma == vma)
|
||||
continue;
|
||||
|
||||
saddr = page_table_shareable(svma, vma, addr, idx);
|
||||
if (saddr) {
|
||||
spte = huge_pte_offset(svma->vm_mm, saddr);
|
||||
if (spte) {
|
||||
get_page(virt_to_page(spte));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!spte)
|
||||
goto out;
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (pud_none(*pud))
|
||||
pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
|
||||
else
|
||||
put_page(virt_to_page(spte));
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
out:
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* unmap huge page backed by shared pte.
|
||||
*
|
||||
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
|
||||
* indicated by page_count > 1, unmap is achieved by clearing pud and
|
||||
* decrementing the ref count. If count == 1, the pte page is not shared.
|
||||
*
|
||||
* called with vma->vm_mm->page_table_lock held.
|
||||
*
|
||||
* returns: 1 successfully unmapped a shared pte page
|
||||
* 0 the underlying pte page is not shared, or it is the last user
|
||||
*/
|
||||
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
|
||||
{
|
||||
pgd_t *pgd = pgd_offset(mm, *addr);
|
||||
pud_t *pud = pud_offset(pgd, *addr);
|
||||
|
||||
BUG_ON(page_count(virt_to_page(ptep)) == 0);
|
||||
if (page_count(virt_to_page(ptep)) == 1)
|
||||
return 0;
|
||||
|
||||
pud_clear(pud);
|
||||
put_page(virt_to_page(ptep));
|
||||
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
|
||||
return 1;
|
||||
}
|
||||
|
||||
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pte_t *pte = NULL;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
pud = pud_alloc(mm, pgd, addr);
|
||||
if (pud) {
|
||||
if (pud_none(*pud))
|
||||
huge_pmd_share(mm, addr, pud);
|
||||
pte = (pte_t *) pmd_alloc(mm, pud, addr);
|
||||
}
|
||||
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd = NULL;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (pgd_present(*pgd)) {
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (pud_present(*pud))
|
||||
pmd = pmd_offset(pud, addr);
|
||||
}
|
||||
return (pte_t *) pmd;
|
||||
}
|
||||
|
||||
#if 0 /* This is just for testing */
|
||||
struct page *
|
||||
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
||||
{
|
||||
unsigned long start = address;
|
||||
int length = 1;
|
||||
int nr;
|
||||
struct page *page;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
vma = find_vma(mm, addr);
|
||||
if (!vma || !is_vm_hugetlb_page(vma))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
pte = huge_pte_offset(mm, address);
|
||||
|
||||
/* hugetlb should be locked, and hence, prefaulted */
|
||||
WARN_ON(!pte || pte_none(*pte));
|
||||
|
||||
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
|
||||
|
||||
WARN_ON(!PageCompound(page));
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
int pmd_huge(pmd_t pmd)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct page *
|
||||
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
||||
pmd_t *pmd, int write)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
struct page *
|
||||
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
||||
{
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
int pmd_huge(pmd_t pmd)
|
||||
{
|
||||
return !!(pmd_val(pmd) & _PAGE_PSE);
|
||||
}
|
||||
|
||||
struct page *
|
||||
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
||||
pmd_t *pmd, int write)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
page = pte_page(*(pte_t *)pmd);
|
||||
if (page)
|
||||
page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
|
||||
return page;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* x86_64 also uses this file */
|
||||
|
||||
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
|
||||
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
|
||||
unsigned long addr, unsigned long len,
|
||||
unsigned long pgoff, unsigned long flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long start_addr;
|
||||
|
||||
if (len > mm->cached_hole_size) {
|
||||
start_addr = mm->free_area_cache;
|
||||
} else {
|
||||
start_addr = TASK_UNMAPPED_BASE;
|
||||
mm->cached_hole_size = 0;
|
||||
}
|
||||
|
||||
full_search:
|
||||
addr = ALIGN(start_addr, HPAGE_SIZE);
|
||||
|
||||
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
|
||||
/* At this point: (!vma || addr < vma->vm_end). */
|
||||
if (TASK_SIZE - len < addr) {
|
||||
/*
|
||||
* Start a new search - just in case we missed
|
||||
* some holes.
|
||||
*/
|
||||
if (start_addr != TASK_UNMAPPED_BASE) {
|
||||
start_addr = TASK_UNMAPPED_BASE;
|
||||
mm->cached_hole_size = 0;
|
||||
goto full_search;
|
||||
}
|
||||
return -ENOMEM;
|
||||
}
|
||||
if (!vma || addr + len <= vma->vm_start) {
|
||||
mm->free_area_cache = addr + len;
|
||||
return addr;
|
||||
}
|
||||
if (addr + mm->cached_hole_size < vma->vm_start)
|
||||
mm->cached_hole_size = vma->vm_start - addr;
|
||||
addr = ALIGN(vma->vm_end, HPAGE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
|
||||
unsigned long addr0, unsigned long len,
|
||||
unsigned long pgoff, unsigned long flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma, *prev_vma;
|
||||
unsigned long base = mm->mmap_base, addr = addr0;
|
||||
unsigned long largest_hole = mm->cached_hole_size;
|
||||
int first_time = 1;
|
||||
|
||||
/* don't allow allocations above current base */
|
||||
if (mm->free_area_cache > base)
|
||||
mm->free_area_cache = base;
|
||||
|
||||
if (len <= largest_hole) {
|
||||
largest_hole = 0;
|
||||
mm->free_area_cache = base;
|
||||
}
|
||||
try_again:
|
||||
/* make sure it can fit in the remaining address space */
|
||||
if (mm->free_area_cache < len)
|
||||
goto fail;
|
||||
|
||||
/* either no address requested or cant fit in requested address hole */
|
||||
addr = (mm->free_area_cache - len) & HPAGE_MASK;
|
||||
do {
|
||||
/*
|
||||
* Lookup failure means no vma is above this address,
|
||||
* i.e. return with success:
|
||||
*/
|
||||
if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
|
||||
return addr;
|
||||
|
||||
/*
|
||||
* new region fits between prev_vma->vm_end and
|
||||
* vma->vm_start, use it:
|
||||
*/
|
||||
if (addr + len <= vma->vm_start &&
|
||||
(!prev_vma || (addr >= prev_vma->vm_end))) {
|
||||
/* remember the address as a hint for next time */
|
||||
mm->cached_hole_size = largest_hole;
|
||||
return (mm->free_area_cache = addr);
|
||||
} else {
|
||||
/* pull free_area_cache down to the first hole */
|
||||
if (mm->free_area_cache == vma->vm_end) {
|
||||
mm->free_area_cache = vma->vm_start;
|
||||
mm->cached_hole_size = largest_hole;
|
||||
}
|
||||
}
|
||||
|
||||
/* remember the largest hole we saw so far */
|
||||
if (addr + largest_hole < vma->vm_start)
|
||||
largest_hole = vma->vm_start - addr;
|
||||
|
||||
/* try just below the current vma->vm_start */
|
||||
addr = (vma->vm_start - len) & HPAGE_MASK;
|
||||
} while (len <= vma->vm_start);
|
||||
|
||||
fail:
|
||||
/*
|
||||
* if hint left us with no space for the requested
|
||||
* mapping then try again:
|
||||
*/
|
||||
if (first_time) {
|
||||
mm->free_area_cache = base;
|
||||
largest_hole = 0;
|
||||
first_time = 0;
|
||||
goto try_again;
|
||||
}
|
||||
/*
|
||||
* A failed mmap() very likely causes application failure,
|
||||
* so fall back to the bottom-up function here. This scenario
|
||||
* can happen with large stack limits and large mmap()
|
||||
* allocations.
|
||||
*/
|
||||
mm->free_area_cache = TASK_UNMAPPED_BASE;
|
||||
mm->cached_hole_size = ~0UL;
|
||||
addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
|
||||
len, pgoff, flags);
|
||||
|
||||
/*
|
||||
* Restore the topdown base:
|
||||
*/
|
||||
mm->free_area_cache = base;
|
||||
mm->cached_hole_size = ~0UL;
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
unsigned long
|
||||
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
||||
unsigned long len, unsigned long pgoff, unsigned long flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
if (len & ~HPAGE_MASK)
|
||||
return -EINVAL;
|
||||
if (len > TASK_SIZE)
|
||||
return -ENOMEM;
|
||||
|
||||
if (flags & MAP_FIXED) {
|
||||
if (prepare_hugepage_range(addr, len))
|
||||
return -EINVAL;
|
||||
return addr;
|
||||
}
|
||||
|
||||
if (addr) {
|
||||
addr = ALIGN(addr, HPAGE_SIZE);
|
||||
vma = find_vma(mm, addr);
|
||||
if (TASK_SIZE - len >= addr &&
|
||||
(!vma || addr + len <= vma->vm_start))
|
||||
return addr;
|
||||
}
|
||||
if (mm->get_unmapped_area == arch_get_unmapped_area)
|
||||
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
|
||||
pgoff, flags);
|
||||
else
|
||||
return hugetlb_get_unmapped_area_topdown(file, addr, len,
|
||||
pgoff, flags);
|
||||
}
|
||||
|
||||
#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
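For context, these hooks are reached from an ordinary mmap() of a hugetlbfs file; the core VM asks the architecture for a suitably aligned address via hugetlb_get_unmapped_area(). A hedged user-space sketch (the /mnt/huge mount point and file name are assumptions, and the 4 MB length assumes non-PAE i386 huge pages):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define LENGTH (4UL * 1024 * 1024)      /* one huge page (assumed 4 MB) */

int main(void)
{
        /* Assumes a hugetlbfs mount at /mnt/huge with reserved huge pages. */
        int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* This mmap() ends up in hugetlb_get_unmapped_area() to pick the address. */
        char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return 1;
        }

        p[0] = 1;                       /* touch the mapping */
        munmap(p, LENGTH);
        close(fd);
        return 0;
}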
|
||||
|
858
arch/x86/mm/init_32.c
Normal file
@@ -0,0 +1,858 @@
|
||||
/*
|
||||
* linux/arch/i386/mm/init.c
|
||||
*
|
||||
* Copyright (C) 1995 Linus Torvalds
|
||||
*
|
||||
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/signal.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/poison.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/efi.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/cpumask.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/system.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/dma.h>
|
||||
#include <asm/fixmap.h>
|
||||
#include <asm/e820.h>
|
||||
#include <asm/apic.h>
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/sections.h>
|
||||
#include <asm/paravirt.h>
|
||||
|
||||
unsigned int __VMALLOC_RESERVE = 128 << 20;
|
||||
|
||||
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
|
||||
unsigned long highstart_pfn, highend_pfn;
|
||||
|
||||
static int noinline do_test_wp_bit(void);
|
||||
|
||||
/*
|
||||
* Creates a middle page table and puts a pointer to it in the
|
||||
* given global directory entry. This only returns the gd entry
|
||||
* in non-PAE compilation mode, since the middle layer is folded.
|
||||
*/
|
||||
static pmd_t * __init one_md_table_init(pgd_t *pgd)
|
||||
{
|
||||
pud_t *pud;
|
||||
pmd_t *pmd_table;
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
|
||||
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
|
||||
|
||||
paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
|
||||
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
|
||||
pud = pud_offset(pgd, 0);
|
||||
if (pmd_table != pmd_offset(pud, 0))
|
||||
BUG();
|
||||
}
|
||||
#endif
|
||||
pud = pud_offset(pgd, 0);
|
||||
pmd_table = pmd_offset(pud, 0);
|
||||
return pmd_table;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a page table and place a pointer to it in a middle page
|
||||
* directory entry.
|
||||
*/
|
||||
static pte_t * __init one_page_table_init(pmd_t *pmd)
|
||||
{
|
||||
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
|
||||
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
|
||||
|
||||
paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
|
||||
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
|
||||
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
|
||||
}
|
||||
|
||||
return pte_offset_kernel(pmd, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function initializes a certain range of kernel virtual memory
|
||||
* with new bootmem page tables, everywhere page tables are missing in
|
||||
* the given range.
|
||||
*/
|
||||
|
||||
/*
|
||||
* NOTE: The pagetables are allocated contiguous on the physical space
|
||||
* so we can cache the place of the first one and move around without
|
||||
* checking the pgd every time.
|
||||
*/
|
||||
static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pmd_t *pmd;
|
||||
int pgd_idx, pmd_idx;
|
||||
unsigned long vaddr;
|
||||
|
||||
vaddr = start;
|
||||
pgd_idx = pgd_index(vaddr);
|
||||
pmd_idx = pmd_index(vaddr);
|
||||
pgd = pgd_base + pgd_idx;
|
||||
|
||||
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
|
||||
pmd = one_md_table_init(pgd);
|
||||
pmd = pmd + pmd_index(vaddr);
|
||||
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
|
||||
one_page_table_init(pmd);
|
||||
|
||||
vaddr += PMD_SIZE;
|
||||
}
|
||||
pmd_idx = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int is_kernel_text(unsigned long addr)
|
||||
{
|
||||
if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This maps the physical memory to kernel virtual address space, a total
|
||||
* of max_low_pfn pages, by creating page tables starting from address
|
||||
* PAGE_OFFSET.
|
||||
*/
|
||||
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
|
||||
{
|
||||
unsigned long pfn;
|
||||
pgd_t *pgd;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
int pgd_idx, pmd_idx, pte_ofs;
|
||||
|
||||
pgd_idx = pgd_index(PAGE_OFFSET);
|
||||
pgd = pgd_base + pgd_idx;
|
||||
pfn = 0;
|
||||
|
||||
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
|
||||
pmd = one_md_table_init(pgd);
|
||||
if (pfn >= max_low_pfn)
|
||||
continue;
|
||||
for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
|
||||
unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
|
||||
|
||||
/* Map with big pages if possible, otherwise create normal page tables. */
|
||||
if (cpu_has_pse) {
|
||||
unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
|
||||
if (is_kernel_text(address) || is_kernel_text(address2))
|
||||
set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
|
||||
else
|
||||
set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
|
||||
|
||||
pfn += PTRS_PER_PTE;
|
||||
} else {
|
||||
pte = one_page_table_init(pmd);
|
||||
|
||||
for (pte_ofs = 0;
|
||||
pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
|
||||
pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
|
||||
if (is_kernel_text(address))
|
||||
set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
|
||||
else
|
||||
set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
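With PSE, each PMD entry in the loop above maps PTRS_PER_PTE (1024) consecutive 4 kB pages as one 4 MB large page, so no PTE pages are needed for low memory. A small arithmetic sketch, assuming the usual 896 MB i386 lowmem limit purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long page_size   = 4096;          /* 4 kB base pages */
        unsigned long ptes_per_pt = 1024;          /* PTRS_PER_PTE on non-PAE i386 */
        unsigned long lowmem      = 896UL << 20;   /* assumed lowmem, for illustration */

        unsigned long pages = lowmem / page_size;
        unsigned long pmds  = pages / ptes_per_pt;  /* 4 MB entries with PSE */

        printf("4 kB pages to map:      %lu\n", pages);
        printf("PMD entries with PSE:   %lu (no PTE pages at all)\n", pmds);
        printf("PTE pages without PSE:  %lu (one per 4 MB of lowmem)\n", pmds);
        return 0;
}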
|
||||
|
||||
static inline int page_kills_ppro(unsigned long pagenr)
|
||||
{
|
||||
if (pagenr >= 0x70000 && pagenr <= 0x7003F)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int page_is_ram(unsigned long pagenr)
|
||||
{
|
||||
int i;
|
||||
unsigned long addr, end;
|
||||
|
||||
if (efi_enabled) {
|
||||
efi_memory_desc_t *md;
|
||||
void *p;
|
||||
|
||||
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
|
||||
md = p;
|
||||
if (!is_available_memory(md))
|
||||
continue;
|
||||
addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
|
||||
end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
|
||||
|
||||
if ((pagenr >= addr) && (pagenr < end))
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < e820.nr_map; i++) {
|
||||
|
||||
if (e820.map[i].type != E820_RAM) /* not usable memory */
|
||||
continue;
|
||||
/*
|
||||
* !!!FIXME!!! Some BIOSen report areas as RAM that
|
||||
* are not. Notably the 640->1Mb area. We need a sanity
|
||||
* check here.
|
||||
*/
|
||||
addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
|
||||
end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
|
||||
if ((pagenr >= addr) && (pagenr < end))
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
pte_t *kmap_pte;
|
||||
pgprot_t kmap_prot;
|
||||
|
||||
#define kmap_get_fixmap_pte(vaddr) \
|
||||
pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
|
||||
|
||||
static void __init kmap_init(void)
|
||||
{
|
||||
unsigned long kmap_vstart;
|
||||
|
||||
/* cache the first kmap pte */
|
||||
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
|
||||
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
|
||||
|
||||
kmap_prot = PAGE_KERNEL;
|
||||
}
|
||||
|
||||
static void __init permanent_kmaps_init(pgd_t *pgd_base)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
unsigned long vaddr;
|
||||
|
||||
vaddr = PKMAP_BASE;
|
||||
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
|
||||
|
||||
pgd = swapper_pg_dir + pgd_index(vaddr);
|
||||
pud = pud_offset(pgd, vaddr);
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
pte = pte_offset_kernel(pmd, vaddr);
|
||||
pkmap_page_table = pte;
|
||||
}
|
||||
|
||||
static void __meminit free_new_highpage(struct page *page)
|
||||
{
|
||||
init_page_count(page);
|
||||
__free_page(page);
|
||||
totalhigh_pages++;
|
||||
}
|
||||
|
||||
void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
|
||||
{
|
||||
if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
|
||||
ClearPageReserved(page);
|
||||
free_new_highpage(page);
|
||||
} else
|
||||
SetPageReserved(page);
|
||||
}
|
||||
|
||||
static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
|
||||
{
|
||||
free_new_highpage(page);
|
||||
totalram_pages++;
|
||||
#ifdef CONFIG_FLATMEM
|
||||
max_mapnr = max(pfn, max_mapnr);
|
||||
#endif
|
||||
num_physpages++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Not currently handling the NUMA case.
|
||||
* Assuming a single node, and that any memory
|
||||
* added dynamically and onlined here
|
||||
* is in HIGHMEM
|
||||
*/
|
||||
void __meminit online_page(struct page *page)
|
||||
{
|
||||
ClearPageReserved(page);
|
||||
add_one_highpage_hotplug(page, page_to_pfn(page));
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
extern void set_highmem_pages_init(int);
|
||||
#else
|
||||
static void __init set_highmem_pages_init(int bad_ppro)
|
||||
{
|
||||
int pfn;
|
||||
for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
|
||||
add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
|
||||
totalram_pages += totalhigh_pages;
|
||||
}
|
||||
#endif /* CONFIG_NUMA */
|
||||
|
||||
#else
|
||||
#define kmap_init() do { } while (0)
|
||||
#define permanent_kmaps_init(pgd_base) do { } while (0)
|
||||
#define set_highmem_pages_init(bad_ppro) do { } while (0)
|
||||
#endif /* CONFIG_HIGHMEM */
|
||||
|
||||
unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
|
||||
EXPORT_SYMBOL(__PAGE_KERNEL);
|
||||
unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
extern void __init remap_numa_kva(void);
|
||||
#else
|
||||
#define remap_numa_kva() do {} while (0)
|
||||
#endif
|
||||
|
||||
void __init native_pagetable_setup_start(pgd_t *base)
|
||||
{
|
||||
#ifdef CONFIG_X86_PAE
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Init entries of the first-level page table to the
|
||||
* zero page, if they haven't already been set up.
|
||||
*
|
||||
* In a normal native boot, we'll be running on a
|
||||
* pagetable rooted in swapper_pg_dir, but not in PAE
|
||||
* mode, so this will end up clobbering the mappings
|
||||
* for the lower 24Mbytes of the address space,
|
||||
* without affecting the kernel address space.
|
||||
*/
|
||||
for (i = 0; i < USER_PTRS_PER_PGD; i++)
|
||||
set_pgd(&base[i],
|
||||
__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
|
||||
|
||||
/* Make sure kernel address space is empty so that a pagetable
|
||||
will be allocated for it. */
|
||||
memset(&base[USER_PTRS_PER_PGD], 0,
|
||||
KERNEL_PGD_PTRS * sizeof(pgd_t));
|
||||
#else
|
||||
paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
|
||||
#endif
|
||||
}
|
||||
|
||||
void __init native_pagetable_setup_done(pgd_t *base)
|
||||
{
|
||||
#ifdef CONFIG_X86_PAE
|
||||
/*
|
||||
* Add low memory identity-mappings - SMP needs it when
|
||||
* starting up on an AP from real-mode. In the non-PAE
|
||||
* case we already have these mappings through head.S.
|
||||
* All user-space mappings are explicitly cleared after
|
||||
* SMP startup.
|
||||
*/
|
||||
set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Build a proper pagetable for the kernel mappings. Up until this
|
||||
* point, we've been running on some set of pagetables constructed by
|
||||
* the boot process.
|
||||
*
|
||||
* If we're booting on native hardware, this will be a pagetable
|
||||
* constructed in arch/i386/kernel/head.S, and not running in PAE mode
|
||||
* (even if we'll end up running in PAE). The root of the pagetable
|
||||
* will be swapper_pg_dir.
|
||||
*
|
||||
* If we're booting paravirtualized under a hypervisor, then there are
|
||||
* more options: we may already be running PAE, and the pagetable may
|
||||
* or may not be based in swapper_pg_dir. In any case,
|
||||
* paravirt_pagetable_setup_start() will set up swapper_pg_dir
|
||||
* appropriately for the rest of the initialization to work.
|
||||
*
|
||||
* In general, pagetable_init() assumes that the pagetable may already
|
||||
* be partially populated, and so it avoids stomping on any existing
|
||||
* mappings.
|
||||
*/
|
||||
static void __init pagetable_init (void)
|
||||
{
|
||||
unsigned long vaddr, end;
|
||||
pgd_t *pgd_base = swapper_pg_dir;
|
||||
|
||||
paravirt_pagetable_setup_start(pgd_base);
|
||||
|
||||
/* Enable PSE if available */
|
||||
if (cpu_has_pse)
|
||||
set_in_cr4(X86_CR4_PSE);
|
||||
|
||||
/* Enable PGE if available */
|
||||
if (cpu_has_pge) {
|
||||
set_in_cr4(X86_CR4_PGE);
|
||||
__PAGE_KERNEL |= _PAGE_GLOBAL;
|
||||
__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
|
||||
}
|
||||
|
||||
kernel_physical_mapping_init(pgd_base);
|
||||
remap_numa_kva();
|
||||
|
||||
/*
|
||||
* Fixed mappings, only the page table structure has to be
|
||||
* created - mappings will be set by set_fixmap():
|
||||
*/
|
||||
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
|
||||
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
|
||||
page_table_range_init(vaddr, end, pgd_base);
|
||||
|
||||
permanent_kmaps_init(pgd_base);
|
||||
|
||||
paravirt_pagetable_setup_done(pgd_base);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
|
||||
/*
|
||||
* Swap suspend & friends need this for resume because things like the intel-agp
|
||||
* driver might have split up a kernel 4MB mapping.
|
||||
*/
|
||||
char __nosavedata swsusp_pg_dir[PAGE_SIZE]
|
||||
__attribute__ ((aligned (PAGE_SIZE)));
|
||||
|
||||
static inline void save_pg_dir(void)
|
||||
{
|
||||
memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
|
||||
}
|
||||
#else
|
||||
static inline void save_pg_dir(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
void zap_low_mappings (void)
|
||||
{
|
||||
int i;
|
||||
|
||||
save_pg_dir();
|
||||
|
||||
/*
|
||||
* Zap initial low-memory mappings.
|
||||
*
|
||||
* Note that "pgd_clear()" doesn't do it for
|
||||
* us, because pgd_clear() is a no-op on i386.
|
||||
*/
|
||||
for (i = 0; i < USER_PTRS_PER_PGD; i++)
|
||||
#ifdef CONFIG_X86_PAE
|
||||
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
|
||||
#else
|
||||
set_pgd(swapper_pg_dir+i, __pgd(0));
|
||||
#endif
|
||||
flush_tlb_all();
|
||||
}
|
||||
|
||||
int nx_enabled = 0;
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
|
||||
static int disable_nx __initdata = 0;
|
||||
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
|
||||
EXPORT_SYMBOL_GPL(__supported_pte_mask);
|
||||
|
||||
/*
|
||||
* noexec = on|off
|
||||
*
|
||||
* Control non executable mappings.
|
||||
*
|
||||
* on Enable
|
||||
* off Disable
|
||||
*/
|
||||
static int __init noexec_setup(char *str)
|
||||
{
|
||||
if (!str || !strcmp(str, "on")) {
|
||||
if (cpu_has_nx) {
|
||||
__supported_pte_mask |= _PAGE_NX;
|
||||
disable_nx = 0;
|
||||
}
|
||||
} else if (!strcmp(str,"off")) {
|
||||
disable_nx = 1;
|
||||
__supported_pte_mask &= ~_PAGE_NX;
|
||||
} else
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
early_param("noexec", noexec_setup);
|
||||
|
||||
static void __init set_nx(void)
|
||||
{
|
||||
unsigned int v[4], l, h;
|
||||
|
||||
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
|
||||
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
|
||||
if ((v[3] & (1 << 20)) && !disable_nx) {
|
||||
rdmsr(MSR_EFER, l, h);
|
||||
l |= EFER_NX;
|
||||
wrmsr(MSR_EFER, l, h);
|
||||
nx_enabled = 1;
|
||||
__supported_pte_mask |= _PAGE_NX;
|
||||
}
|
||||
}
|
||||
}
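set_nx() probes extended CPUID leaf 0x80000001 and tests EDX bit 20 before setting EFER.NX. The same capability bit can be read from user space; a hedged sketch using GCC's cpuid.h (this only reports CPU support, not whether the running kernel enabled NX):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Extended leaf 0x80000001, EDX bit 20 = NX, the same bit set_nx() tests. */
        if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
                printf("extended CPUID leaf not available\n");
                return 1;
        }
        printf("NX bit %s by this CPU\n",
               (edx & (1u << 20)) ? "supported" : "not supported");
        return 0;
}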
|
||||
|
||||
/*
|
||||
* Enables/disables executability of a given kernel page and
|
||||
* returns the previous setting.
|
||||
*/
|
||||
int __init set_kernel_exec(unsigned long vaddr, int enable)
|
||||
{
|
||||
pte_t *pte;
|
||||
int ret = 1;
|
||||
|
||||
if (!nx_enabled)
|
||||
goto out;
|
||||
|
||||
pte = lookup_address(vaddr);
|
||||
BUG_ON(!pte);
|
||||
|
||||
if (!pte_exec_kernel(*pte))
|
||||
ret = 0;
|
||||
|
||||
if (enable)
|
||||
pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
|
||||
else
|
||||
pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
|
||||
pte_update_defer(&init_mm, vaddr, pte);
|
||||
__flush_tlb_all();
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* paging_init() sets up the page tables - note that the first 8MB are
|
||||
* already mapped by head.S.
|
||||
*
|
||||
* This routine also unmaps the page at virtual kernel address 0, so
|
||||
* that we can trap those pesky NULL-reference errors in the kernel.
|
||||
*/
|
||||
void __init paging_init(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_PAE
|
||||
set_nx();
|
||||
if (nx_enabled)
|
||||
printk("NX (Execute Disable) protection: active\n");
|
||||
#endif
|
||||
|
||||
pagetable_init();
|
||||
|
||||
load_cr3(swapper_pg_dir);
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
/*
|
||||
* We will bail out later - printk doesn't work right now so
|
||||
* the user would just see a hanging kernel.
|
||||
*/
|
||||
if (cpu_has_pae)
|
||||
set_in_cr4(X86_CR4_PAE);
|
||||
#endif
|
||||
__flush_tlb_all();
|
||||
|
||||
kmap_init();
|
||||
}
|
||||
|
||||
/*
|
||||
* Test if the WP bit works in supervisor mode. It isn't supported on 386's
|
||||
* and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
|
||||
* used to involve black magic jumps to work around some nasty CPU bugs,
|
||||
* but fortunately the switch to using exceptions got rid of all that.
|
||||
*/
|
||||
|
||||
static void __init test_wp_bit(void)
|
||||
{
|
||||
printk("Checking if this processor honours the WP bit even in supervisor mode... ");
|
||||
|
||||
/* Any page-aligned address will do, the test is non-destructive */
|
||||
__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
|
||||
boot_cpu_data.wp_works_ok = do_test_wp_bit();
|
||||
clear_fixmap(FIX_WP_TEST);
|
||||
|
||||
if (!boot_cpu_data.wp_works_ok) {
|
||||
printk("No.\n");
|
||||
#ifdef CONFIG_X86_WP_WORKS_OK
|
||||
panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
|
||||
#endif
|
||||
} else {
|
||||
printk("Ok.\n");
|
||||
}
|
||||
}
|
||||
|
||||
static struct kcore_list kcore_mem, kcore_vmalloc;
|
||||
|
||||
void __init mem_init(void)
|
||||
{
|
||||
extern int ppro_with_ram_bug(void);
|
||||
int codesize, reservedpages, datasize, initsize;
|
||||
int tmp;
|
||||
int bad_ppro;
|
||||
|
||||
#ifdef CONFIG_FLATMEM
|
||||
BUG_ON(!mem_map);
|
||||
#endif
|
||||
|
||||
bad_ppro = ppro_with_ram_bug();
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
/* check that fixmap and pkmap do not overlap */
|
||||
if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
|
||||
printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
|
||||
printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
|
||||
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
|
||||
BUG();
|
||||
}
|
||||
#endif
|
||||
|
||||
/* this will put all low memory onto the freelists */
|
||||
totalram_pages += free_all_bootmem();
|
||||
|
||||
reservedpages = 0;
|
||||
for (tmp = 0; tmp < max_low_pfn; tmp++)
|
||||
/*
|
||||
* Only count reserved RAM pages
|
||||
*/
|
||||
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
|
||||
reservedpages++;
|
||||
|
||||
set_highmem_pages_init(bad_ppro);
|
||||
|
||||
codesize = (unsigned long) &_etext - (unsigned long) &_text;
|
||||
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
|
||||
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
|
||||
|
||||
kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
|
||||
kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
|
||||
VMALLOC_END-VMALLOC_START);
|
||||
|
||||
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
|
||||
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
|
||||
num_physpages << (PAGE_SHIFT-10),
|
||||
codesize >> 10,
|
||||
reservedpages << (PAGE_SHIFT-10),
|
||||
datasize >> 10,
|
||||
initsize >> 10,
|
||||
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
|
||||
);
|
||||
|
||||
#if 1 /* double-sanity-check paranoia */
|
||||
printk("virtual kernel memory layout:\n"
|
||||
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
|
||||
#endif
|
||||
" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
|
||||
" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
|
||||
" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
|
||||
" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
|
||||
" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
|
||||
FIXADDR_START, FIXADDR_TOP,
|
||||
(FIXADDR_TOP - FIXADDR_START) >> 10,
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
|
||||
(LAST_PKMAP*PAGE_SIZE) >> 10,
|
||||
#endif
|
||||
|
||||
VMALLOC_START, VMALLOC_END,
|
||||
(VMALLOC_END - VMALLOC_START) >> 20,
|
||||
|
||||
(unsigned long)__va(0), (unsigned long)high_memory,
|
||||
((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
|
||||
|
||||
(unsigned long)&__init_begin, (unsigned long)&__init_end,
|
||||
((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
|
||||
|
||||
(unsigned long)&_etext, (unsigned long)&_edata,
|
||||
((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
|
||||
|
||||
(unsigned long)&_text, (unsigned long)&_etext,
|
||||
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
|
||||
BUG_ON(VMALLOC_END > PKMAP_BASE);
|
||||
#endif
|
||||
BUG_ON(VMALLOC_START > VMALLOC_END);
|
||||
BUG_ON((unsigned long)high_memory > VMALLOC_START);
|
||||
#endif /* double-sanity-check paranoia */
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
if (!cpu_has_pae)
|
||||
panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
|
||||
#endif
|
||||
if (boot_cpu_data.wp_works_ok < 0)
|
||||
test_wp_bit();
|
||||
|
||||
/*
|
||||
* Subtle. SMP is doing its boot stuff late (because it has to
|
||||
* fork idle threads) - but it also needs low mappings for the
|
||||
* protected-mode entry to work. We zap these entries only after
|
||||
* the WP-bit has been tested.
|
||||
*/
|
||||
#ifndef CONFIG_SMP
|
||||
zap_low_mappings();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
int arch_add_memory(int nid, u64 start, u64 size)
|
||||
{
|
||||
struct pglist_data *pgdata = NODE_DATA(nid);
|
||||
struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
|
||||
return __add_pages(zone, start_pfn, nr_pages);
|
||||
}
|
||||
|
||||
int remove_memory(u64 start, u64 size)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(remove_memory);
|
||||
#endif
|
||||
|
||||
struct kmem_cache *pmd_cache;
|
||||
|
||||
void __init pgtable_cache_init(void)
|
||||
{
|
||||
size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
|
||||
|
||||
if (PTRS_PER_PMD > 1) {
|
||||
pmd_cache = kmem_cache_create("pmd",
|
||||
PTRS_PER_PMD*sizeof(pmd_t),
|
||||
PTRS_PER_PMD*sizeof(pmd_t),
|
||||
SLAB_PANIC,
|
||||
pmd_ctor);
|
||||
if (!SHARED_KERNEL_PMD) {
|
||||
/* If we're in PAE mode and have a non-shared
|
||||
kernel pmd, then the pgd size must be a
|
||||
page size. This is because the pgd_list
|
||||
links through the page structure, so there
|
||||
can only be one pgd per page for this to
|
||||
work. */
|
||||
pgd_size = PAGE_SIZE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function cannot be __init, since exceptions don't work in that
|
||||
* section. Put this after the callers, so that it cannot be inlined.
|
||||
*/
|
||||
static int noinline do_test_wp_bit(void)
|
||||
{
|
||||
char tmp_reg;
|
||||
int flag;
|
||||
|
||||
__asm__ __volatile__(
|
||||
" movb %0,%1 \n"
|
||||
"1: movb %1,%0 \n"
|
||||
" xorl %2,%2 \n"
|
||||
"2: \n"
|
||||
".section __ex_table,\"a\"\n"
|
||||
" .align 4 \n"
|
||||
" .long 1b,2b \n"
|
||||
".previous \n"
|
||||
:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
|
||||
"=q" (tmp_reg),
|
||||
"=r" (flag)
|
||||
:"2" (1)
|
||||
:"memory");
|
||||
|
||||
return flag;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_RODATA
|
||||
|
||||
void mark_rodata_ro(void)
|
||||
{
|
||||
unsigned long start = PFN_ALIGN(_text);
|
||||
unsigned long size = PFN_ALIGN(_etext) - start;
|
||||
|
||||
#ifndef CONFIG_KPROBES
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
/* It must still be possible to apply SMP alternatives. */
|
||||
if (num_possible_cpus() <= 1)
|
||||
#endif
|
||||
{
|
||||
change_page_attr(virt_to_page(start),
|
||||
size >> PAGE_SHIFT, PAGE_KERNEL_RX);
|
||||
printk("Write protecting the kernel text: %luk\n", size >> 10);
|
||||
}
|
||||
#endif
|
||||
start += size;
|
||||
size = (unsigned long)__end_rodata - start;
|
||||
change_page_attr(virt_to_page(start),
|
||||
size >> PAGE_SHIFT, PAGE_KERNEL_RO);
|
||||
printk("Write protecting the kernel read-only data: %luk\n",
|
||||
size >> 10);
|
||||
|
||||
/*
|
||||
* change_page_attr() requires a global_flush_tlb() call after it.
|
||||
* We do this after the printk so that if something went wrong in the
|
||||
* change, the printk gets out at least to give a better debug hint
|
||||
* of who is the culprit.
|
||||
*/
|
||||
global_flush_tlb();
|
||||
}
|
||||
#endif
|
||||
|
||||
void free_init_pages(char *what, unsigned long begin, unsigned long end)
|
||||
{
|
||||
unsigned long addr;
|
||||
|
||||
for (addr = begin; addr < end; addr += PAGE_SIZE) {
|
||||
ClearPageReserved(virt_to_page(addr));
|
||||
init_page_count(virt_to_page(addr));
|
||||
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
|
||||
free_page(addr);
|
||||
totalram_pages++;
|
||||
}
|
||||
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
|
||||
}
|
||||
|
||||
void free_initmem(void)
|
||||
{
|
||||
free_init_pages("unused kernel memory",
|
||||
(unsigned long)(&__init_begin),
|
||||
(unsigned long)(&__init_end));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INITRD
|
||||
void free_initrd_mem(unsigned long start, unsigned long end)
|
||||
{
|
||||
free_init_pages("initrd memory", start, end);
|
||||
}
|
||||
#endif
|
||||
|
274
arch/x86/mm/ioremap_32.c
Normal file
@@ -0,0 +1,274 @@
|
||||
/*
|
||||
* arch/i386/mm/ioremap.c
|
||||
*
|
||||
* Re-map IO memory to kernel address space so that we can access it.
|
||||
* This is needed for high PCI addresses that aren't mapped in the
|
||||
* 640k-1MB IO memory area on PC's
|
||||
*
|
||||
* (C) Copyright 1995 1996 Linus Torvalds
|
||||
*/
|
||||
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/io.h>
|
||||
#include <asm/fixmap.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
#define ISA_START_ADDRESS 0xa0000
|
||||
#define ISA_END_ADDRESS 0x100000
|
||||
|
||||
/*
|
||||
* Generic mapping function (not visible outside):
|
||||
*/
|
||||
|
||||
/*
|
||||
* Remap an arbitrary physical address space into the kernel virtual
|
||||
* address space. Needed when the kernel wants to access high addresses
|
||||
* directly.
|
||||
*
|
||||
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
|
||||
* have to convert them into an offset in a page-aligned mapping, but the
|
||||
* caller shouldn't need to know that small detail.
|
||||
*/
|
||||
void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
|
||||
{
|
||||
void __iomem * addr;
|
||||
struct vm_struct * area;
|
||||
unsigned long offset, last_addr;
|
||||
pgprot_t prot;
|
||||
|
||||
/* Don't allow wraparound or zero size */
|
||||
last_addr = phys_addr + size - 1;
|
||||
if (!size || last_addr < phys_addr)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Don't remap the low PCI/ISA area, it's always mapped..
|
||||
*/
|
||||
if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
|
||||
return (void __iomem *) phys_to_virt(phys_addr);
|
||||
|
||||
/*
|
||||
* Don't allow anybody to remap normal RAM that we're using..
|
||||
*/
|
||||
if (phys_addr <= virt_to_phys(high_memory - 1)) {
|
||||
char *t_addr, *t_end;
|
||||
struct page *page;
|
||||
|
||||
t_addr = __va(phys_addr);
|
||||
t_end = t_addr + (size - 1);
|
||||
|
||||
for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
|
||||
if(!PageReserved(page))
|
||||
return NULL;
|
||||
}
|
||||
|
||||
prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
|
||||
| _PAGE_ACCESSED | flags);
|
||||
|
||||
/*
|
||||
* Mappings have to be page-aligned
|
||||
*/
|
||||
offset = phys_addr & ~PAGE_MASK;
|
||||
phys_addr &= PAGE_MASK;
|
||||
size = PAGE_ALIGN(last_addr+1) - phys_addr;
|
||||
|
||||
/*
|
||||
* Ok, go for it..
|
||||
*/
|
||||
area = get_vm_area(size, VM_IOREMAP | (flags << 20));
|
||||
if (!area)
|
||||
return NULL;
|
||||
area->phys_addr = phys_addr;
|
||||
addr = (void __iomem *) area->addr;
|
||||
if (ioremap_page_range((unsigned long) addr,
|
||||
(unsigned long) addr + size, phys_addr, prot)) {
|
||||
vunmap((void __force *) addr);
|
||||
return NULL;
|
||||
}
|
||||
return (void __iomem *) (offset + (char __iomem *)addr);
|
||||
}
|
||||
EXPORT_SYMBOL(__ioremap);
|
||||
|
||||
/**
|
||||
* ioremap_nocache - map bus memory into CPU space
|
||||
* @offset: bus address of the memory
|
||||
* @size: size of the resource to map
|
||||
*
|
||||
* ioremap_nocache performs a platform specific sequence of operations to
|
||||
* make bus memory CPU accessible via the readb/readw/readl/writeb/
|
||||
* writew/writel functions and the other mmio helpers. The returned
|
||||
* address is not guaranteed to be usable directly as a virtual
|
||||
* address.
|
||||
*
|
||||
* This version of ioremap ensures that the memory is marked uncachable
|
||||
* on the CPU as well as honouring existing caching rules from things like
|
||||
* the PCI bus. Note that there are other caches and buffers on many
|
||||
* busses. In particular driver authors should read up on PCI writes
|
||||
*
|
||||
* It's useful if some control registers are in such an area and
|
||||
* write combining or read caching is not desirable:
|
||||
*
|
||||
* Must be freed with iounmap.
|
||||
*/
|
||||
|
||||
void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
|
||||
{
|
||||
unsigned long last_addr;
|
||||
void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
|
||||
if (!p)
|
||||
return p;
|
||||
|
||||
/* Guaranteed to be > phys_addr, as per __ioremap() */
|
||||
last_addr = phys_addr + size - 1;
|
||||
|
||||
if (last_addr < virt_to_phys(high_memory) - 1) {
|
||||
struct page *ppage = virt_to_page(__va(phys_addr));
|
||||
unsigned long npages;
|
||||
|
||||
phys_addr &= PAGE_MASK;
|
||||
|
||||
/* This might overflow and become zero.. */
|
||||
last_addr = PAGE_ALIGN(last_addr);
|
||||
|
||||
/* .. but that's ok, because modulo-2**n arithmetic will make
|
||||
* the page-aligned "last - first" come out right.
|
||||
*/
|
||||
npages = (last_addr - phys_addr) >> PAGE_SHIFT;
|
||||
|
||||
if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
|
||||
iounmap(p);
|
||||
p = NULL;
|
||||
}
|
||||
global_flush_tlb();
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap_nocache);
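As the comment above says, the returned cookie is only meant for the mmio accessors and must be released with iounmap(). A minimal driver-style sketch of that usage (the physical base address and register offset are placeholders, not real hardware):

#include <linux/io.h>
#include <linux/errno.h>

#define DEMO_PHYS_BASE  0xfebf0000UL    /* placeholder MMIO base */
#define DEMO_REG_CTRL   0x04            /* placeholder register offset */

static int demo_probe(void)
{
        void __iomem *regs = ioremap_nocache(DEMO_PHYS_BASE, 0x1000);

        if (!regs)
                return -ENOMEM;

        writel(0x1, regs + DEMO_REG_CTRL);      /* poke a control register */
        (void)readl(regs + DEMO_REG_CTRL);      /* read it back, uncached */

        iounmap(regs);                          /* matching unmap, as required */
        return 0;
}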
|
||||
|
||||
/**
|
||||
* iounmap - Free an IO remapping
|
||||
* @addr: virtual address from ioremap_*
|
||||
*
|
||||
* Caller must ensure there is only one unmapping for the same pointer.
|
||||
*/
|
||||
void iounmap(volatile void __iomem *addr)
|
||||
{
|
||||
struct vm_struct *p, *o;
|
||||
|
||||
if ((void __force *)addr <= high_memory)
|
||||
return;
|
||||
|
||||
/*
|
||||
* __ioremap special-cases the PCI/ISA range by not instantiating a
|
||||
* vm_area and by simply returning an address into the kernel mapping
|
||||
* of ISA space. So handle that here.
|
||||
*/
|
||||
if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
|
||||
addr < phys_to_virt(ISA_END_ADDRESS))
|
||||
return;
|
||||
|
||||
addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
|
||||
|
||||
/* Use the vm area unlocked, assuming the caller
|
||||
ensures there isn't another iounmap for the same address
|
||||
in parallel. Reuse of the virtual address is prevented by
|
||||
leaving it in the global lists until we're done with it.
|
||||
cpa takes care of the direct mappings. */
|
||||
read_lock(&vmlist_lock);
|
||||
for (p = vmlist; p; p = p->next) {
|
||||
if (p->addr == addr)
|
||||
break;
|
||||
}
|
||||
read_unlock(&vmlist_lock);
|
||||
|
||||
if (!p) {
|
||||
printk("iounmap: bad address %p\n", addr);
|
||||
dump_stack();
|
||||
return;
|
||||
}
|
||||
|
||||
/* Reset the direct mapping. Can block */
|
||||
if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
|
||||
change_page_attr(virt_to_page(__va(p->phys_addr)),
|
||||
get_vm_area_size(p) >> PAGE_SHIFT,
|
||||
PAGE_KERNEL);
|
||||
global_flush_tlb();
|
||||
}
|
||||
|
||||
/* Finally remove it */
|
||||
o = remove_vm_area((void *)addr);
|
||||
BUG_ON(p != o || o == NULL);
|
||||
kfree(p);
|
||||
}
|
||||
EXPORT_SYMBOL(iounmap);
|
||||
|
||||
void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
|
||||
{
|
||||
unsigned long offset, last_addr;
|
||||
unsigned int nrpages;
|
||||
enum fixed_addresses idx;
|
||||
|
||||
/* Don't allow wraparound or zero size */
|
||||
last_addr = phys_addr + size - 1;
|
||||
if (!size || last_addr < phys_addr)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Don't remap the low PCI/ISA area, it's always mapped..
|
||||
*/
|
||||
if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
|
||||
return phys_to_virt(phys_addr);
|
||||
|
||||
/*
|
||||
* Mappings have to be page-aligned
|
||||
*/
|
||||
offset = phys_addr & ~PAGE_MASK;
|
||||
phys_addr &= PAGE_MASK;
|
||||
size = PAGE_ALIGN(last_addr) - phys_addr;
|
||||
|
||||
/*
|
||||
* Mappings have to fit in the FIX_BTMAP area.
|
||||
*/
|
||||
nrpages = size >> PAGE_SHIFT;
|
||||
if (nrpages > NR_FIX_BTMAPS)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Ok, go for it..
|
||||
*/
|
||||
idx = FIX_BTMAP_BEGIN;
|
||||
while (nrpages > 0) {
|
||||
set_fixmap(idx, phys_addr);
|
||||
phys_addr += PAGE_SIZE;
|
||||
--idx;
|
||||
--nrpages;
|
||||
}
|
||||
return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
|
||||
}
|
||||
|
||||
void __init bt_iounmap(void *addr, unsigned long size)
|
||||
{
|
||||
unsigned long virt_addr;
|
||||
unsigned long offset;
|
||||
unsigned int nrpages;
|
||||
enum fixed_addresses idx;
|
||||
|
||||
virt_addr = (unsigned long)addr;
|
||||
if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
|
||||
return;
|
||||
offset = virt_addr & ~PAGE_MASK;
|
||||
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
|
||||
|
||||
idx = FIX_BTMAP_BEGIN;
|
||||
while (nrpages > 0) {
|
||||
clear_fixmap(idx);
|
||||
--idx;
|
||||
--nrpages;
|
||||
}
|
||||
}
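bt_ioremap() hands out mappings from the FIX_BTMAP fixmap slots, so it works before the vmalloc-based ioremap() is usable and must be paired with bt_iounmap(). A hedged early-boot sketch of that pairing (the caller, the table address and the 64-byte window are placeholders, and the sketch assumes the declarations from this file are in scope):

/* Early-boot peek at a firmware table, before ioremap() works. */
static void __init demo_early_peek(unsigned long table_phys)
{
        char *va = bt_ioremap(table_phys, 64);

        if (!va)
                return;
        /* ... parse up to 64 bytes of the table here ... */
        bt_iounmap(va, 64);
}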
|
77
arch/x86/mm/mmap_32.c
Normal file
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
* linux/arch/i386/mm/mmap.c
|
||||
*
|
||||
* flexible mmap layout support
|
||||
*
|
||||
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
|
||||
* All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*
|
||||
*
|
||||
* Started by Ingo Molnar <mingo@elte.hu>
|
||||
*/
|
||||
|
||||
#include <linux/personality.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
/*
|
||||
* Top of mmap area (just below the process stack).
|
||||
*
|
||||
* Leave at least a ~128 MB hole.
|
||||
*/
|
||||
#define MIN_GAP (128*1024*1024)
|
||||
#define MAX_GAP (TASK_SIZE/6*5)
|
||||
|
||||
static inline unsigned long mmap_base(struct mm_struct *mm)
|
||||
{
|
||||
unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
|
||||
unsigned long random_factor = 0;
|
||||
|
||||
if (current->flags & PF_RANDOMIZE)
|
||||
random_factor = get_random_int() % (1024*1024);
|
||||
|
||||
if (gap < MIN_GAP)
|
||||
gap = MIN_GAP;
|
||||
else if (gap > MAX_GAP)
|
||||
gap = MAX_GAP;
|
||||
|
||||
return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
|
||||
}
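In other words, the gap below TASK_SIZE is the stack rlimit clamped to [128 MB, 5/6 of TASK_SIZE], optionally shrunk by up to 1 MB of randomization, and the result is page-aligned. A small user-space illustration of the same clamping (the 3 GB TASK_SIZE is the usual i386 split, assumed here):

#include <stdio.h>

#define TASK_SZ  (3UL << 30)            /* assumed 3 GB user/kernel split */
#define MIN_GAP_ (128UL << 20)
#define MAX_GAP_ (TASK_SZ / 6 * 5)
#define PAGE_SZ  4096UL

static unsigned long demo_mmap_base(unsigned long stack_rlimit,
                                    unsigned long random_factor)
{
        unsigned long gap = stack_rlimit;

        if (gap < MIN_GAP_)
                gap = MIN_GAP_;
        else if (gap > MAX_GAP_)
                gap = MAX_GAP_;

        /* round up to a page boundary, like PAGE_ALIGN() */
        return (TASK_SZ - gap - random_factor + PAGE_SZ - 1) & ~(PAGE_SZ - 1);
}

int main(void)
{
        printf("8 MB stack limit -> mmap_base %#lx\n", demo_mmap_base(8UL << 20, 0));
        printf("huge stack limit -> mmap_base %#lx\n", demo_mmap_base(~0UL, 0));
        return 0;
}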
|
||||
|
||||
/*
|
||||
* This function, called very early during the creation of a new
|
||||
* process VM image, sets up which VM layout function to use:
|
||||
*/
|
||||
void arch_pick_mmap_layout(struct mm_struct *mm)
|
||||
{
|
||||
/*
|
||||
* Fall back to the standard layout if the personality
|
||||
* bit is set, or if the expected stack growth is unlimited:
|
||||
*/
|
||||
if (sysctl_legacy_va_layout ||
|
||||
(current->personality & ADDR_COMPAT_LAYOUT) ||
|
||||
current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
|
||||
mm->mmap_base = TASK_UNMAPPED_BASE;
|
||||
mm->get_unmapped_area = arch_get_unmapped_area;
|
||||
mm->unmap_area = arch_unmap_area;
|
||||
} else {
|
||||
mm->mmap_base = mmap_base(mm);
|
||||
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
|
||||
mm->unmap_area = arch_unmap_area_topdown;
|
||||
}
|
||||
}
|
278
arch/x86/mm/pageattr_32.c
Normal file
@@ -0,0 +1,278 @@
|
||||
/*
|
||||
* Copyright 2002 Andi Kleen, SuSE Labs.
|
||||
* Thanks to Ben LaHaise for precious feedback.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/sections.h>
|
||||
|
||||
static DEFINE_SPINLOCK(cpa_lock);
|
||||
static struct list_head df_list = LIST_HEAD_INIT(df_list);
|
||||
|
||||
|
||||
pte_t *lookup_address(unsigned long address)
|
||||
{
|
||||
pgd_t *pgd = pgd_offset_k(address);
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
if (pgd_none(*pgd))
|
||||
return NULL;
|
||||
pud = pud_offset(pgd, address);
|
||||
if (pud_none(*pud))
|
||||
return NULL;
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (pmd_none(*pmd))
|
||||
return NULL;
|
||||
if (pmd_large(*pmd))
|
||||
return (pte_t *)pmd;
|
||||
return pte_offset_kernel(pmd, address);
|
||||
}
|
||||
|
||||
static struct page *split_large_page(unsigned long address, pgprot_t prot,
|
||||
pgprot_t ref_prot)
|
||||
{
|
||||
int i;
|
||||
unsigned long addr;
|
||||
struct page *base;
|
||||
pte_t *pbase;
|
||||
|
||||
spin_unlock_irq(&cpa_lock);
|
||||
base = alloc_pages(GFP_KERNEL, 0);
|
||||
spin_lock_irq(&cpa_lock);
|
||||
if (!base)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* page_private is used to track the number of entries in
|
||||
* the page table page that have non standard attributes.
|
||||
*/
|
||||
SetPagePrivate(base);
|
||||
page_private(base) = 0;
|
||||
|
||||
address = __pa(address);
|
||||
addr = address & LARGE_PAGE_MASK;
|
||||
pbase = (pte_t *)page_address(base);
|
||||
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
|
||||
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
|
||||
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
|
||||
addr == address ? prot : ref_prot));
|
||||
}
|
||||
return base;
|
||||
}
|
||||
|
||||
static void cache_flush_page(struct page *p)
|
||||
{
|
||||
unsigned long adr = (unsigned long)page_address(p);
|
||||
int i;
|
||||
for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
|
||||
asm volatile("clflush (%0)" :: "r" (adr + i));
|
||||
}
|
||||
|
||||
static void flush_kernel_map(void *arg)
|
||||
{
|
||||
struct list_head *lh = (struct list_head *)arg;
|
||||
struct page *p;
|
||||
|
||||
/* High level code is not ready for clflush yet */
|
||||
if (0 && cpu_has_clflush) {
|
||||
list_for_each_entry (p, lh, lru)
|
||||
cache_flush_page(p);
|
||||
} else if (boot_cpu_data.x86_model >= 4)
|
||||
wbinvd();
|
||||
|
||||
/* Flush all to work around Errata in early athlons regarding
|
||||
* large page flushing.
|
||||
*/
|
||||
__flush_tlb_all();
|
||||
}
|
||||
|
||||
static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned long flags;
|
||||
|
||||
set_pte_atomic(kpte, pte); /* change init_mm */
|
||||
if (SHARED_KERNEL_PMD)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&pgd_lock, flags);
|
||||
for (page = pgd_list; page; page = (struct page *)page->index) {
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pgd = (pgd_t *)page_address(page) + pgd_index(address);
|
||||
pud = pud_offset(pgd, address);
|
||||
pmd = pmd_offset(pud, address);
|
||||
set_pte_atomic((pte_t *)pmd, pte);
|
||||
}
|
||||
spin_unlock_irqrestore(&pgd_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* No more special protections in this 2/4MB area - revert to a
|
||||
* large page again.
|
||||
*/
|
||||
static inline void revert_page(struct page *kpte_page, unsigned long address)
|
||||
{
|
||||
pgprot_t ref_prot;
|
||||
pte_t *linear;
|
||||
|
||||
ref_prot =
|
||||
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
|
||||
? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
|
||||
|
||||
linear = (pte_t *)
|
||||
pmd_offset(pud_offset(pgd_offset_k(address), address), address);
|
||||
set_pmd_pte(linear, address,
|
||||
pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
|
||||
ref_prot));
|
||||
}
|
||||
|
||||
static inline void save_page(struct page *kpte_page)
|
||||
{
|
||||
if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
|
||||
list_add(&kpte_page->lru, &df_list);
|
||||
}
|
||||
|
||||
static int
|
||||
__change_page_attr(struct page *page, pgprot_t prot)
|
||||
{
|
||||
pte_t *kpte;
|
||||
unsigned long address;
|
||||
struct page *kpte_page;
|
||||
|
||||
BUG_ON(PageHighMem(page));
|
||||
address = (unsigned long)page_address(page);
|
||||
|
||||
kpte = lookup_address(address);
|
||||
if (!kpte)
|
||||
return -EINVAL;
|
||||
kpte_page = virt_to_page(kpte);
|
||||
BUG_ON(PageLRU(kpte_page));
|
||||
BUG_ON(PageCompound(kpte_page));
|
||||
|
||||
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
|
||||
if (!pte_huge(*kpte)) {
|
||||
set_pte_atomic(kpte, mk_pte(page, prot));
|
||||
} else {
|
||||
pgprot_t ref_prot;
|
||||
struct page *split;
|
||||
|
||||
ref_prot =
|
||||
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
|
||||
? PAGE_KERNEL_EXEC : PAGE_KERNEL;
|
||||
split = split_large_page(address, prot, ref_prot);
|
||||
if (!split)
|
||||
return -ENOMEM;
|
||||
set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
|
||||
kpte_page = split;
|
||||
}
|
||||
page_private(kpte_page)++;
|
||||
} else if (!pte_huge(*kpte)) {
|
||||
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
|
||||
BUG_ON(page_private(kpte_page) == 0);
|
||||
page_private(kpte_page)--;
|
||||
} else
|
||||
BUG();
|
||||
|
||||
/*
|
||||
* If the pte was reserved, it means it was created at boot
|
||||
* time (not via split_large_page) and in turn we must not
|
||||
* replace it with a largepage.
|
||||
*/
|
||||
|
||||
save_page(kpte_page);
|
||||
if (!PageReserved(kpte_page)) {
|
||||
if (cpu_has_pse && (page_private(kpte_page) == 0)) {
|
||||
paravirt_release_pt(page_to_pfn(kpte_page));
|
||||
revert_page(kpte_page, address);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void flush_map(struct list_head *l)
|
||||
{
|
||||
on_each_cpu(flush_kernel_map, l, 1, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Change the page attributes of a page in the linear mapping.
|
||||
*
|
||||
* This should be used when a page is mapped with a different caching policy
|
||||
* than write-back somewhere - some CPUs do not like it when mappings with
|
||||
* different caching policies exist. This changes the page attributes of the
|
||||
* in kernel linear mapping too.
|
||||
*
|
||||
* The caller needs to ensure that there are no conflicting mappings elsewhere.
|
||||
* This function only deals with the kernel linear map.
|
||||
*
|
||||
* Caller must call global_flush_tlb() after this.
|
||||
*/
|
||||
int change_page_attr(struct page *page, int numpages, pgprot_t prot)
|
||||
{
|
||||
int err = 0;
|
||||
int i;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&cpa_lock, flags);
|
||||
for (i = 0; i < numpages; i++, page++) {
|
||||
err = __change_page_attr(page, prot);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
spin_unlock_irqrestore(&cpa_lock, flags);
|
||||
return err;
|
||||
}
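The comment above spells out the contract: batch attribute changes through change_page_attr(), then issue a single global_flush_tlb(). A hedged in-kernel sketch of that pattern (the page argument and caller are placeholders; mark_rodata_ro() in init_32.c follows the same shape):

#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>

/* Temporarily map 'nr' linearly-mapped pages uncached, then restore them. */
static int demo_uncache_pages(struct page *pg, int nr)
{
        int err = change_page_attr(pg, nr, PAGE_KERNEL_NOCACHE);

        if (err)
                return err;
        global_flush_tlb();                     /* required after change_page_attr() */

        /* ... use the uncached mapping here ... */

        change_page_attr(pg, nr, PAGE_KERNEL);  /* revert to the default attribute */
        global_flush_tlb();
        return 0;
}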
|
||||
|
||||
void global_flush_tlb(void)
|
||||
{
|
||||
struct list_head l;
|
||||
struct page *pg, *next;
|
||||
|
||||
BUG_ON(irqs_disabled());
|
||||
|
||||
spin_lock_irq(&cpa_lock);
|
||||
list_replace_init(&df_list, &l);
|
||||
spin_unlock_irq(&cpa_lock);
|
||||
flush_map(&l);
|
||||
list_for_each_entry_safe(pg, next, &l, lru) {
|
||||
list_del(&pg->lru);
|
||||
clear_bit(PG_arch_1, &pg->flags);
|
||||
if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
|
||||
continue;
|
||||
ClearPagePrivate(pg);
|
||||
__free_page(pg);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_PAGEALLOC
|
||||
void kernel_map_pages(struct page *page, int numpages, int enable)
|
||||
{
|
||||
if (PageHighMem(page))
|
||||
return;
|
||||
if (!enable)
|
||||
debug_check_no_locks_freed(page_address(page),
|
||||
numpages * PAGE_SIZE);
|
||||
|
||||
/* the return value is ignored - the calls cannot fail,
|
||||
* large pages are disabled at boot time.
|
||||
*/
|
||||
change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
|
||||
/* we should perform an IPI and flush all tlbs,
|
||||
* but that can deadlock->flush only current cpu.
|
||||
*/
|
||||
__flush_tlb_all();
|
||||
}
|
||||
#endif
|
||||
|
||||
EXPORT_SYMBOL(change_page_attr);
|
||||
EXPORT_SYMBOL(global_flush_tlb);
|
373
arch/x86/mm/pgtable_32.c
Normal file
@@ -0,0 +1,373 @@
|
||||
/*
|
||||
* linux/arch/i386/mm/pgtable.c
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/quicklist.h>
|
||||
|
||||
#include <asm/system.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/fixmap.h>
|
||||
#include <asm/e820.h>
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
void show_mem(void)
|
||||
{
|
||||
int total = 0, reserved = 0;
|
||||
int shared = 0, cached = 0;
|
||||
int highmem = 0;
|
||||
struct page *page;
|
||||
pg_data_t *pgdat;
|
||||
unsigned long i;
|
||||
unsigned long flags;
|
||||
|
||||
printk(KERN_INFO "Mem-info:\n");
|
||||
show_free_areas();
|
||||
printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
|
||||
for_each_online_pgdat(pgdat) {
|
||||
pgdat_resize_lock(pgdat, &flags);
|
||||
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
|
||||
page = pgdat_page_nr(pgdat, i);
|
||||
total++;
|
||||
if (PageHighMem(page))
|
||||
highmem++;
|
||||
if (PageReserved(page))
|
||||
reserved++;
|
||||
else if (PageSwapCache(page))
|
||||
cached++;
|
||||
else if (page_count(page))
|
||||
shared += page_count(page) - 1;
|
||||
}
|
||||
pgdat_resize_unlock(pgdat, &flags);
|
||||
}
|
||||
printk(KERN_INFO "%d pages of RAM\n", total);
|
||||
printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
|
||||
printk(KERN_INFO "%d reserved pages\n", reserved);
|
||||
printk(KERN_INFO "%d pages shared\n", shared);
|
||||
printk(KERN_INFO "%d pages swap cached\n", cached);
|
||||
|
||||
printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
|
||||
printk(KERN_INFO "%lu pages writeback\n",
|
||||
global_page_state(NR_WRITEBACK));
|
||||
printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
|
||||
printk(KERN_INFO "%lu pages slab\n",
|
||||
global_page_state(NR_SLAB_RECLAIMABLE) +
|
||||
global_page_state(NR_SLAB_UNRECLAIMABLE));
|
||||
printk(KERN_INFO "%lu pages pagetables\n",
|
||||
global_page_state(NR_PAGETABLE));
|
||||
}
|
||||
|
||||
/*
|
||||
* Associate a virtual page frame with a given physical page frame
|
||||
* and protection flags for that frame.
|
||||
*/
|
||||
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
|
||||
pgd = swapper_pg_dir + pgd_index(vaddr);
|
||||
if (pgd_none(*pgd)) {
|
||||
BUG();
|
||||
return;
|
||||
}
|
||||
pud = pud_offset(pgd, vaddr);
|
||||
if (pud_none(*pud)) {
|
||||
BUG();
|
||||
return;
|
||||
}
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
if (pmd_none(*pmd)) {
|
||||
BUG();
|
||||
return;
|
||||
}
|
||||
pte = pte_offset_kernel(pmd, vaddr);
|
||||
if (pgprot_val(flags))
|
||||
/* <pfn,flags> stored as-is, to permit clearing entries */
|
||||
set_pte(pte, pfn_pte(pfn, flags));
|
||||
else
|
||||
pte_clear(&init_mm, vaddr, pte);
|
||||
|
||||
/*
|
||||
* It's enough to flush this one mapping.
|
||||
* (PGE mappings get flushed as well)
|
||||
*/
|
||||
__flush_tlb_one(vaddr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Associate a large virtual page frame with a given physical page frame
|
||||
* and protection flags for that frame. pfn is for the base of the page,
|
||||
* vaddr is what the page gets mapped to - both must be properly aligned.
|
||||
* The pmd must already be instantiated. Assumes PAE mode.
|
||||
*/
|
||||
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
|
||||
printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
|
||||
return; /* BUG(); */
|
||||
}
|
||||
if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
|
||||
printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
|
||||
return; /* BUG(); */
|
||||
}
|
||||
pgd = swapper_pg_dir + pgd_index(vaddr);
|
||||
if (pgd_none(*pgd)) {
|
||||
printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
|
||||
return; /* BUG(); */
|
||||
}
|
||||
pud = pud_offset(pgd, vaddr);
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
set_pmd(pmd, pfn_pmd(pfn, flags));
|
||||
/*
|
||||
* It's enough to flush this one mapping.
|
||||
* (PGE mappings get flushed as well)
|
||||
*/
|
||||
__flush_tlb_one(vaddr);
|
||||
}
|
||||
|
||||
static int fixmaps;
|
||||
unsigned long __FIXADDR_TOP = 0xfffff000;
|
||||
EXPORT_SYMBOL(__FIXADDR_TOP);
|
||||
|
||||
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
|
||||
{
|
||||
unsigned long address = __fix_to_virt(idx);
|
||||
|
||||
if (idx >= __end_of_fixed_addresses) {
|
||||
BUG();
|
||||
return;
|
||||
}
|
||||
set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
|
||||
fixmaps++;
|
||||
}
|
||||
|
||||
/**
|
||||
* reserve_top_address - reserves a hole in the top of kernel address space
|
||||
* @reserve - size of hole to reserve
|
||||
*
|
||||
* Can be used to relocate the fixmap area and poke a hole in the top
|
||||
* of kernel address space to make room for a hypervisor.
|
||||
*/
|
||||
void reserve_top_address(unsigned long reserve)
|
||||
{
|
||||
BUG_ON(fixmaps > 0);
|
||||
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
|
||||
(int)-reserve);
|
||||
__FIXADDR_TOP = -reserve - PAGE_SIZE;
|
||||
__VMALLOC_RESERVE += reserve;
|
||||
}
|
||||
|
||||
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
|
||||
{
|
||||
return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
|
||||
}
|
||||
|
||||
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
|
||||
{
|
||||
struct page *pte;
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
|
||||
#else
|
||||
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
|
||||
#endif
|
||||
return pte;
|
||||
}
|
||||
|
||||
void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
|
||||
{
|
||||
memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* List of all pgd's needed for non-PAE so it can invalidate entries
|
||||
* in both cached and uncached pgd's; not needed for PAE since the
|
||||
* kernel pmd is shared. If PAE were not to share the pmd a similar
|
||||
* tactic would be needed. This is essentially codepath-based locking
|
||||
* against pageattr.c; it is the unique case in which a valid change
|
||||
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
|
||||
* vmalloc faults work because attached pagetables are never freed.
|
||||
* -- wli
|
||||
*/
|
||||
DEFINE_SPINLOCK(pgd_lock);
|
||||
struct page *pgd_list;
|
||||
|
||||
static inline void pgd_list_add(pgd_t *pgd)
|
||||
{
|
||||
struct page *page = virt_to_page(pgd);
|
||||
page->index = (unsigned long)pgd_list;
|
||||
if (pgd_list)
|
||||
set_page_private(pgd_list, (unsigned long)&page->index);
|
||||
pgd_list = page;
|
||||
set_page_private(page, (unsigned long)&pgd_list);
|
||||
}
|
||||
|
||||
static inline void pgd_list_del(pgd_t *pgd)
|
||||
{
|
||||
struct page *next, **pprev, *page = virt_to_page(pgd);
|
||||
next = (struct page *)page->index;
|
||||
pprev = (struct page **)page_private(page);
|
||||
*pprev = next;
|
||||
if (next)
|
||||
set_page_private(next, (unsigned long)pprev);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if (PTRS_PER_PMD == 1)
|
||||
/* Non-PAE pgd constructor */
|
||||
static void pgd_ctor(void *pgd)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
/* !PAE, no pagetable sharing */
|
||||
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
|
||||
|
||||
spin_lock_irqsave(&pgd_lock, flags);
|
||||
|
||||
/* must happen under lock */
|
||||
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
|
||||
swapper_pg_dir + USER_PTRS_PER_PGD,
|
||||
KERNEL_PGD_PTRS);
|
||||
paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
|
||||
__pa(swapper_pg_dir) >> PAGE_SHIFT,
|
||||
USER_PTRS_PER_PGD,
|
||||
KERNEL_PGD_PTRS);
|
||||
pgd_list_add(pgd);
|
||||
spin_unlock_irqrestore(&pgd_lock, flags);
|
||||
}
|
||||
#else /* PTRS_PER_PMD > 1 */
|
||||
/* PAE pgd constructor */
|
||||
static void pgd_ctor(void *pgd)
|
||||
{
|
||||
/* PAE, kernel PMD may be shared */
|
||||
|
||||
if (SHARED_KERNEL_PMD) {
|
||||
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
|
||||
swapper_pg_dir + USER_PTRS_PER_PGD,
|
||||
KERNEL_PGD_PTRS);
|
||||
} else {
|
||||
unsigned long flags;
|
||||
|
||||
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
|
||||
spin_lock_irqsave(&pgd_lock, flags);
|
||||
pgd_list_add(pgd);
|
||||
spin_unlock_irqrestore(&pgd_lock, flags);
|
||||
}
|
||||
}
|
||||
#endif /* PTRS_PER_PMD */
|
||||
|
||||
static void pgd_dtor(void *pgd)
|
||||
{
|
||||
unsigned long flags; /* can be called from interrupt context */
|
||||
|
||||
if (SHARED_KERNEL_PMD)
|
||||
return;
|
||||
|
||||
paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
|
||||
spin_lock_irqsave(&pgd_lock, flags);
|
||||
pgd_list_del(pgd);
|
||||
spin_unlock_irqrestore(&pgd_lock, flags);
|
||||
}
|
||||
|
||||
#define UNSHARED_PTRS_PER_PGD \
|
||||
(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
|
||||
|
||||
/* If we allocate a pmd for part of the kernel address space, then
|
||||
make sure it's initialized with the appropriate kernel mappings.
|
||||
Otherwise use a cached zeroed pmd. */
|
||||
static pmd_t *pmd_cache_alloc(int idx)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
|
||||
if (idx >= USER_PTRS_PER_PGD) {
|
||||
pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
|
||||
|
||||
if (pmd)
|
||||
memcpy(pmd,
|
||||
(void *)pgd_page_vaddr(swapper_pg_dir[idx]),
|
||||
sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
} else
|
||||
pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
|
||||
|
||||
return pmd;
|
||||
}
|
||||
|
||||
static void pmd_cache_free(pmd_t *pmd, int idx)
|
||||
{
|
||||
if (idx >= USER_PTRS_PER_PGD)
|
||||
free_page((unsigned long)pmd);
|
||||
else
|
||||
kmem_cache_free(pmd_cache, pmd);
|
||||
}
|
||||
|
||||
pgd_t *pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
int i;
|
||||
pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
|
||||
|
||||
if (PTRS_PER_PMD == 1 || !pgd)
|
||||
return pgd;
|
||||
|
||||
for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
|
||||
pmd_t *pmd = pmd_cache_alloc(i);
|
||||
|
||||
if (!pmd)
|
||||
goto out_oom;
|
||||
|
||||
paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
|
||||
set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
|
||||
}
|
||||
return pgd;
|
||||
|
||||
out_oom:
|
||||
for (i--; i >= 0; i--) {
|
||||
pgd_t pgdent = pgd[i];
|
||||
void* pmd = (void *)__va(pgd_val(pgdent)-1);
|
||||
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
|
||||
pmd_cache_free(pmd, i);
|
||||
}
|
||||
quicklist_free(0, pgd_dtor, pgd);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void pgd_free(pgd_t *pgd)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* in the PAE case user pgd entries are overwritten before usage */
|
||||
if (PTRS_PER_PMD > 1)
|
||||
for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
|
||||
pgd_t pgdent = pgd[i];
|
||||
void* pmd = (void *)__va(pgd_val(pgdent)-1);
|
||||
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
|
||||
pmd_cache_free(pmd, i);
|
||||
}
|
||||
/* in the non-PAE case, free_pgtables() clears user pgd entries */
|
||||
quicklist_free(0, pgd_dtor, pgd);
|
||||
}
|
||||
|
||||
void check_pgt_cache(void)
|
||||
{
|
||||
quicklist_trim(0, pgd_dtor, 25, 16);
|
||||
}