Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
2005-04-16 15:20:36 -07:00
commit 1da177e4c3
17291 changed files with 6718755 additions and 0 deletions
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for the linux memory manager.
+#
+
+mmu-y			:= nommu.o
+mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
+			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
+			   vmalloc.o
+
+obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+			   page_alloc.o page-writeback.o pdflush.o \
+			   readahead.o slab.o swap.o truncate.o vmscan.o \
+			   prio_tree.o $(mmu-y)
+
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
+obj-$(CONFIG_NUMA) 	+= mempolicy.o
+obj-$(CONFIG_SHMEM) += shmem.o
+obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -0,0 +1,400 @@
+/*
+ *  linux/mm/bootmem.c
+ *
+ *  Copyright (C) 1999 Ingo Molnar
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *
+ *  simple boot-time physical memory area allocator and
+ *  free memory collector. It's used to deal with reserved
+ *  system memory and memory holes as well.
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+#include <asm/io.h>
+#include "internal.h"
+
+/*
+ * Access to this subsystem has to be serialized externally. (this is
+ * true for the boot process anyway)
+ */
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+EXPORT_SYMBOL(max_pfn);		/* This is exported so
+				 * dma_get_required_mask(), which uses
+				 * it, can be an inline function */
+
+/* return the number of _pages_ that will be allocated for the boot bitmap */
+unsigned long __init bootmem_bootmap_pages (unsigned long pages)
+{
+	unsigned long mapsize;
+
+	mapsize = (pages+7)/8;
+	mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
+	mapsize >>= PAGE_SHIFT;
+
+	return mapsize;
+}
+
+/*
+ * Called once to set up the allocator itself.
+ */
+static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
+	unsigned long mapstart, unsigned long start, unsigned long end)
+{
+	bootmem_data_t *bdata = pgdat->bdata;
+	unsigned long mapsize = ((end - start)+7)/8;
+
+	pgdat->pgdat_next = pgdat_list;
+	pgdat_list = pgdat;
+
+	mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
+	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
+	bdata->node_boot_start = (start << PAGE_SHIFT);
+	bdata->node_low_pfn = end;
+
+	/*
+	 * Initially all pages are reserved - setup_arch() has to
+	 * register free RAM areas explicitly.
+	 */
+	memset(bdata->node_bootmem_map, 0xff, mapsize);
+
+	return mapsize;
+}
+
+/*
+ * Marks a particular physical memory range as unallocatable. Usable RAM
+ * might be used for boot-time allocations - or it might get added
+ * to the free page pool later on.
+ */
+static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+{
+	unsigned long i;
+	/*
+	 * round up, partially reserved pages are considered
+	 * fully reserved.
+	 */
+	unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
+	unsigned long eidx = (addr + size - bdata->node_boot_start + 
+							PAGE_SIZE-1)/PAGE_SIZE;
+	unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
+
+	BUG_ON(!size);
+	BUG_ON(sidx >= eidx);
+	BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn);
+	BUG_ON(end > bdata->node_low_pfn);
+
+	for (i = sidx; i < eidx; i++)
+		if (test_and_set_bit(i, bdata->node_bootmem_map)) {
+#ifdef CONFIG_DEBUG_BOOTMEM
+			printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
+#endif
+		}
+}
+
+static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+{
+	unsigned long i;
+	unsigned long start;
+	/*
+	 * round down end of usable mem, partially free pages are
+	 * considered reserved.
+	 */
+	unsigned long sidx;
+	unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
+	unsigned long end = (addr + size)/PAGE_SIZE;
+
+	BUG_ON(!size);
+	BUG_ON(end > bdata->node_low_pfn);
+
+	if (addr < bdata->last_success)
+		bdata->last_success = addr;
+
+	/*
+	 * Round up the beginning of the address.
+	 */
+	start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
+	sidx = start - (bdata->node_boot_start/PAGE_SIZE);
+
+	for (i = sidx; i < eidx; i++) {
+		if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
+			BUG();
+	}
+}
+
+/*
+ * We 'merge' subsequent allocations to save space. We might 'lose'
+ * some fraction of a page if allocations cannot be satisfied due to
+ * size constraints on boxes where there is physical RAM space
+ * fragmentation - in these cases (mostly large memory boxes) this
+ * is not a problem.
+ *
+ * On low memory boxes we get it right in 100% of the cases.
+ *
+ * alignment has to be a power of 2 value.
+ *
+ * NOTE:  This function is _not_ reentrant.
+ */
+static void * __init
+__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
+		unsigned long align, unsigned long goal)
+{
+	unsigned long offset, remaining_size, areasize, preferred;
+	unsigned long i, start = 0, incr, eidx;
+	void *ret;
+
+	if(!size) {
+		printk("__alloc_bootmem_core(): zero-sized request\n");
+		BUG();
+	}
+	BUG_ON(align & (align-1));
+
+	eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	offset = 0;
+	if (align &&
+	    (bdata->node_boot_start & (align - 1UL)) != 0)
+		offset = (align - (bdata->node_boot_start & (align - 1UL)));
+	offset >>= PAGE_SHIFT;
+
+	/*
+	 * We try to allocate bootmem pages above 'goal'
+	 * first, then we try to allocate lower pages.
+	 */
+	if (goal && (goal >= bdata->node_boot_start) && 
+	    ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
+		preferred = goal - bdata->node_boot_start;
+
+		if (bdata->last_success >= preferred)
+			preferred = bdata->last_success;
+	} else
+		preferred = 0;
+
+	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
+	preferred += offset;
+	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
+	incr = align >> PAGE_SHIFT ? : 1;
+
+restart_scan:
+	for (i = preferred; i < eidx; i += incr) {
+		unsigned long j;
+		i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
+		i = ALIGN(i, incr);
+		if (test_bit(i, bdata->node_bootmem_map))
+			continue;
+		for (j = i + 1; j < i + areasize; ++j) {
+			if (j >= eidx)
+				goto fail_block;
+			if (test_bit (j, bdata->node_bootmem_map))
+				goto fail_block;
+		}
+		start = i;
+		goto found;
+	fail_block:
+		i = ALIGN(j, incr);
+	}
+
+	if (preferred > offset) {
+		preferred = offset;
+		goto restart_scan;
+	}
+	return NULL;
+
+found:
+	bdata->last_success = start << PAGE_SHIFT;
+	BUG_ON(start >= eidx);
+
+	/*
+	 * Is the next page of the previous allocation-end the start
+	 * of this allocation's buffer? If yes then we can 'merge'
+	 * the previous partial page with this allocation.
+	 */
+	if (align < PAGE_SIZE &&
+	    bdata->last_offset && bdata->last_pos+1 == start) {
+		offset = (bdata->last_offset+align-1) & ~(align-1);
+		BUG_ON(offset > PAGE_SIZE);
+		remaining_size = PAGE_SIZE-offset;
+		if (size < remaining_size) {
+			areasize = 0;
+			/* last_pos unchanged */
+			bdata->last_offset = offset+size;
+			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
+						bdata->node_boot_start);
+		} else {
+			remaining_size = size - remaining_size;
+			areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
+			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
+						bdata->node_boot_start);
+			bdata->last_pos = start+areasize-1;
+			bdata->last_offset = remaining_size;
+		}
+		bdata->last_offset &= ~PAGE_MASK;
+	} else {
+		bdata->last_pos = start + areasize - 1;
+		bdata->last_offset = size & ~PAGE_MASK;
+		ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
+	}
+
+	/*
+	 * Reserve the area now:
+	 */
+	for (i = start; i < start+areasize; i++)
+		if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
+			BUG();
+	memset(ret, 0, size);
+	return ret;
+}
+
+static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
+{
+	struct page *page;
+	bootmem_data_t *bdata = pgdat->bdata;
+	unsigned long i, count, total = 0;
+	unsigned long idx;
+	unsigned long *map; 
+	int gofast = 0;
+
+	BUG_ON(!bdata->node_bootmem_map);
+
+	count = 0;
+	/* first extant page of the node */
+	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	map = bdata->node_bootmem_map;
+	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
+	if (bdata->node_boot_start == 0 ||
+	    ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
+		gofast = 1;
+	for (i = 0; i < idx; ) {
+		unsigned long v = ~map[i / BITS_PER_LONG];
+		if (gofast && v == ~0UL) {
+			int j, order;
+
+			count += BITS_PER_LONG;
+			__ClearPageReserved(page);
+			order = ffs(BITS_PER_LONG) - 1;
+			set_page_refs(page, order);
+			for (j = 1; j < BITS_PER_LONG; j++) {
+				if (j + 16 < BITS_PER_LONG)
+					prefetchw(page + j + 16);
+				__ClearPageReserved(page + j);
+			}
+			__free_pages(page, order);
+			i += BITS_PER_LONG;
+			page += BITS_PER_LONG;
+		} else if (v) {
+			unsigned long m;
+			for (m = 1; m && i < idx; m<<=1, page++, i++) {
+				if (v & m) {
+					count++;
+					__ClearPageReserved(page);
+					set_page_refs(page, 0);
+					__free_page(page);
+				}
+			}
+		} else {
+			i+=BITS_PER_LONG;
+			page += BITS_PER_LONG;
+		}
+	}
+	total += count;
+
+	/*
+	 * Now free the allocator bitmap itself, it's not
+	 * needed anymore:
+	 */
+	page = virt_to_page(bdata->node_bootmem_map);
+	count = 0;
+	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
+		count++;
+		__ClearPageReserved(page);
+		set_page_count(page, 1);
+		__free_page(page);
+	}
+	total += count;
+	bdata->node_bootmem_map = NULL;
+
+	return total;
+}
+
+unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
+{
+	return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
+}
+
+void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+{
+	reserve_bootmem_core(pgdat->bdata, physaddr, size);
+}
+
+void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+{
+	free_bootmem_core(pgdat->bdata, physaddr, size);
+}
+
+unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
+{
+	return(free_all_bootmem_core(pgdat));
+}
+
+unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
+{
+	max_low_pfn = pages;
+	min_low_pfn = start;
+	return(init_bootmem_core(NODE_DATA(0), start, 0, pages));
+}
+
+#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+void __init reserve_bootmem (unsigned long addr, unsigned long size)
+{
+	reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+}
+#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
+
+void __init free_bootmem (unsigned long addr, unsigned long size)
+{
+	free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+}
+
+unsigned long __init free_all_bootmem (void)
+{
+	return(free_all_bootmem_core(NODE_DATA(0)));
+}
+
+void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
+{
+	pg_data_t *pgdat = pgdat_list;
+	void *ptr;
+
+	for_each_pgdat(pgdat)
+		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+						align, goal)))
+			return(ptr);
+
+	/*
+	 * Whoops, we cannot satisfy the allocation request.
+	 */
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
+}
+
+void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
+{
+	void *ptr;
+
+	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+	if (ptr)
+		return (ptr);
+
+	return __alloc_bootmem(size, align, goal);
+}
+
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -0,0 +1,111 @@
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003	akpm@digeo.com
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/fadvise.h>
+#include <linux/syscalls.h>
+
+#include <asm/unistd.h>
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ */
+asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+{
+	struct file *file = fget(fd);
+	struct address_space *mapping;
+	struct backing_dev_info *bdi;
+	loff_t endbyte;
+	pgoff_t start_index;
+	pgoff_t end_index;
+	unsigned long nrpages;
+	int ret = 0;
+
+	if (!file)
+		return -EBADF;
+
+	mapping = file->f_mapping;
+	if (!mapping || len < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Careful about overflows. Len == 0 means "as much as possible" */
+	endbyte = offset + len;
+	if (!len || endbyte < len)
+		endbyte = -1;
+
+	bdi = mapping->backing_dev_info;
+
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+		file->f_ra.ra_pages = bdi->ra_pages;
+		break;
+	case POSIX_FADV_RANDOM:
+		file->f_ra.ra_pages = 0;
+		break;
+	case POSIX_FADV_SEQUENTIAL:
+		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		break;
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_NOREUSE:
+		if (!mapping->a_ops->readpage) {
+			ret = -EINVAL;
+			break;
+		}
+
+		/* First and last PARTIAL page! */
+		start_index = offset >> PAGE_CACHE_SHIFT;
+		end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+
+		/* Careful about overflow on the "+1" */
+		nrpages = end_index - start_index + 1;
+		if (!nrpages)
+			nrpages = ~0UL;
+		
+		ret = force_page_cache_readahead(mapping, file,
+				start_index,
+				max_sane_readahead(nrpages));
+		if (ret > 0)
+			ret = 0;
+		break;
+	case POSIX_FADV_DONTNEED:
+		if (!bdi_write_congested(mapping->backing_dev_info))
+			filemap_flush(mapping);
+
+		/* First and last FULL page! */
+		start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+		end_index = (endbyte >> PAGE_CACHE_SHIFT);
+
+		if (end_index > start_index)
+			invalidate_mapping_pages(mapping, start_index, end_index-1);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+out:
+	fput(file);
+	return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+{
+	return sys_fadvise64_64(fd, offset, len, advice);
+}
+
+#endif
--- a/mm/filemap.c
+++ b/mm/filemap.c
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -0,0 +1,256 @@
+/*
+ *   linux/mm/fremap.c
+ * 
+ * Explicit pagetable population and nonlinear (random) mappings support.
+ *
+ * started by Ingo Molnar, Copyright (C) 2002, 2003
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/rmap.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+
+	if (pte_none(pte))
+		return;
+	if (pte_present(pte)) {
+		unsigned long pfn = pte_pfn(pte);
+
+		flush_cache_page(vma, addr, pfn);
+		pte = ptep_clear_flush(vma, addr, ptep);
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+			if (!PageReserved(page)) {
+				if (pte_dirty(pte))
+					set_page_dirty(page);
+				page_remove_rmap(page);
+				page_cache_release(page);
+				dec_mm_counter(mm, rss);
+			}
+		}
+	} else {
+		if (!pte_file(pte))
+			free_swap_and_cache(pte_to_swp_entry(pte));
+		pte_clear(mm, addr, ptep);
+	}
+}
+
+/*
+ * Install a file page to a given virtual memory address, release any
+ * previously existing mapping.
+ */
+int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long addr, struct page *page, pgprot_t prot)
+{
+	struct inode *inode;
+	pgoff_t size;
+	int err = -ENOMEM;
+	pte_t *pte;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+	pte_t pte_val;
+
+	pgd = pgd_offset(mm, addr);
+	spin_lock(&mm->page_table_lock);
+	
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		goto err_unlock;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		goto err_unlock;
+
+	pte = pte_alloc_map(mm, pmd, addr);
+	if (!pte)
+		goto err_unlock;
+
+	/*
+	 * This page may have been truncated. Tell the
+	 * caller about it.
+	 */
+	err = -EINVAL;
+	inode = vma->vm_file->f_mapping->host;
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (!page->mapping || page->index >= size)
+		goto err_unlock;
+
+	zap_pte(mm, vma, addr, pte);
+
+	inc_mm_counter(mm,rss);
+	flush_icache_page(vma, page);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	page_add_file_rmap(page);
+	pte_val = *pte;
+	pte_unmap(pte);
+	update_mmu_cache(vma, addr, pte_val);
+
+	err = 0;
+err_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return err;
+}
+EXPORT_SYMBOL(install_page);
+
+
+/*
+ * Install a file pte to a given virtual memory address, release any
+ * previously existing mapping.
+ */
+int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long pgoff, pgprot_t prot)
+{
+	int err = -ENOMEM;
+	pte_t *pte;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+	pte_t pte_val;
+
+	pgd = pgd_offset(mm, addr);
+	spin_lock(&mm->page_table_lock);
+	
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		goto err_unlock;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		goto err_unlock;
+
+	pte = pte_alloc_map(mm, pmd, addr);
+	if (!pte)
+		goto err_unlock;
+
+	zap_pte(mm, vma, addr, pte);
+
+	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+	pte_val = *pte;
+	pte_unmap(pte);
+	update_mmu_cache(vma, addr, pte_val);
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+
+err_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return err;
+}
+
+
+/***
+ * sys_remap_file_pages - remap arbitrary pages of a shared backing store
+ *                        file within an existing vma.
+ * @start: start of the remapped virtual memory range
+ * @size: size of the remapped virtual memory range
+ * @prot: new protection bits of the range
+ * @pgoff: to be mapped page of the backing store file
+ * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
+ *
+ * this syscall works purely via pagetables, so it's the most efficient
+ * way to map the same (large) file into a given virtual window. Unlike
+ * mmap()/mremap() it does not create any new vmas. The new mappings are
+ * also safe across swapout.
+ *
+ * NOTE: the 'prot' parameter right now is ignored, and the vma's default
+ * protection is used. Arbitrary protections might be implemented in the
+ * future.
+ */
+asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
+	unsigned long __prot, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct address_space *mapping;
+	unsigned long end = start + size;
+	struct vm_area_struct *vma;
+	int err = -EINVAL;
+	int has_write_lock = 0;
+
+	if (__prot)
+		return err;
+	/*
+	 * Sanitize the syscall parameters:
+	 */
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	if (start + size <= start)
+		return err;
+
+	/* Can we represent this offset inside this architecture's pte's? */
+#if PTE_FILE_MAX_BITS < BITS_PER_LONG
+	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
+		return err;
+#endif
+
+	/* We need down_write() to change vma->vm_flags. */
+	down_read(&mm->mmap_sem);
+ retry:
+	vma = find_vma(mm, start);
+
+	/*
+	 * Make sure the vma is shared, that it supports prefaulting,
+	 * and that the remapped range is valid and fully within
+	 * the single existing vma.  vm_private_data is used as a
+	 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
+	 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
+	 */
+	if (vma && (vma->vm_flags & VM_SHARED) &&
+		(!vma->vm_private_data ||
+			(vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
+		vma->vm_ops && vma->vm_ops->populate &&
+			end > start && start >= vma->vm_start &&
+				end <= vma->vm_end) {
+
+		/* Must set VM_NONLINEAR before any pages are populated. */
+		if (pgoff != linear_page_index(vma, start) &&
+		    !(vma->vm_flags & VM_NONLINEAR)) {
+			if (!has_write_lock) {
+				up_read(&mm->mmap_sem);
+				down_write(&mm->mmap_sem);
+				has_write_lock = 1;
+				goto retry;
+			}
+			mapping = vma->vm_file->f_mapping;
+			spin_lock(&mapping->i_mmap_lock);
+			flush_dcache_mmap_lock(mapping);
+			vma->vm_flags |= VM_NONLINEAR;
+			vma_prio_tree_remove(vma, &mapping->i_mmap);
+			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+			flush_dcache_mmap_unlock(mapping);
+			spin_unlock(&mapping->i_mmap_lock);
+		}
+
+		err = vma->vm_ops->populate(vma, start, size,
+					    vma->vm_page_prot,
+					    pgoff, flags & MAP_NONBLOCK);
+
+		/*
+		 * We can't clear VM_NONLINEAR because we'd have to do
+		 * it after ->populate completes, and that would prevent
+		 * downgrading the lock.  (Locks can't be upgraded).
+		 */
+	}
+	if (likely(!has_write_lock))
+		up_read(&mm->mmap_sem);
+	else
+		up_write(&mm->mmap_sem);
+
+	return err;
+}
+
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -0,0 +1,607 @@
+/*
+ * High memory handling common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * 64-bit physical space. With current x86 CPUs this
+ * means up to 64 Gigabytes physical RAM.
+ *
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+
+static mempool_t *page_pool, *isa_page_pool;
+
+static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data)
+{
+	unsigned int gfp = gfp_mask | (unsigned int) (long) data;
+
+	return alloc_page(gfp);
+}
+
+static void page_pool_free(void *page, void *data)
+{
+	__free_page(page);
+}
+
+/*
+ * Virtual_count is not a pure "count".
+ *  0 means that it is not mapped, and has not been mapped
+ *    since a TLB flush - it is usable.
+ *  1 means that there are no users, but it has been mapped
+ *    since the last TLB flush - so we can't use it.
+ *  n means that there are (n-1) current users of it.
+ */
+#ifdef CONFIG_HIGHMEM
+static int pkmap_count[LAST_PKMAP];
+static unsigned int last_pkmap_nr;
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+
+pte_t * pkmap_page_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+static void flush_all_zero_pkmaps(void)
+{
+	int i;
+
+	flush_cache_kmaps();
+
+	for (i = 0; i < LAST_PKMAP; i++) {
+		struct page *page;
+
+		/*
+		 * zero means we don't have anything to do,
+		 * >1 means that it is still in use. Only
+		 * a count of 1 means that it is free but
+		 * needs to be unmapped
+		 */
+		if (pkmap_count[i] != 1)
+			continue;
+		pkmap_count[i] = 0;
+
+		/* sanity check */
+		if (pte_none(pkmap_page_table[i]))
+			BUG();
+
+		/*
+		 * Don't need an atomic fetch-and-clear op here;
+		 * no-one has the page mapped, and cannot get at
+		 * its virtual address (and hence PTE) without first
+		 * getting the kmap_lock (which is held here).
+		 * So no dangers, even with speculative execution.
+		 */
+		page = pte_page(pkmap_page_table[i]);
+		pte_clear(&init_mm, (unsigned long)page_address(page),
+			  &pkmap_page_table[i]);
+
+		set_page_address(page, NULL);
+	}
+	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+}
+
+static inline unsigned long map_new_virtual(struct page *page)
+{
+	unsigned long vaddr;
+	int count;
+
+start:
+	count = LAST_PKMAP;
+	/* Find an empty entry */
+	for (;;) {
+		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+		if (!last_pkmap_nr) {
+			flush_all_zero_pkmaps();
+			count = LAST_PKMAP;
+		}
+		if (!pkmap_count[last_pkmap_nr])
+			break;	/* Found a usable entry */
+		if (--count)
+			continue;
+
+		/*
+		 * Sleep for somebody else to unmap their entries
+		 */
+		{
+			DECLARE_WAITQUEUE(wait, current);
+
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			add_wait_queue(&pkmap_map_wait, &wait);
+			spin_unlock(&kmap_lock);
+			schedule();
+			remove_wait_queue(&pkmap_map_wait, &wait);
+			spin_lock(&kmap_lock);
+
+			/* Somebody else might have mapped it while we slept */
+			if (page_address(page))
+				return (unsigned long)page_address(page);
+
+			/* Re-start */
+			goto start;
+		}
+	}
+	vaddr = PKMAP_ADDR(last_pkmap_nr);
+	set_pte_at(&init_mm, vaddr,
+		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+
+	pkmap_count[last_pkmap_nr] = 1;
+	set_page_address(page, (void *)vaddr);
+
+	return vaddr;
+}
+
+void fastcall *kmap_high(struct page *page)
+{
+	unsigned long vaddr;
+
+	/*
+	 * For highmem pages, we can't trust "virtual" until
+	 * after we have the lock.
+	 *
+	 * We cannot call this from interrupts, as it may block
+	 */
+	spin_lock(&kmap_lock);
+	vaddr = (unsigned long)page_address(page);
+	if (!vaddr)
+		vaddr = map_new_virtual(page);
+	pkmap_count[PKMAP_NR(vaddr)]++;
+	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
+		BUG();
+	spin_unlock(&kmap_lock);
+	return (void*) vaddr;
+}
+
+EXPORT_SYMBOL(kmap_high);
+
+void fastcall kunmap_high(struct page *page)
+{
+	unsigned long vaddr;
+	unsigned long nr;
+	int need_wakeup;
+
+	spin_lock(&kmap_lock);
+	vaddr = (unsigned long)page_address(page);
+	if (!vaddr)
+		BUG();
+	nr = PKMAP_NR(vaddr);
+
+	/*
+	 * A count must never go down to zero
+	 * without a TLB flush!
+	 */
+	need_wakeup = 0;
+	switch (--pkmap_count[nr]) {
+	case 0:
+		BUG();
+	case 1:
+		/*
+		 * Avoid an unnecessary wake_up() function call.
+		 * The common case is pkmap_count[] == 1, but
+		 * no waiters.
+		 * The tasks queued in the wait-queue are guarded
+		 * by both the lock in the wait-queue-head and by
+		 * the kmap_lock.  As the kmap_lock is held here,
+		 * no need for the wait-queue-head's lock.  Simply
+		 * test if the queue is empty.
+		 */
+		need_wakeup = waitqueue_active(&pkmap_map_wait);
+	}
+	spin_unlock(&kmap_lock);
+
+	/* do wake-up, if needed, race-free outside of the spin lock */
+	if (need_wakeup)
+		wake_up(&pkmap_map_wait);
+}
+
+EXPORT_SYMBOL(kunmap_high);
+
+#define POOL_SIZE	64
+
+static __init int init_emergency_pool(void)
+{
+	struct sysinfo i;
+	si_meminfo(&i);
+	si_swapinfo(&i);
+        
+	if (!i.totalhigh)
+		return 0;
+
+	page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
+	if (!page_pool)
+		BUG();
+	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+
+	return 0;
+}
+
+__initcall(init_emergency_pool);
+
+/*
+ * highmem version, map in to vec
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+	unsigned long flags;
+	unsigned char *vto;
+
+	local_irq_save(flags);
+	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
+	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+	kunmap_atomic(vto, KM_BOUNCE_READ);
+	local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+#define bounce_copy_vec(to, vfrom)	\
+	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif
+
+#define ISA_POOL_SIZE	16
+
+/*
+ * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
+ * as the max address, so check if the pool has already been created.
+ */
+int init_emergency_isa_pool(void)
+{
+	if (isa_page_pool)
+		return 0;
+
+	isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
+	if (!isa_page_pool)
+		BUG();
+
+	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+	return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+	unsigned char *vfrom;
+	struct bio_vec *tovec, *fromvec;
+	int i;
+
+	__bio_for_each_segment(tovec, to, i, 0) {
+		fromvec = from->bi_io_vec + i;
+
+		/*
+		 * not bounced
+		 */
+		if (tovec->bv_page == fromvec->bv_page)
+			continue;
+
+		/*
+		 * fromvec->bv_offset and fromvec->bv_len might have been
+		 * modified by the block layer, so use the original copy,
+		 * bounce_copy_vec already uses tovec->bv_len
+		 */
+		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+		flush_dcache_page(tovec->bv_page);
+		bounce_copy_vec(tovec, vfrom);
+	}
+}
+
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *bio_orig = bio->bi_private;
+	struct bio_vec *bvec, *org_vec;
+	int i;
+
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
+
+	/*
+	 * free up bounce indirect pages used
+	 */
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		org_vec = bio_orig->bi_io_vec + i;
+		if (bvec->bv_page == org_vec->bv_page)
+			continue;
+
+		mempool_free(bvec->bv_page, pool);	
+	}
+
+	bio_endio(bio_orig, bio_orig->bi_size, err);
+	bio_put(bio);
+}
+
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	bounce_end_io(bio, page_pool, err);
+	return 0;
+}
+
+static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	bounce_end_io(bio, isa_page_pool, err);
+	return 0;
+}
+
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *bio_orig = bio->bi_private;
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		copy_to_high_bio_irq(bio_orig, bio);
+
+	bounce_end_io(bio, pool, err);
+}
+
+static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	__bounce_end_io_read(bio, page_pool, err);
+	return 0;
+}
+
+static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	__bounce_end_io_read(bio, isa_page_pool, err);
+	return 0;
+}
+
+static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
+			mempool_t *pool)
+{
+	struct page *page;
+	struct bio *bio = NULL;
+	int i, rw = bio_data_dir(*bio_orig);
+	struct bio_vec *to, *from;
+
+	bio_for_each_segment(from, *bio_orig, i) {
+		page = from->bv_page;
+
+		/*
+		 * is destination page below bounce pfn?
+		 */
+		if (page_to_pfn(page) < q->bounce_pfn)
+			continue;
+
+		/*
+		 * irk, bounce it
+		 */
+		if (!bio)
+			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
+
+		to = bio->bi_io_vec + i;
+
+		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+		to->bv_len = from->bv_len;
+		to->bv_offset = from->bv_offset;
+
+		if (rw == WRITE) {
+			char *vto, *vfrom;
+
+			flush_dcache_page(from->bv_page);
+			vto = page_address(to->bv_page) + to->bv_offset;
+			vfrom = kmap(from->bv_page) + from->bv_offset;
+			memcpy(vto, vfrom, to->bv_len);
+			kunmap(from->bv_page);
+		}
+	}
+
+	/*
+	 * no pages bounced
+	 */
+	if (!bio)
+		return;
+
+	/*
+	 * at least one page was bounced, fill in possible non-highmem
+	 * pages
+	 */
+	__bio_for_each_segment(from, *bio_orig, i, 0) {
+		to = bio_iovec_idx(bio, i);
+		if (!to->bv_page) {
+			to->bv_page = from->bv_page;
+			to->bv_len = from->bv_len;
+			to->bv_offset = from->bv_offset;
+		}
+	}
+
+	bio->bi_bdev = (*bio_orig)->bi_bdev;
+	bio->bi_flags |= (1 << BIO_BOUNCED);
+	bio->bi_sector = (*bio_orig)->bi_sector;
+	bio->bi_rw = (*bio_orig)->bi_rw;
+
+	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+	bio->bi_idx = (*bio_orig)->bi_idx;
+	bio->bi_size = (*bio_orig)->bi_size;
+
+	if (pool == page_pool) {
+		bio->bi_end_io = bounce_end_io_write;
+		if (rw == READ)
+			bio->bi_end_io = bounce_end_io_read;
+	} else {
+		bio->bi_end_io = bounce_end_io_write_isa;
+		if (rw == READ)
+			bio->bi_end_io = bounce_end_io_read_isa;
+	}
+
+	bio->bi_private = *bio_orig;
+	*bio_orig = bio;
+}
+
+void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
+{
+	mempool_t *pool;
+
+	/*
+	 * for non-isa bounce case, just check if the bounce pfn is equal
+	 * to or bigger than the highest pfn in the system -- in that case,
+	 * don't waste time iterating over bio segments
+	 */
+	if (!(q->bounce_gfp & GFP_DMA)) {
+		if (q->bounce_pfn >= blk_max_pfn)
+			return;
+		pool = page_pool;
+	} else {
+		BUG_ON(!isa_page_pool);
+		pool = isa_page_pool;
+	}
+
+	/*
+	 * slow path
+	 */
+	__blk_queue_bounce(q, bio_orig, pool);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
+
+#if defined(HASHED_PAGE_VIRTUAL)
+
+#define PA_HASH_ORDER	7
+
+/*
+ * Describes one page->virtual association
+ */
+struct page_address_map {
+	struct page *page;
+	void *virtual;
+	struct list_head list;
+};
+
+/*
+ * page_address_map freelist, allocated from page_address_maps.
+ */
+static struct list_head page_address_pool;	/* freelist */
+static spinlock_t pool_lock;			/* protects page_address_pool */
+
+/*
+ * Hash table bucket
+ */
+static struct page_address_slot {
+	struct list_head lh;			/* List of page_address_maps */
+	spinlock_t lock;			/* Protect this bucket's list */
+} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
+
+static struct page_address_slot *page_slot(struct page *page)
+{
+	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
+}
+
+void *page_address(struct page *page)
+{
+	unsigned long flags;
+	void *ret;
+	struct page_address_slot *pas;
+
+	if (!PageHighMem(page))
+		return lowmem_page_address(page);
+
+	pas = page_slot(page);
+	ret = NULL;
+	spin_lock_irqsave(&pas->lock, flags);
+	if (!list_empty(&pas->lh)) {
+		struct page_address_map *pam;
+
+		list_for_each_entry(pam, &pas->lh, list) {
+			if (pam->page == page) {
+				ret = pam->virtual;
+				goto done;
+			}
+		}
+	}
+done:
+	spin_unlock_irqrestore(&pas->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL(page_address);
+
+void set_page_address(struct page *page, void *virtual)
+{
+	unsigned long flags;
+	struct page_address_slot *pas;
+	struct page_address_map *pam;
+
+	BUG_ON(!PageHighMem(page));
+
+	pas = page_slot(page);
+	if (virtual) {		/* Add */
+		BUG_ON(list_empty(&page_address_pool));
+
+		spin_lock_irqsave(&pool_lock, flags);
+		pam = list_entry(page_address_pool.next,
+				struct page_address_map, list);
+		list_del(&pam->list);
+		spin_unlock_irqrestore(&pool_lock, flags);
+
+		pam->page = page;
+		pam->virtual = virtual;
+
+		spin_lock_irqsave(&pas->lock, flags);
+		list_add_tail(&pam->list, &pas->lh);
+		spin_unlock_irqrestore(&pas->lock, flags);
+	} else {		/* Remove */
+		spin_lock_irqsave(&pas->lock, flags);
+		list_for_each_entry(pam, &pas->lh, list) {
+			if (pam->page == page) {
+				list_del(&pam->list);
+				spin_unlock_irqrestore(&pas->lock, flags);
+				spin_lock_irqsave(&pool_lock, flags);
+				list_add_tail(&pam->list, &page_address_pool);
+				spin_unlock_irqrestore(&pool_lock, flags);
+				goto done;
+			}
+		}
+		spin_unlock_irqrestore(&pas->lock, flags);
+	}
+done:
+	return;
+}
+
+static struct page_address_map page_address_maps[LAST_PKMAP];
+
+void __init page_address_init(void)
+{
+	int i;
+
+	INIT_LIST_HEAD(&page_address_pool);
+	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
+		list_add(&page_address_maps[i].list, &page_address_pool);
+	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
+		INIT_LIST_HEAD(&page_address_htable[i].lh);
+		spin_lock_init(&page_address_htable[i].lock);
+	}
+	spin_lock_init(&pool_lock);
+}
+
+#endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -0,0 +1,260 @@
+/*
+ * Generic hugetlb support.
+ * (C) William Irwin, April 2004
+ */
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/nodemask.h>
+
+const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static unsigned long nr_huge_pages, free_huge_pages;
+unsigned long max_huge_pages;
+static struct list_head hugepage_freelists[MAX_NUMNODES];
+static unsigned int nr_huge_pages_node[MAX_NUMNODES];
+static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static DEFINE_SPINLOCK(hugetlb_lock);
+
+static void enqueue_huge_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	list_add(&page->lru, &hugepage_freelists[nid]);
+	free_huge_pages++;
+	free_huge_pages_node[nid]++;
+}
+
+static struct page *dequeue_huge_page(void)
+{
+	int nid = numa_node_id();
+	struct page *page = NULL;
+
+	if (list_empty(&hugepage_freelists[nid])) {
+		for (nid = 0; nid < MAX_NUMNODES; ++nid)
+			if (!list_empty(&hugepage_freelists[nid]))
+				break;
+	}
+	if (nid >= 0 && nid < MAX_NUMNODES &&
+	    !list_empty(&hugepage_freelists[nid])) {
+		page = list_entry(hugepage_freelists[nid].next,
+				  struct page, lru);
+		list_del(&page->lru);
+		free_huge_pages--;
+		free_huge_pages_node[nid]--;
+	}
+	return page;
+}
+
+static struct page *alloc_fresh_huge_page(void)
+{
+	static int nid = 0;
+	struct page *page;
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+					HUGETLB_PAGE_ORDER);
+	nid = (nid + 1) % num_online_nodes();
+	if (page) {
+		nr_huge_pages++;
+		nr_huge_pages_node[page_to_nid(page)]++;
+	}
+	return page;
+}
+
+void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+	page[1].mapping = NULL;
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
+struct page *alloc_huge_page(void)
+{
+	struct page *page;
+	int i;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page();
+	if (!page) {
+		spin_unlock(&hugetlb_lock);
+		return NULL;
+	}
+	spin_unlock(&hugetlb_lock);
+	set_page_count(page, 1);
+	page[1].mapping = (void *)free_huge_page;
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+		clear_highpage(&page[i]);
+	return page;
+}
+
+static int __init hugetlb_init(void)
+{
+	unsigned long i;
+	struct page *page;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(&hugepage_freelists[i]);
+
+	for (i = 0; i < max_huge_pages; ++i) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			break;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	max_huge_pages = free_huge_pages = nr_huge_pages = i;
+	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
+	return 0;
+}
+module_init(hugetlb_init);
+
+static int __init hugetlb_setup(char *s)
+{
+	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
+		max_huge_pages = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+#ifdef CONFIG_SYSCTL
+static void update_and_free_page(struct page *page)
+{
+	int i;
+	nr_huge_pages--;
+	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
+	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1<< PG_writeback);
+		set_page_count(&page[i], 0);
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+#ifdef CONFIG_HIGHMEM
+static void try_to_free_low(unsigned long count)
+{
+	int i, nid;
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page, *next;
+		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
+			if (PageHighMem(page))
+				continue;
+			list_del(&page->lru);
+			update_and_free_page(page);
+			nid = page_zone(page)->zone_pgdat->node_id;
+			free_huge_pages--;
+			free_huge_pages_node[nid]--;
+			if (count >= nr_huge_pages)
+				return;
+		}
+	}
+}
+#else
+static inline void try_to_free_low(unsigned long count)
+{
+}
+#endif
+
+static unsigned long set_max_huge_pages(unsigned long count)
+{
+	while (count > nr_huge_pages) {
+		struct page *page = alloc_fresh_huge_page();
+		if (!page)
+			return nr_huge_pages;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	if (count >= nr_huge_pages)
+		return nr_huge_pages;
+
+	spin_lock(&hugetlb_lock);
+	try_to_free_low(count);
+	while (count < nr_huge_pages) {
+		struct page *page = dequeue_huge_page();
+		if (!page)
+			break;
+		update_and_free_page(page);
+	}
+	spin_unlock(&hugetlb_lock);
+	return nr_huge_pages;
+}
+
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void __user *buffer,
+			   size_t *length, loff_t *ppos)
+{
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	max_huge_pages = set_max_huge_pages(max_huge_pages);
+	return 0;
+}
+#endif /* CONFIG_SYSCTL */
+
+int hugetlb_report_meminfo(char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			nr_huge_pages,
+			free_huge_pages,
+			HPAGE_SIZE/1024);
+}
+
+int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+	return sprintf(buf,
+		"Node %d HugePages_Total: %5u\n"
+		"Node %d HugePages_Free:  %5u\n",
+		nid, nr_huge_pages_node[nid],
+		nid, free_huge_pages_node[nid]);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
+}
+
+/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+EXPORT_SYMBOL(hugetlb_total_pages);
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all.  They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
+ * this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+				unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
+	spin_unlock(&mm->page_table_lock);
+}
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -0,0 +1,13 @@
+/* internal.h: mm/ internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* page_alloc.c */
+extern void set_page_refs(struct page *page, int order);
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -0,0 +1,242 @@
+/*
+ *	linux/mm/madvise.c
+ *
+ * Copyright (C) 1999  Linus Torvalds
+ * Copyright (C) 2002  Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
+			     unsigned long end, int behavior)
+{
+	struct mm_struct * mm = vma->vm_mm;
+	int error = 0;
+
+	if (start != vma->vm_start) {
+		error = split_vma(mm, vma, start, 1);
+		if (error)
+			goto out;
+	}
+
+	if (end != vma->vm_end) {
+		error = split_vma(mm, vma, end, 0);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * vm_flags is protected by the mmap_sem held in write mode.
+	 */
+	VM_ClearReadHint(vma);
+
+	switch (behavior) {
+	case MADV_SEQUENTIAL:
+		vma->vm_flags |= VM_SEQ_READ;
+		break;
+	case MADV_RANDOM:
+		vma->vm_flags |= VM_RAND_READ;
+		break;
+	default:
+		break;
+	}
+
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
+/*
+ * Schedule all required I/O operations.  Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	struct file *file = vma->vm_file;
+
+	if (!file)
+		return -EBADF;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	force_page_cache_readahead(file->f_mapping,
+			file, start, max_sane_readahead(end - start));
+	return 0;
+}
+
+/*
+ * Application no longer needs these pages.  If the pages are dirty,
+ * it's OK to just throw them away.  The app will be more careful about
+ * data it wants to keep.  Be sure to free swap resources too.  The
+ * zap_page_range call sets things up for refill_inactive to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * refill_inactive to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do.  This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them.  There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+		return -EINVAL;
+
+	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+		struct zap_details details = {
+			.nonlinear_vma = vma,
+			.last_index = ULONG_MAX,
+		};
+		zap_page_range(vma, start, end - start, &details);
+	} else
+		zap_page_range(vma, start, end - start, NULL);
+	return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+			unsigned long end, int behavior)
+{
+	long error = -EBADF;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		error = madvise_behavior(vma, start, end, behavior);
+		break;
+
+	case MADV_WILLNEED:
+		error = madvise_willneed(vma, start, end);
+		break;
+
+	case MADV_DONTNEED:
+		error = madvise_dontneed(vma, start, end);
+		break;
+
+	default:
+		error = -EINVAL;
+		break;
+	}
+		
+	return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+	size_t len;
+
+	down_write(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		goto out;
+
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
--- a/mm/memory.c
+++ b/mm/memory.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -0,0 +1,290 @@
+/*
+ *  linux/mm/mempool.c
+ *
+ *  memory buffer pool support. Such pools are mostly used
+ *  for guaranteed, deadlock-free memory allocations during
+ *  extreme VM load.
+ *
+ *  started by Ingo Molnar, Copyright (C) 2001
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+
+static void add_element(mempool_t *pool, void *element)
+{
+	BUG_ON(pool->curr_nr >= pool->min_nr);
+	pool->elements[pool->curr_nr++] = element;
+}
+
+static void *remove_element(mempool_t *pool)
+{
+	BUG_ON(pool->curr_nr <= 0);
+	return pool->elements[--pool->curr_nr];
+}
+
+static void free_pool(mempool_t *pool)
+{
+	while (pool->curr_nr) {
+		void *element = remove_element(pool);
+		pool->free(element, pool->pool_data);
+	}
+	kfree(pool->elements);
+	kfree(pool);
+}
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr:    the minimum number of elements guaranteed to be
+ *             allocated for this pool.
+ * @alloc_fn:  user-defined element-allocation function.
+ * @free_fn:   user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * this function creates and allocates a guaranteed size, preallocated
+ * memory pool. The pool can be used from the mempool_alloc and mempool_free
+ * functions. This function might sleep. Both the alloc_fn() and the free_fn()
+ * functions might sleep - as long as the mempool_alloc function is not called
+ * from IRQ contexts.
+ */
+mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+				mempool_free_t *free_fn, void *pool_data)
+{
+	mempool_t *pool;
+
+	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return NULL;
+	memset(pool, 0, sizeof(*pool));
+	pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
+	if (!pool->elements) {
+		kfree(pool);
+		return NULL;
+	}
+	spin_lock_init(&pool->lock);
+	pool->min_nr = min_nr;
+	pool->pool_data = pool_data;
+	init_waitqueue_head(&pool->wait);
+	pool->alloc = alloc_fn;
+	pool->free = free_fn;
+
+	/*
+	 * First pre-allocate the guaranteed number of buffers.
+	 */
+	while (pool->curr_nr < pool->min_nr) {
+		void *element;
+
+		element = pool->alloc(GFP_KERNEL, pool->pool_data);
+		if (unlikely(!element)) {
+			free_pool(pool);
+			return NULL;
+		}
+		add_element(pool, element);
+	}
+	return pool;
+}
+EXPORT_SYMBOL(mempool_create);
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool:       pointer to the memory pool which was allocated via
+ *              mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ *              allocated for this pool.
+ * @gfp_mask:   the usual allocation bitmask.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
+{
+	void *element;
+	void **new_elements;
+	unsigned long flags;
+
+	BUG_ON(new_min_nr <= 0);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (new_min_nr <= pool->min_nr) {
+		while (new_min_nr < pool->curr_nr) {
+			element = remove_element(pool);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			pool->free(element, pool->pool_data);
+			spin_lock_irqsave(&pool->lock, flags);
+		}
+		pool->min_nr = new_min_nr;
+		goto out_unlock;
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/* Grow the pool */
+	new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+	if (!new_elements)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (unlikely(new_min_nr <= pool->min_nr)) {
+		/* Raced, other resize will do our work */
+		spin_unlock_irqrestore(&pool->lock, flags);
+		kfree(new_elements);
+		goto out;
+	}
+	memcpy(new_elements, pool->elements,
+			pool->curr_nr * sizeof(*new_elements));
+	kfree(pool->elements);
+	pool->elements = new_elements;
+	pool->min_nr = new_min_nr;
+
+	while (pool->curr_nr < pool->min_nr) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		element = pool->alloc(gfp_mask, pool->pool_data);
+		if (!element)
+			goto out;
+		spin_lock_irqsave(&pool->lock, flags);
+		if (pool->curr_nr < pool->min_nr) {
+			add_element(pool, element);
+		} else {
+			spin_unlock_irqrestore(&pool->lock, flags);
+			pool->free(element, pool->pool_data);	/* Raced */
+			goto out;
+		}
+	}
+out_unlock:
+	spin_unlock_irqrestore(&pool->lock, flags);
+out:
+	return 0;
+}
+EXPORT_SYMBOL(mempool_resize);
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps. The caller
+ * has to guarantee that all elements have been returned to the pool (ie:
+ * freed) prior to calling mempool_destroy().
+ */
+void mempool_destroy(mempool_t *pool)
+{
+	if (pool->curr_nr != pool->min_nr)
+		BUG();		/* There were outstanding elements */
+	free_pool(pool);
+}
+EXPORT_SYMBOL(mempool_destroy);
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ * @gfp_mask:  the usual allocation bitmask.
+ *
+ * this function only sleeps if the alloc_fn function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process contexts. (it might
+ * fail if called from an IRQ context.)
+ */
+void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
+{
+	void *element;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+repeat_alloc:
+	element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data);
+	if (likely(element != NULL))
+		return element;
+
+	/*
+	 * If the pool is less than 50% full and we can perform effective
+	 * page reclaim then try harder to allocate an element.
+	 */
+	mb();
+	if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) &&
+				(pool->curr_nr <= pool->min_nr/2)) {
+		element = pool->alloc(gfp_mask, pool->pool_data);
+		if (likely(element != NULL))
+			return element;
+	}
+
+	/*
+	 * Kick the VM at this point.
+	 */
+	wakeup_bdflush(0);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (likely(pool->curr_nr)) {
+		element = remove_element(pool);
+		spin_unlock_irqrestore(&pool->lock, flags);
+		return element;
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/* We must not sleep in the GFP_ATOMIC case */
+	if (!(gfp_mask & __GFP_WAIT))
+		return NULL;
+
+	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+	mb();
+	if (!pool->curr_nr)
+		io_schedule();
+	finish_wait(&pool->wait, &wait);
+
+	goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element:   pool element pointer.
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+	unsigned long flags;
+
+	mb();
+	if (pool->curr_nr < pool->min_nr) {
+		spin_lock_irqsave(&pool->lock, flags);
+		if (pool->curr_nr < pool->min_nr) {
+			add_element(pool, element);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			wake_up(&pool->wait);
+			return;
+		}
+		spin_unlock_irqrestore(&pool->lock, flags);
+	}
+	pool->free(element, pool->pool_data);
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ */
+void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
+{
+	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	return kmem_cache_alloc(mem, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	kmem_cache_free(mem, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -0,0 +1,191 @@
+/*
+ *	linux/mm/mincore.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * The mincore() system call.
+ */
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = vma->vm_file->f_mapping;
+	struct page * page;
+
+	page = find_get_page(as, pgoff);
+	if (page) {
+		present = PageUptodate(page);
+		page_cache_release(page);
+	}
+
+	return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char __user * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec */
+	remaining = (end - start),
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char __user * vec)
+{
+	int index = 0;
+	unsigned long end, limit;
+	struct vm_area_struct * vma;
+	size_t max;
+	int unmapped_error = 0;
+	long error;
+
+	/* check the arguments */
+ 	if (start & ~PAGE_CACHE_MASK)
+		goto einval;
+
+	if (start < FIRST_USER_PGD_NR * PGDIR_SIZE)
+		goto enomem;
+
+	limit = TASK_SIZE;
+	if (start >= limit)
+		goto enomem;
+
+	if (!len)
+		return 0;
+
+	max = limit - start;
+	len = PAGE_CACHE_ALIGN(len);
+	if (len > max || !len)
+		goto enomem;
+
+	end = start + len;
+
+	/* check the output buffer whilst holding the lock */
+	error = -EFAULT;
+	down_read(&current->mm->mmap_sem);
+
+	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	error = 0;
+
+	vma = find_vma(current->mm, start);
+	while (vma) {
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+	/* we found a hole in the area queried if we arrive here */
+	error = -ENOMEM;
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+
+einval:
+	return -EINVAL;
+enomem:
+	return -ENOMEM;
+}
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -0,0 +1,253 @@
+/*
+ *	linux/mm/mlock.c
+ *
+ *  (C) Copyright 1995 Linus Torvalds
+ *  (C) Copyright 2002 Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+
+
+static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+	unsigned long start, unsigned long end, unsigned int newflags)
+{
+	struct mm_struct * mm = vma->vm_mm;
+	pgoff_t pgoff;
+	int pages;
+	int ret = 0;
+
+	if (newflags == vma->vm_flags) {
+		*prev = vma;
+		goto out;
+	}
+
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+			  vma->vm_file, pgoff, vma_policy(vma));
+	if (*prev) {
+		vma = *prev;
+		goto success;
+	}
+
+	*prev = vma;
+
+	if (start != vma->vm_start) {
+		ret = split_vma(mm, vma, start, 1);
+		if (ret)
+			goto out;
+	}
+
+	if (end != vma->vm_end) {
+		ret = split_vma(mm, vma, end, 0);
+		if (ret)
+			goto out;
+	}
+
+success:
+	/*
+	 * vm_flags is protected by the mmap_sem held in write mode.
+	 * It's okay if try_to_unmap_one unmaps a page just after we
+	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 */
+	vma->vm_flags = newflags;
+
+	/*
+	 * Keep track of amount of locked VM.
+	 */
+	pages = (end - start) >> PAGE_SHIFT;
+	if (newflags & VM_LOCKED) {
+		pages = -pages;
+		if (!(newflags & VM_IO))
+			ret = make_pages_present(start, end);
+	}
+
+	vma->vm_mm->locked_vm -= pages;
+out:
+	if (ret == -ENOMEM)
+		ret = -EAGAIN;
+	return ret;
+}
+
+static int do_mlock(unsigned long start, size_t len, int on)
+{
+	unsigned long nstart, end, tmp;
+	struct vm_area_struct * vma, * prev;
+	int error;
+
+	len = PAGE_ALIGN(len);
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+	if (end == start)
+		return 0;
+	vma = find_vma_prev(current->mm, start, &prev);
+	if (!vma || vma->vm_start > start)
+		return -ENOMEM;
+
+	if (start > vma->vm_start)
+		prev = vma;
+
+	for (nstart = start ; ; ) {
+		unsigned int newflags;
+
+		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
+
+		newflags = vma->vm_flags | VM_LOCKED;
+		if (!on)
+			newflags &= ~VM_LOCKED;
+
+		tmp = vma->vm_end;
+		if (tmp > end)
+			tmp = end;
+		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+		if (error)
+			break;
+		nstart = tmp;
+		if (nstart < prev->vm_end)
+			nstart = prev->vm_end;
+		if (nstart >= end)
+			break;
+
+		vma = prev->vm_next;
+		if (!vma || vma->vm_start != nstart) {
+			error = -ENOMEM;
+			break;
+		}
+	}
+	return error;
+}
+
+asmlinkage long sys_mlock(unsigned long start, size_t len)
+{
+	unsigned long locked;
+	unsigned long lock_limit;
+	int error = -ENOMEM;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	down_write(&current->mm->mmap_sem);
+	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+	start &= PAGE_MASK;
+
+	locked = len >> PAGE_SHIFT;
+	locked += current->mm->locked_vm;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	/* check against resource limits */
+	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
+		error = do_mlock(start, len, 1);
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
+
+asmlinkage long sys_munlock(unsigned long start, size_t len)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+	start &= PAGE_MASK;
+	ret = do_mlock(start, len, 0);
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
+
+static int do_mlockall(int flags)
+{
+	struct vm_area_struct * vma, * prev = NULL;
+	unsigned int def_flags = 0;
+
+	if (flags & MCL_FUTURE)
+		def_flags = VM_LOCKED;
+	current->mm->def_flags = def_flags;
+	if (flags == MCL_FUTURE)
+		goto out;
+
+	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+		unsigned int newflags;
+
+		newflags = vma->vm_flags | VM_LOCKED;
+		if (!(flags & MCL_CURRENT))
+			newflags &= ~VM_LOCKED;
+
+		/* Ignore errors */
+		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+	}
+out:
+	return 0;
+}
+
+asmlinkage long sys_mlockall(int flags)
+{
+	unsigned long lock_limit;
+	int ret = -EINVAL;
+
+	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
+		goto out;
+
+	ret = -EPERM;
+	if (!can_do_mlock())
+		goto out;
+
+	down_write(&current->mm->mmap_sem);
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	ret = -ENOMEM;
+	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
+	    capable(CAP_IPC_LOCK))
+		ret = do_mlockall(flags);
+	up_write(&current->mm->mmap_sem);
+out:
+	return ret;
+}
+
+asmlinkage long sys_munlockall(void)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+	ret = do_mlockall(0);
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
+ * shm segments) get accounted against the user_struct instead.
+ */
+static DEFINE_SPINLOCK(shmlock_user_lock);
+
+int user_shm_lock(size_t size, struct user_struct *user)
+{
+	unsigned long lock_limit, locked;
+	int allowed = 0;
+
+	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+	spin_lock(&shmlock_user_lock);
+	if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+		goto out;
+	get_uid(user);
+	user->locked_shm += locked;
+	allowed = 1;
+out:
+	spin_unlock(&shmlock_user_lock);
+	return allowed;
+}
+
+void user_shm_unlock(size_t size, struct user_struct *user)
+{
+	spin_lock(&shmlock_user_lock);
+	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	spin_unlock(&shmlock_user_lock);
+	free_uid(user);
+}
--- a/mm/mmap.c
+++ b/mm/mmap.c
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -0,0 +1,282 @@
+/*
+ *  mm/mprotect.c
+ *
+ *  (C) Copyright 1994 Linus Torvalds
+ *  (C) Copyright 2002 Christoph Hellwig
+ *
+ *  Address space accounting code	<alan@redhat.com>
+ *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/mempolicy.h>
+#include <linux/personality.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	pte_t *pte;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		if (pte_present(*pte)) {
+			pte_t ptent;
+
+			/* Avoid an SMP race with hardware updated dirty/clean
+			 * bits by wiping the pte and then setting the new pte
+			 * into place.
+			 */
+			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
+			set_pte_at(mm, addr, pte, ptent);
+			lazy_mmu_prot_update(ptent);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(pte - 1);
+}
+
+static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		change_pte_range(mm, pmd, addr, next, newprot);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		change_pmd_range(mm, pud, addr, next, newprot);
+	} while (pud++, addr = next, addr != end);
+}
+
+static void change_protection(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long start = addr;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		change_pud_range(mm, pgd, addr, next, newprot);
+	} while (pgd++, addr = next, addr != end);
+	flush_tlb_range(vma, start, end);
+	spin_unlock(&mm->page_table_lock);
+}
+
+static int
+mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+	unsigned long start, unsigned long end, unsigned long newflags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long oldflags = vma->vm_flags;
+	long nrpages = (end - start) >> PAGE_SHIFT;
+	unsigned long charged = 0;
+	pgprot_t newprot;
+	pgoff_t pgoff;
+	int error;
+
+	if (newflags == oldflags) {
+		*pprev = vma;
+		return 0;
+	}
+
+	/*
+	 * If we make a private mapping writable we increase our commit;
+	 * but (without finer accounting) cannot reduce our commit if we
+	 * make it unwritable again.
+	 *
+	 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
+	 * a MAP_NORESERVE private mapping to writable will now reserve.
+	 */
+	if (newflags & VM_WRITE) {
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+			charged = nrpages;
+			if (security_vm_enough_memory(charged))
+				return -ENOMEM;
+			newflags |= VM_ACCOUNT;
+		}
+	}
+
+	newprot = protection_map[newflags & 0xf];
+
+	/*
+	 * First try to merge with previous and/or next vma.
+	 */
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*pprev = vma_merge(mm, *pprev, start, end, newflags,
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+	if (*pprev) {
+		vma = *pprev;
+		goto success;
+	}
+
+	*pprev = vma;
+
+	if (start != vma->vm_start) {
+		error = split_vma(mm, vma, start, 1);
+		if (error)
+			goto fail;
+	}
+
+	if (end != vma->vm_end) {
+		error = split_vma(mm, vma, end, 0);
+		if (error)
+			goto fail;
+	}
+
+success:
+	/*
+	 * vm_flags and vm_page_prot are protected by the mmap_sem
+	 * held in write mode.
+	 */
+	vma->vm_flags = newflags;
+	vma->vm_page_prot = newprot;
+	change_protection(vma, start, end, newprot);
+	__vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+	__vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+	return 0;
+
+fail:
+	vm_unacct_memory(charged);
+	return error;
+}
+
+asmlinkage long
+sys_mprotect(unsigned long start, size_t len, unsigned long prot)
+{
+	unsigned long vm_flags, nstart, end, tmp, reqprot;
+	struct vm_area_struct *vma, *prev;
+	int error = -EINVAL;
+	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
+	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
+		return -EINVAL;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	if (!len)
+		return 0;
+	len = PAGE_ALIGN(len);
+	end = start + len;
+	if (end <= start)
+		return -ENOMEM;
+	if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
+		return -EINVAL;
+
+	reqprot = prot;
+	/*
+	 * Does the application expect PROT_READ to imply PROT_EXEC:
+	 */
+	if (unlikely((prot & PROT_READ) &&
+			(current->personality & READ_IMPLIES_EXEC)))
+		prot |= PROT_EXEC;
+
+	vm_flags = calc_vm_prot_bits(prot);
+
+	down_write(&current->mm->mmap_sem);
+
+	vma = find_vma_prev(current->mm, start, &prev);
+	error = -ENOMEM;
+	if (!vma)
+		goto out;
+	if (unlikely(grows & PROT_GROWSDOWN)) {
+		if (vma->vm_start >= end)
+			goto out;
+		start = vma->vm_start;
+		error = -EINVAL;
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto out;
+	}
+	else {
+		if (vma->vm_start > start)
+			goto out;
+		if (unlikely(grows & PROT_GROWSUP)) {
+			end = vma->vm_end;
+			error = -EINVAL;
+			if (!(vma->vm_flags & VM_GROWSUP))
+				goto out;
+		}
+	}
+	if (start > vma->vm_start)
+		prev = vma;
+
+	for (nstart = start ; ; ) {
+		unsigned long newflags;
+
+		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
+
+		if (is_vm_hugetlb_page(vma)) {
+			error = -EACCES;
+			goto out;
+		}
+
+		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+
+		if ((newflags & ~(newflags >> 4)) & 0xf) {
+			error = -EACCES;
+			goto out;
+		}
+
+		error = security_file_mprotect(vma, reqprot, prot);
+		if (error)
+			goto out;
+
+		tmp = vma->vm_end;
+		if (tmp > end)
+			tmp = end;
+		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+		if (error)
+			goto out;
+		nstart = tmp;
+
+		if (nstart < prev->vm_end)
+			nstart = prev->vm_end;
+		if (nstart >= end)
+			goto out;
+
+		vma = prev->vm_next;
+		if (!vma || vma->vm_start != nstart) {
+			error = -ENOMEM;
+			goto out;
+		}
+	}
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -0,0 +1,426 @@
+/*
+ *	mm/mremap.c
+ *
+ *	(C) Copyright 1996 Linus Torvalds
+ *
+ *	Address space accounting code	<alan@redhat.com>
+ *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none_or_clear_bad(pgd))
+		goto end;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none_or_clear_bad(pud))
+		goto end;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none_or_clear_bad(pmd))
+		goto end;
+
+	pte = pte_offset_map_nested(pmd, addr);
+	if (pte_none(*pte)) {
+		pte_unmap_nested(pte);
+		pte = NULL;
+	}
+end:
+	return pte;
+}
+
+static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none_or_clear_bad(pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none_or_clear_bad(pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none_or_clear_bad(pmd))
+		return NULL;
+
+	return pte_offset_map(pmd, addr);
+}
+
+static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return NULL;
+	pmd = pmd_alloc(mm, pud, addr);
+	if (pmd)
+		pte = pte_alloc_map(mm, pmd, addr);
+	return pte;
+}
+
+static int
+move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
+		struct vm_area_struct *new_vma, unsigned long new_addr)
+{
+	struct address_space *mapping = NULL;
+	struct mm_struct *mm = vma->vm_mm;
+	int error = 0;
+	pte_t *src, *dst;
+
+	if (vma->vm_file) {
+		/*
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, we must lock vmtruncate out,
+		 * since it might clean the dst vma before the src vma,
+		 * and we propagate stale pages into the dst afterward.
+		 */
+		mapping = vma->vm_file->f_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		if (new_vma->vm_truncate_count &&
+		    new_vma->vm_truncate_count != vma->vm_truncate_count)
+			new_vma->vm_truncate_count = 0;
+	}
+	spin_lock(&mm->page_table_lock);
+
+	src = get_one_pte_map_nested(mm, old_addr);
+	if (src) {
+		/*
+		 * Look to see whether alloc_one_pte_map needs to perform a
+		 * memory allocation.  If it does then we need to drop the
+		 * atomic kmap
+		 */
+		dst = get_one_pte_map(mm, new_addr);
+		if (unlikely(!dst)) {
+			pte_unmap_nested(src);
+			if (mapping)
+				spin_unlock(&mapping->i_mmap_lock);
+			dst = alloc_one_pte_map(mm, new_addr);
+			if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
+				spin_unlock(&mm->page_table_lock);
+				spin_lock(&mapping->i_mmap_lock);
+				spin_lock(&mm->page_table_lock);
+			}
+			src = get_one_pte_map_nested(mm, old_addr);
+		}
+		/*
+		 * Since alloc_one_pte_map can drop and re-acquire
+		 * page_table_lock, we should re-check the src entry...
+		 */
+		if (src) {
+			if (dst) {
+				pte_t pte;
+				pte = ptep_clear_flush(vma, old_addr, src);
+				set_pte_at(mm, new_addr, dst, pte);
+			} else
+				error = -ENOMEM;
+			pte_unmap_nested(src);
+		}
+		if (dst)
+			pte_unmap(dst);
+	}
+	spin_unlock(&mm->page_table_lock);
+	if (mapping)
+		spin_unlock(&mapping->i_mmap_lock);
+	return error;
+}
+
+static unsigned long move_page_tables(struct vm_area_struct *vma,
+		unsigned long old_addr, struct vm_area_struct *new_vma,
+		unsigned long new_addr, unsigned long len)
+{
+	unsigned long offset;
+
+	flush_cache_range(vma, old_addr, old_addr + len);
+
+	/*
+	 * This is not the clever way to do this, but we're taking the
+	 * easy way out on the assumption that most remappings will be
+	 * only a few pages.. This also makes error recovery easier.
+	 */
+	for (offset = 0; offset < len; offset += PAGE_SIZE) {
+		if (move_one_page(vma, old_addr + offset,
+				new_vma, new_addr + offset) < 0)
+			break;
+		cond_resched();
+	}
+	return offset;
+}
+
+static unsigned long move_vma(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long old_len,
+		unsigned long new_len, unsigned long new_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma;
+	unsigned long vm_flags = vma->vm_flags;
+	unsigned long new_pgoff;
+	unsigned long moved_len;
+	unsigned long excess = 0;
+	int split = 0;
+
+	/*
+	 * We'd prefer to avoid failure later on in do_munmap:
+	 * which may split one vma into three before unmapping.
+	 */
+	if (mm->map_count >= sysctl_max_map_count - 3)
+		return -ENOMEM;
+
+	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+	if (!new_vma)
+		return -ENOMEM;
+
+	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+	if (moved_len < old_len) {
+		/*
+		 * On error, move entries back from new area to old,
+		 * which will succeed since page tables still there,
+		 * and then proceed to unmap new area instead of old.
+		 */
+		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+		vma = new_vma;
+		old_len = new_len;
+		old_addr = new_addr;
+		new_addr = -ENOMEM;
+	}
+
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		excess = vma->vm_end - vma->vm_start - old_len;
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
+	}
+
+	if (do_munmap(mm, old_addr, old_len) < 0) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_unacct_memory(excess >> PAGE_SHIFT);
+		excess = 0;
+	}
+
+	/* Restore VM_ACCOUNT if one or two pieces of vma left */
+	if (excess) {
+		vma->vm_flags |= VM_ACCOUNT;
+		if (split)
+			vma->vm_next->vm_flags |= VM_ACCOUNT;
+	}
+
+	mm->total_vm += new_len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		if (new_len > old_len)
+			make_pages_present(new_addr + old_len,
+					   new_addr + new_len);
+	}
+
+	return new_addr;
+}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ */
+unsigned long do_mremap(unsigned long addr,
+	unsigned long old_len, unsigned long new_len,
+	unsigned long flags, unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long ret = -EINVAL;
+	unsigned long charged = 0;
+
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+		goto out;
+
+	if (addr & ~PAGE_MASK)
+		goto out;
+
+	old_len = PAGE_ALIGN(old_len);
+	new_len = PAGE_ALIGN(new_len);
+
+	/*
+	 * We allow a zero old-len as a special case
+	 * for DOS-emu "duplicate shm area" thing. But
+	 * a zero new-len is nonsensical.
+	 */
+	if (!new_len)
+		goto out;
+
+	/* new_addr is only valid if MREMAP_FIXED is specified */
+	if (flags & MREMAP_FIXED) {
+		if (new_addr & ~PAGE_MASK)
+			goto out;
+		if (!(flags & MREMAP_MAYMOVE))
+			goto out;
+
+		if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+			goto out;
+
+		/* Check if the location we're moving into overlaps the
+		 * old location at all, and fail if it does.
+		 */
+		if ((new_addr <= addr) && (new_addr+new_len) > addr)
+			goto out;
+
+		if ((addr <= new_addr) && (addr+old_len) > new_addr)
+			goto out;
+
+		ret = do_munmap(current->mm, new_addr, new_len);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * Always allow a shrinking remap: that just unmaps
+	 * the unnecessary pages..
+	 * do_munmap does all the needed commit accounting
+	 */
+	if (old_len >= new_len) {
+		ret = do_munmap(current->mm, addr+new_len, old_len - new_len);
+		if (ret && old_len != new_len)
+			goto out;
+		ret = addr;
+		if (!(flags & MREMAP_FIXED) || (new_addr == addr))
+			goto out;
+		old_len = new_len;
+	}
+
+	/*
+	 * Ok, we need to grow..  or relocate.
+	 */
+	ret = -EFAULT;
+	vma = find_vma(current->mm, addr);
+	if (!vma || vma->vm_start > addr)
+		goto out;
+	if (is_vm_hugetlb_page(vma)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* We can't remap across vm area boundaries */
+	if (old_len > vma->vm_end - addr)
+		goto out;
+	if (vma->vm_flags & VM_DONTEXPAND) {
+		if (new_len > old_len)
+			goto out;
+	}
+	if (vma->vm_flags & VM_LOCKED) {
+		unsigned long locked, lock_limit;
+		locked = current->mm->locked_vm << PAGE_SHIFT;
+		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += new_len - old_len;
+		ret = -EAGAIN;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			goto out;
+	}
+	ret = -ENOMEM;
+	if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len)
+	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
+		goto out;
+
+	if (vma->vm_flags & VM_ACCOUNT) {
+		charged = (new_len - old_len) >> PAGE_SHIFT;
+		if (security_vm_enough_memory(charged))
+			goto out_nc;
+	}
+
+	/* old_len exactly to the end of the area..
+	 * And we're not relocating the area.
+	 */
+	if (old_len == vma->vm_end - addr &&
+	    !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
+	    (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
+		unsigned long max_addr = TASK_SIZE;
+		if (vma->vm_next)
+			max_addr = vma->vm_next->vm_start;
+		/* can we just expand the current mapping? */
+		if (max_addr - addr >= new_len) {
+			int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+			vma_adjust(vma, vma->vm_start,
+				addr + new_len, vma->vm_pgoff, NULL);
+
+			current->mm->total_vm += pages;
+			__vm_stat_account(vma->vm_mm, vma->vm_flags,
+							vma->vm_file, pages);
+			if (vma->vm_flags & VM_LOCKED) {
+				current->mm->locked_vm += pages;
+				make_pages_present(addr + old_len,
+						   addr + new_len);
+			}
+			ret = addr;
+			goto out;
+		}
+	}
+
+	/*
+	 * We weren't able to just expand or shrink the area,
+	 * we need to create a new one and move it..
+	 */
+	ret = -ENOMEM;
+	if (flags & MREMAP_MAYMOVE) {
+		if (!(flags & MREMAP_FIXED)) {
+			unsigned long map_flags = 0;
+			if (vma->vm_flags & VM_MAYSHARE)
+				map_flags |= MAP_SHARED;
+
+			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+						vma->vm_pgoff, map_flags);
+			ret = new_addr;
+			if (new_addr & ~PAGE_MASK)
+				goto out;
+		}
+		ret = move_vma(vma, addr, old_len, new_len, new_addr);
+	}
+out:
+	if (ret & ~PAGE_MASK)
+		vm_unacct_memory(charged);
+out_nc:
+	return ret;
+}
+
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+	unsigned long old_len, unsigned long new_len,
+	unsigned long flags, unsigned long new_addr)
+{
+	unsigned long ret;
+
+	down_write(&current->mm->mmap_sem);
+	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -0,0 +1,236 @@
+/*
+ *	linux/mm/msync.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * The msync() system call.
+ */
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
+
+static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end)
+{
+	pte_t *pte;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		unsigned long pfn;
+		struct page *page;
+
+		if (!pte_present(*pte))
+			continue;
+		pfn = pte_pfn(*pte);
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		if (PageReserved(page))
+			continue;
+
+		if (ptep_clear_flush_dirty(vma, addr, pte) ||
+		    page_test_and_clear_dirty(page))
+			set_page_dirty(page);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(pte - 1);
+}
+
+static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		sync_pte_range(vma, pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		sync_pmd_range(vma, pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+static void sync_page_range(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	unsigned long next;
+
+	/* For hugepages we can't go walking the page table normally,
+	 * but that's ok, hugetlbfs is memory based, so we don't need
+	 * to do anything more on an msync() */
+	if (is_vm_hugetlb_page(vma))
+		return;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	spin_lock(&mm->page_table_lock);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		sync_pud_range(vma, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&mm->page_table_lock);
+}
+
+#ifdef CONFIG_PREEMPT
+static inline void filemap_sync(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	const size_t chunk = 64 * 1024;	/* bytes */
+	unsigned long next;
+
+	do {
+		next = addr + chunk;
+		if (next > end || next < addr)
+			next = end;
+		sync_page_range(vma, addr, next);
+		cond_resched();
+	} while (addr = next, addr != end);
+}
+#else
+static inline void filemap_sync(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end)
+{
+	sync_page_range(vma, addr, end);
+}
+#endif
+
+/*
+ * MS_SYNC syncs the entire file - including mappings.
+ *
+ * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
+ * marks the relevant pages dirty.  The application may now run fsync() to
+ * write out the dirty pages and wait on the writeout and check the result.
+ * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
+ * async writeout immediately.
+ * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * applications.
+ */
+static int msync_interval(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end, int flags)
+{
+	int ret = 0;
+	struct file *file = vma->vm_file;
+
+	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
+		return -EBUSY;
+
+	if (file && (vma->vm_flags & VM_SHARED)) {
+		filemap_sync(vma, addr, end);
+
+		if (flags & MS_SYNC) {
+			struct address_space *mapping = file->f_mapping;
+			int err;
+
+			ret = filemap_fdatawrite(mapping);
+			if (file->f_op && file->f_op->fsync) {
+				/*
+				 * We don't take i_sem here because mmap_sem
+				 * is already held.
+				 */
+				err = file->f_op->fsync(file,file->f_dentry,1);
+				if (err && !ret)
+					ret = err;
+			}
+			err = filemap_fdatawait(mapping);
+			if (!ret)
+				ret = err;
+		}
+	}
+	return ret;
+}
+
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+{
+	unsigned long end;
+	struct vm_area_struct *vma;
+	int unmapped_error, error = -EINVAL;
+
+	if (flags & MS_SYNC)
+		current->flags |= PF_SYNCWRITE;
+
+	down_read(&current->mm->mmap_sem);
+	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+		goto out;
+	if (start & ~PAGE_MASK)
+		goto out;
+	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
+		goto out;
+	error = -ENOMEM;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+	error = 0;
+	if (end == start)
+		goto out;
+	/*
+	 * If the interval [start,end) covers some unmapped address ranges,
+	 * just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	unmapped_error = 0;
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = msync_interval(vma, start, end, flags);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = msync_interval(vma, start, vma->vm_end, flags);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+out:
+	up_read(&current->mm->mmap_sem);
+	current->flags &= ~PF_SYNCWRITE;
+	return error;
+}
--- a/mm/nommu.c
+++ b/mm/nommu.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -0,0 +1,292 @@
+/*
+ *  linux/mm/oom_kill.c
+ * 
+ *  Copyright (C)  1998,2000  Rik van Riel
+ *	Thanks go out to Claus Fischer for some serious inspiration and
+ *	for goading me into coding this file...
+ *
+ *  The routines in this file are used to kill a process when
+ *  we're seriously out of memory. This gets called from kswapd()
+ *  in linux/mm/vmscan.c when we really run out of memory.
+ *
+ *  Since we won't call these routines often (on a well-configured
+ *  machine) this file will double as a 'coding guide' and a signpost
+ *  for newbie kernel hackers. It features several pointers to major
+ *  kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+
+/* #define DEBUG */
+
+/**
+ * oom_badness - calculate a numeric value for how bad this task has been
+ * @p: task struct of which task we should calculate
+ * @p: current uptime in seconds
+ *
+ * The formula used is relatively simple and documented inline in the
+ * function. The main rationale is that we want to select a good task
+ * to kill when we run out of memory.
+ *
+ * Good in this context means that:
+ * 1) we lose the minimum amount of work done
+ * 2) we recover a large amount of memory
+ * 3) we don't kill anything innocent of eating tons of memory
+ * 4) we want to kill the minimum amount of processes (one)
+ * 5) we try to kill the process the user expects us to kill, this
+ *    algorithm has been meticulously tuned to meet the principle
+ *    of least surprise ... (be careful when you change it)
+ */
+
+unsigned long badness(struct task_struct *p, unsigned long uptime)
+{
+	unsigned long points, cpu_time, run_time, s;
+	struct list_head *tsk;
+
+	if (!p->mm)
+		return 0;
+
+	/*
+	 * The memory size of the process is the basis for the badness.
+	 */
+	points = p->mm->total_vm;
+
+	/*
+	 * Processes which fork a lot of child processes are likely
+	 * a good choice. We add the vmsize of the childs if they
+	 * have an own mm. This prevents forking servers to flood the
+	 * machine with an endless amount of childs
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
+	 * CPU time is in tens of seconds and run time is in thousands
+         * of seconds. There is no particular reason for this other than
+         * that it turned out to work very well in practice.
+	 */
+	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
+		>> (SHIFT_HZ + 3);
+
+	if (uptime >= p->start_time.tv_sec)
+		run_time = (uptime - p->start_time.tv_sec) >> 10;
+	else
+		run_time = 0;
+
+	s = int_sqrt(cpu_time);
+	if (s)
+		points /= s;
+	s = int_sqrt(int_sqrt(run_time));
+	if (s)
+		points /= s;
+
+	/*
+	 * Niced processes are most likely less important, so double
+	 * their badness points.
+	 */
+	if (task_nice(p) > 0)
+		points *= 2;
+
+	/*
+	 * Superuser processes are usually more important, so we make it
+	 * less likely that we kill those.
+	 */
+	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
+				p->uid == 0 || p->euid == 0)
+		points /= 4;
+
+	/*
+	 * We don't want to kill a process with direct hardware access.
+	 * Not only could that mess up the hardware, but usually users
+	 * tend to only have this flag set on applications they think
+	 * of as important.
+	 */
+	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+		points /= 4;
+
+	/*
+	 * Adjust the score by oomkilladj.
+	 */
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0)
+			points <<= p->oomkilladj;
+		else
+			points >>= -(p->oomkilladj);
+	}
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
+	p->pid, p->comm, points);
+#endif
+	return points;
+}
+
+/*
+ * Simple selection loop. We chose the process with the highest
+ * number of 'points'. We expect the caller will lock the tasklist.
+ *
+ * (not docbooked, we don't want this one cluttering up the manual)
+ */
+static struct task_struct * select_bad_process(void)
+{
+	unsigned long maxpoints = 0;
+	struct task_struct *g, *p;
+	struct task_struct *chosen = NULL;
+	struct timespec uptime;
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	do_each_thread(g, p)
+		/* skip the init task with pid == 1 */
+		if (p->pid > 1) {
+			unsigned long points;
+
+			/*
+			 * This is in the process of releasing memory so wait it
+			 * to finish before killing some other task by mistake.
+			 */
+			if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
+			    !(p->flags & PF_DEAD))
+				return ERR_PTR(-1UL);
+			if (p->flags & PF_SWAPOFF)
+				return p;
+
+			points = badness(p, uptime.tv_sec);
+			if (points > maxpoints || !chosen) {
+				chosen = p;
+				maxpoints = points;
+			}
+		}
+	while_each_thread(g, p);
+	return chosen;
+}
+
+/**
+ * We must be careful though to never send SIGKILL a process with
+ * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
+ * we select a process with CAP_SYS_RAW_IO set).
+ */
+static void __oom_kill_task(task_t *p)
+{
+	if (p->pid == 1) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill init!\n");
+		return;
+	}
+
+	task_lock(p);
+	if (!p->mm || p->mm == &init_mm) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill an mm-less task!\n");
+		task_unlock(p);
+		return;
+	}
+	task_unlock(p);
+	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
+
+	/*
+	 * We give our sacrificial lamb high priority and access to
+	 * all the memory it needs. That way it should be able to
+	 * exit() and clear out its resources quickly...
+	 */
+	p->time_slice = HZ;
+	set_tsk_thread_flag(p, TIF_MEMDIE);
+
+	force_sig(SIGKILL, p);
+}
+
+static struct mm_struct *oom_kill_task(task_t *p)
+{
+	struct mm_struct *mm = get_task_mm(p);
+	task_t * g, * q;
+
+	if (!mm)
+		return NULL;
+	if (mm == &init_mm) {
+		mmput(mm);
+		return NULL;
+	}
+
+	__oom_kill_task(p);
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+	while_each_thread(g, q);
+
+	return mm;
+}
+
+static struct mm_struct *oom_kill_process(struct task_struct *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}
+
+/**
+ * oom_kill - kill the "best" process when we run out of memory
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ */
+void out_of_memory(unsigned int __nocast gfp_mask)
+{
+	struct mm_struct *mm = NULL;
+	task_t * p;
+
+	read_lock(&tasklist_lock);
+retry:
+	p = select_bad_process();
+
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		show_free_areas();
+		panic("Out of memory and no killable processes...\n");
+	}
+
+	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+	show_free_areas();
+	mm = oom_kill_process(p);
+	if (!mm)
+		goto retry;
+
+ out:
+	read_unlock(&tasklist_lock);
+	if (mm)
+		mmput(mm);
+
+	/*
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
+	 */
+	__set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(1);
+}
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -0,0 +1,819 @@
+/*
+ * mm/page-writeback.c.
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002	akpm@zip.com.au
+ *		Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/init.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/mpage.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+
+/*
+ * The maximum number of pages to writeout in a single bdflush/kupdate
+ * operation.  We do this so we don't hold I_LOCK against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES	1024
+
+/*
+ * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
+ * will look to see if it needs to force writeback or throttling.
+ */
+static long ratelimit_pages = 32;
+
+static long total_pages;	/* The total number of pages in the machine. */
+static int dirty_exceeded;	/* Dirty mem may be over limit */
+
+/*
+ * When balance_dirty_pages decides that the caller needs to perform some
+ * non-background writeback, this is how many pages it will attempt to write.
+ * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * large amounts of I/O are submitted.
+ */
+static inline long sync_writeback_pages(void)
+{
+	return ratelimit_pages + ratelimit_pages / 2;
+}
+
+/* The following parameters are exported via /proc/sys/vm */
+
+/*
+ * Start background writeback (via pdflush) at this percentage
+ */
+int dirty_background_ratio = 10;
+
+/*
+ * The generator of dirty data starts writeback at this percentage
+ */
+int vm_dirty_ratio = 40;
+
+/*
+ * The interval between `kupdate'-style writebacks, in centiseconds
+ * (hundredths of a second)
+ */
+int dirty_writeback_centisecs = 5 * 100;
+
+/*
+ * The longest number of centiseconds for which data is allowed to remain dirty
+ */
+int dirty_expire_centisecs = 30 * 100;
+
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode".
+ */
+int laptop_mode;
+
+EXPORT_SYMBOL(laptop_mode);
+
+/* End of sysctl-exported parameters */
+
+
+static void background_writeout(unsigned long _min_pages);
+
+struct writeback_state
+{
+	unsigned long nr_dirty;
+	unsigned long nr_unstable;
+	unsigned long nr_mapped;
+	unsigned long nr_writeback;
+};
+
+static void get_writeback_state(struct writeback_state *wbs)
+{
+	wbs->nr_dirty = read_page_state(nr_dirty);
+	wbs->nr_unstable = read_page_state(nr_unstable);
+	wbs->nr_mapped = read_page_state(nr_mapped);
+	wbs->nr_writeback = read_page_state(nr_writeback);
+}
+
+/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around.  To avoid stressing page reclaim with lots of unreclaimable
+ * pages.  It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+static void
+get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
+		struct address_space *mapping)
+{
+	int background_ratio;		/* Percentages */
+	int dirty_ratio;
+	int unmapped_ratio;
+	long background;
+	long dirty;
+	unsigned long available_memory = total_pages;
+	struct task_struct *tsk;
+
+	get_writeback_state(wbs);
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If this mapping can only allocate from low memory,
+	 * we exclude high memory from our count.
+	 */
+	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
+		available_memory -= totalhigh_pages;
+#endif
+
+
+	unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
+
+	dirty_ratio = vm_dirty_ratio;
+	if (dirty_ratio > unmapped_ratio / 2)
+		dirty_ratio = unmapped_ratio / 2;
+
+	if (dirty_ratio < 5)
+		dirty_ratio = 5;
+
+	background_ratio = dirty_background_ratio;
+	if (background_ratio >= dirty_ratio)
+		background_ratio = dirty_ratio / 2;
+
+	background = (background_ratio * available_memory) / 100;
+	dirty = (dirty_ratio * available_memory) / 100;
+	tsk = current;
+	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+		background += background / 4;
+		dirty += dirty / 4;
+	}
+	*pbackground = background;
+	*pdirty = dirty;
+}
+
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data.  It looks at the number of dirty pages in the machine and will force
+ * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * If we're over `background_thresh' then pdflush is woken to perform some
+ * writeout.
+ */
+static void balance_dirty_pages(struct address_space *mapping)
+{
+	struct writeback_state wbs;
+	long nr_reclaimable;
+	long background_thresh;
+	long dirty_thresh;
+	unsigned long pages_written = 0;
+	unsigned long write_chunk = sync_writeback_pages();
+
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+	for (;;) {
+		struct writeback_control wbc = {
+			.bdi		= bdi,
+			.sync_mode	= WB_SYNC_NONE,
+			.older_than_this = NULL,
+			.nr_to_write	= write_chunk,
+		};
+
+		get_dirty_limits(&wbs, &background_thresh,
+					&dirty_thresh, mapping);
+		nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
+		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+			break;
+
+		dirty_exceeded = 1;
+
+		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+		 * Unstable writes are a feature of certain networked
+		 * filesystems (i.e. NFS) in which data may have been
+		 * written to the server's write cache, but has not yet
+		 * been flushed to permanent storage.
+		 */
+		if (nr_reclaimable) {
+			writeback_inodes(&wbc);
+			get_dirty_limits(&wbs, &background_thresh,
+					&dirty_thresh, mapping);
+			nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
+			if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+				break;
+			pages_written += write_chunk - wbc.nr_to_write;
+			if (pages_written >= write_chunk)
+				break;		/* We've done our duty */
+		}
+		blk_congestion_wait(WRITE, HZ/10);
+	}
+
+	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+		dirty_exceeded = 0;
+
+	if (writeback_in_progress(bdi))
+		return;		/* pdflush is already working this queue */
+
+	/*
+	 * In laptop mode, we wait until hitting the higher threshold before
+	 * starting background writeout, and then write out all the way down
+	 * to the lower threshold.  So slow writers cause minimal disk activity.
+	 *
+	 * In normal mode, we start background writeout at the lower
+	 * background_thresh, to keep the amount of dirty memory low.
+	 */
+	if ((laptop_mode && pages_written) ||
+	     (!laptop_mode && (nr_reclaimable > background_thresh)))
+		pdflush_operation(background_writeout, 0);
+}
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state
+ * @mapping - address_space which was dirtied
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied.  The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * On really big machines, get_writeback_state is expensive, so try to avoid
+ * calling it too often (ratelimiting).  But once we're over the dirty memory
+ * limit we decrease the ratelimiting by a lot, to prevent individual processes
+ * from overshooting the limit by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	static DEFINE_PER_CPU(int, ratelimits) = 0;
+	long ratelimit;
+
+	ratelimit = ratelimit_pages;
+	if (dirty_exceeded)
+		ratelimit = 8;
+
+	/*
+	 * Check the rate limiting. Also, we do not want to throttle real-time
+	 * tasks in balance_dirty_pages(). Period.
+	 */
+	if (get_cpu_var(ratelimits)++ >= ratelimit) {
+		__get_cpu_var(ratelimits) = 0;
+		put_cpu_var(ratelimits);
+		balance_dirty_pages(mapping);
+		return;
+	}
+	put_cpu_var(ratelimits);
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+
+void throttle_vm_writeout(void)
+{
+	struct writeback_state wbs;
+	long background_thresh;
+	long dirty_thresh;
+
+        for ( ; ; ) {
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
+
+                /*
+                 * Boost the allowable dirty threshold a bit for page
+                 * allocators so they don't get DoS'ed by heavy writers
+                 */
+                dirty_thresh += dirty_thresh / 10;      /* wheeee... */
+
+                if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
+                        break;
+                blk_congestion_wait(WRITE, HZ/10);
+        }
+}
+
+
+/*
+ * writeback at least _min_pages, and keep writing until the amount of dirty
+ * memory is less than the background threshold, or until we're all clean.
+ */
+static void background_writeout(unsigned long _min_pages)
+{
+	long min_pages = _min_pages;
+	struct writeback_control wbc = {
+		.bdi		= NULL,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 0,
+		.nonblocking	= 1,
+	};
+
+	for ( ; ; ) {
+		struct writeback_state wbs;
+		long background_thresh;
+		long dirty_thresh;
+
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
+		if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
+				&& min_pages <= 0)
+			break;
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes(&wbc);
+		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+			/* Wrote less than expected */
+			blk_congestion_wait(WRITE, HZ/10);
+			if (!wbc.encountered_congestion)
+				break;
+		}
+	}
+}
+
+/*
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
+ * -1 if all pdflush threads were busy.
+ */
+int wakeup_bdflush(long nr_pages)
+{
+	if (nr_pages == 0) {
+		struct writeback_state wbs;
+
+		get_writeback_state(&wbs);
+		nr_pages = wbs.nr_dirty + wbs.nr_unstable;
+	}
+	return pdflush_operation(background_writeout, nr_pages);
+}
+
+static void wb_timer_fn(unsigned long unused);
+static void laptop_timer_fn(unsigned long unused);
+
+static struct timer_list wb_timer =
+			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
+static struct timer_list laptop_mode_wb_timer =
+			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
+
+/*
+ * Periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space.  So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_centisecs.  But if a writeback event
+ * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write.  So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static void wb_kupdate(unsigned long arg)
+{
+	unsigned long oldest_jif;
+	unsigned long start_jif;
+	unsigned long next_jif;
+	long nr_to_write;
+	struct writeback_state wbs;
+	struct writeback_control wbc = {
+		.bdi		= NULL,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = &oldest_jif,
+		.nr_to_write	= 0,
+		.nonblocking	= 1,
+		.for_kupdate	= 1,
+	};
+
+	sync_supers();
+
+	get_writeback_state(&wbs);
+	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+	start_jif = jiffies;
+	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+	nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	while (nr_to_write > 0) {
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		writeback_inodes(&wbc);
+		if (wbc.nr_to_write > 0) {
+			if (wbc.encountered_congestion)
+				blk_congestion_wait(WRITE, HZ/10);
+			else
+				break;	/* All the old data is written */
+		}
+		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+	}
+	if (time_before(next_jif, jiffies + HZ))
+		next_jif = jiffies + HZ;
+	if (dirty_writeback_centisecs)
+		mod_timer(&wb_timer, next_jif);
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
+ */
+int dirty_writeback_centisecs_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (dirty_writeback_centisecs) {
+		mod_timer(&wb_timer,
+			jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	} else {
+		del_timer(&wb_timer);
+	}
+	return 0;
+}
+
+static void wb_timer_fn(unsigned long unused)
+{
+	if (pdflush_operation(wb_kupdate, 0) < 0)
+		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+}
+
+static void laptop_flush(unsigned long unused)
+{
+	sys_sync();
+}
+
+static void laptop_timer_fn(unsigned long unused)
+{
+	pdflush_operation(laptop_flush, 0);
+}
+
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(void)
+{
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+	del_timer(&laptop_mode_wb_timer);
+}
+
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive)
+ * get_writeback_state too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds before writeback cuts in.
+ *
+ * But the limit should not be set too high.  Because it also controls the
+ * amount of memory which the balance_dirty_pages() caller has to write back.
+ * If this is too large then the caller will block on the IO queue all the
+ * time.  So limit it to four megabytes - the balance_dirty_pages() caller
+ * will write six megabyte chunks, max.
+ */
+
+static void set_ratelimit(void)
+{
+	ratelimit_pages = total_pages / (num_online_cpus() * 32);
+	if (ratelimit_pages < 16)
+		ratelimit_pages = 16;
+	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
+		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
+}
+
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+{
+	set_ratelimit();
+	return 0;
+}
+
+static struct notifier_block ratelimit_nb = {
+	.notifier_call	= ratelimit_handler,
+	.next		= NULL,
+};
+
+/*
+ * If the machine has a large highmem:lowmem ratio then scale back the default
+ * dirty memory thresholds: allowing too much dirty highmem pins an excessive
+ * number of buffer_heads.
+ */
+void __init page_writeback_init(void)
+{
+	long buffer_pages = nr_free_buffer_pages();
+	long correction;
+
+	total_pages = nr_free_pagecache_pages();
+
+	correction = (100 * 4 * buffer_pages) / total_pages;
+
+	if (correction < 100) {
+		dirty_background_ratio *= correction;
+		dirty_background_ratio /= 100;
+		vm_dirty_ratio *= correction;
+		vm_dirty_ratio /= 100;
+
+		if (dirty_background_ratio <= 0)
+			dirty_background_ratio = 1;
+		if (vm_dirty_ratio <= 0)
+			vm_dirty_ratio = 1;
+	}
+	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	set_ratelimit();
+	register_cpu_notifier(&ratelimit_nb);
+}
+
+int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	if (wbc->nr_to_write <= 0)
+		return 0;
+	if (mapping->a_ops->writepages)
+		return mapping->a_ops->writepages(mapping, wbc);
+	return generic_writepages(mapping, wbc);
+}
+
+/**
+ * write_one_page - write out a single page and optionally wait on I/O
+ *
+ * @page - the page to write
+ * @wait - if true, wait on writeout
+ *
+ * The page must be locked by the caller and will be unlocked upon return.
+ *
+ * write_one_page() returns a negative error code if I/O failed.
+ */
+int write_one_page(struct page *page, int wait)
+{
+	struct address_space *mapping = page->mapping;
+	int ret = 0;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 1,
+	};
+
+	BUG_ON(!PageLocked(page));
+
+	if (wait)
+		wait_on_page_writeback(page);
+
+	if (clear_page_dirty_for_io(page)) {
+		page_cache_get(page);
+		ret = mapping->a_ops->writepage(page, &wbc);
+		if (ret == 0 && wait) {
+			wait_on_page_writeback(page);
+			if (PageError(page))
+				ret = -EIO;
+		}
+		page_cache_release(page);
+	} else {
+		unlock_page(page);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(write_one_page);
+
+/*
+ * For address_spaces which do not use buffers.  Just tag the page as dirty in
+ * its radix tree.
+ *
+ * This is also used when a single buffer is being dirtied: we want to set the
+ * page dirty in that case, but not all the buffers.  This is a "bottom-up"
+ * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ *
+ * Most callers have locked the page, which pins the address_space in memory.
+ * But zap_pte_range() does not lock the page, however in that case the
+ * mapping is pinned by the vma's ->vm_file reference.
+ *
+ * We take care to handle the case where the page was truncated from the
+ * mapping by re-checking page_mapping() insode tree_lock.
+ */
+int __set_page_dirty_nobuffers(struct page *page)
+{
+	int ret = 0;
+
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+		struct address_space *mapping2;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+			mapping2 = page_mapping(page);
+			if (mapping2) { /* Race with truncate? */
+				BUG_ON(mapping2 != mapping);
+				if (mapping_cap_account_dirty(mapping))
+					inc_page_state(nr_dirty);
+				radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			if (mapping->host) {
+				/* !PageAnon && !swapper_space */
+				__mark_inode_dirty(mapping->host,
+							I_DIRTY_PAGES);
+			}
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+/*
+ * When a writepage implementation decides that it doesn't want to write this
+ * page for some reason, it should redirty the locked page via
+ * redirty_page_for_writepage() and it should then unlock the page and return 0
+ */
+int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+{
+	wbc->pages_skipped++;
+	return __set_page_dirty_nobuffers(page);
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ */
+int fastcall set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (likely(mapping)) {
+		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+		if (spd)
+			return (*spd)(page);
+		return __set_page_dirty_buffers(page);
+	}
+	if (!PageDirty(page))
+		SetPageDirty(page);
+	return 0;
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+/*
+ * set_page_dirty() is racy if the caller has no reference against
+ * page->mapping->host, and if the page is unlocked.  This is because another
+ * CPU could truncate the page off the mapping and then free the mapping.
+ *
+ * Usually, the page _is_ locked, or the caller is a user-space process which
+ * holds a reference on the inode by having an open file.
+ *
+ * In other cases, the page should be locked before running set_page_dirty().
+ */
+int set_page_dirty_lock(struct page *page)
+{
+	int ret;
+
+	lock_page(page);
+	ret = set_page_dirty(page);
+	unlock_page(page);
+	return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_lock);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting. 
+ * Returns true if the page was previously dirty.
+ */
+int test_clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	unsigned long flags;
+
+	if (mapping) {
+		write_lock_irqsave(&mapping->tree_lock, flags);
+		if (TestClearPageDirty(page)) {
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+			write_unlock_irqrestore(&mapping->tree_lock, flags);
+			if (mapping_cap_account_dirty(mapping))
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(test_clear_page_dirty);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping) {
+		if (TestClearPageDirty(page)) {
+			if (mapping_cap_account_dirty(mapping))
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+int test_clear_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int ret;
+
+	if (mapping) {
+		unsigned long flags;
+
+		write_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestClearPageWriteback(page);
+		if (ret)
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_WRITEBACK);
+		write_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestClearPageWriteback(page);
+	}
+	return ret;
+}
+
+int test_set_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int ret;
+
+	if (mapping) {
+		unsigned long flags;
+
+		write_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestSetPageWriteback(page);
+		if (!ret)
+			radix_tree_tag_set(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_WRITEBACK);
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		write_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestSetPageWriteback(page);
+	}
+	return ret;
+
+}
+EXPORT_SYMBOL(test_set_page_writeback);
+
+/*
+ * Return true if any of the pages in the mapping are marged with the
+ * passed tag.
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	unsigned long flags;
+	int ret;
+
+	read_lock_irqsave(&mapping->tree_lock, flags);
+	ret = radix_tree_tagged(&mapping->page_tree, tag);
+	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(mapping_tagged);
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -0,0 +1,160 @@
+/*
+ *  linux/mm/page_io.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Swap reorganised 29.12.95, 
+ *  Asynchronous swapping added 30.12.95. Stephen Tweedie
+ *  Removed race in async swapping. 14.4.1996. Bruno Haible
+ *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
+ *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <asm/pgtable.h>
+
+static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index,
+				struct page *page, bio_end_io_t end_io)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, 1);
+	if (bio) {
+		struct swap_info_struct *sis;
+		swp_entry_t entry = { .val = index, };
+
+		sis = get_swap_info_struct(swp_type(entry));
+		bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
+					(PAGE_SIZE >> 9);
+		bio->bi_bdev = sis->bdev;
+		bio->bi_io_vec[0].bv_page = page;
+		bio->bi_io_vec[0].bv_len = PAGE_SIZE;
+		bio->bi_io_vec[0].bv_offset = 0;
+		bio->bi_vcnt = 1;
+		bio->bi_idx = 0;
+		bio->bi_size = PAGE_SIZE;
+		bio->bi_end_io = end_io;
+	}
+	return bio;
+}
+
+static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	if (!uptodate)
+		SetPageError(page);
+	end_page_writeback(page);
+	bio_put(bio);
+	return 0;
+}
+
+static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	if (!uptodate) {
+		SetPageError(page);
+		ClearPageUptodate(page);
+	} else {
+		SetPageUptodate(page);
+	}
+	unlock_page(page);
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * We may have stale swap cache pages in memory: notice
+ * them here and get rid of the unnecessary final write.
+ */
+int swap_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct bio *bio;
+	int ret = 0, rw = WRITE;
+
+	if (remove_exclusive_swap_page(page)) {
+		unlock_page(page);
+		goto out;
+	}
+	bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+	if (bio == NULL) {
+		set_page_dirty(page);
+		unlock_page(page);
+		ret = -ENOMEM;
+		goto out;
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		rw |= (1 << BIO_RW_SYNC);
+	inc_page_state(pswpout);
+	set_page_writeback(page);
+	unlock_page(page);
+	submit_bio(rw, bio);
+out:
+	return ret;
+}
+
+int swap_readpage(struct file *file, struct page *page)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+	ClearPageUptodate(page);
+	bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+	if (bio == NULL) {
+		unlock_page(page);
+		ret = -ENOMEM;
+		goto out;
+	}
+	inc_page_state(pswpin);
+	submit_bio(READ, bio);
+out:
+	return ret;
+}
+
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
+/*
+ * A scruffy utility function to read or write an arbitrary swap page
+ * and wait on the I/O.  The caller must have a ref on the page.
+ *
+ * We use end_swap_bio_read() even for writes, because it happens to do what
+ * we want.
+ */
+int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	lock_page(page);
+
+	bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
+	if (bio == NULL) {
+		unlock_page(page);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	wait_on_page_locked(page);
+
+	if (!PageUptodate(page) || PageError(page))
+		ret = -EIO;
+out:
+	return ret;
+}
+#endif
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -0,0 +1,228 @@
+/*
+ * mm/pdflush.c - worker threads for writing back filesystem data
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * 09Apr2002	akpm@zip.com.au
+ *		Initial version
+ * 29Feb2004	kaos@sgi.com
+ *		Move worker thread creation to kthread to avoid chewing
+ *		up stack space with nested calls to kernel_thread.
+ */
+
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>		// Needed by writeback.h
+#include <linux/writeback.h>	// Prototypes pdflush_operation()
+#include <linux/kthread.h>
+
+
+/*
+ * Minimum and maximum number of pdflush instances
+ */
+#define MIN_PDFLUSH_THREADS	2
+#define MAX_PDFLUSH_THREADS	8
+
+static void start_one_pdflush_thread(void);
+
+
+/*
+ * The pdflush threads are worker threads for writing back dirty data.
+ * Ideally, we'd like one thread per active disk spindle.  But the disk
+ * topology is very hard to divine at this level.   Instead, we take
+ * care in various places to prevent more than one pdflush thread from
+ * performing writeback against a single filesystem.  pdflush threads
+ * have the PF_FLUSHER flag set in current->flags to aid in this.
+ */
+
+/*
+ * All the pdflush threads.  Protected by pdflush_lock
+ */
+static LIST_HEAD(pdflush_list);
+static DEFINE_SPINLOCK(pdflush_lock);
+
+/*
+ * The count of currently-running pdflush threads.  Protected
+ * by pdflush_lock.
+ *
+ * Readable by sysctl, but not writable.  Published to userspace at
+ * /proc/sys/vm/nr_pdflush_threads.
+ */
+int nr_pdflush_threads = 0;
+
+/*
+ * The time at which the pdflush thread pool last went empty
+ */
+static unsigned long last_empty_jifs;
+
+/*
+ * The pdflush thread.
+ *
+ * Thread pool management algorithm:
+ * 
+ * - The minimum and maximum number of pdflush instances are bound
+ *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
+ * 
+ * - If there have been no idle pdflush instances for 1 second, create
+ *   a new one.
+ * 
+ * - If the least-recently-went-to-sleep pdflush thread has been asleep
+ *   for more than one second, terminate a thread.
+ */
+
+/*
+ * A structure for passing work to a pdflush thread.  Also for passing
+ * state information between pdflush threads.  Protected by pdflush_lock.
+ */
+struct pdflush_work {
+	struct task_struct *who;	/* The thread */
+	void (*fn)(unsigned long);	/* A callback function */
+	unsigned long arg0;		/* An argument to the callback */
+	struct list_head list;		/* On pdflush_list, when idle */
+	unsigned long when_i_went_to_sleep;
+};
+
+static int __pdflush(struct pdflush_work *my_work)
+{
+	current->flags |= PF_FLUSHER;
+	my_work->fn = NULL;
+	my_work->who = current;
+	INIT_LIST_HEAD(&my_work->list);
+
+	spin_lock_irq(&pdflush_lock);
+	nr_pdflush_threads++;
+	for ( ; ; ) {
+		struct pdflush_work *pdf;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		list_move(&my_work->list, &pdflush_list);
+		my_work->when_i_went_to_sleep = jiffies;
+		spin_unlock_irq(&pdflush_lock);
+
+		schedule();
+		if (try_to_freeze(PF_FREEZE)) {
+			spin_lock_irq(&pdflush_lock);
+			continue;
+		}
+
+		spin_lock_irq(&pdflush_lock);
+		if (!list_empty(&my_work->list)) {
+			printk("pdflush: bogus wakeup!\n");
+			my_work->fn = NULL;
+			continue;
+		}
+		if (my_work->fn == NULL) {
+			printk("pdflush: NULL work function\n");
+			continue;
+		}
+		spin_unlock_irq(&pdflush_lock);
+
+		(*my_work->fn)(my_work->arg0);
+
+		/*
+		 * Thread creation: For how long have there been zero
+		 * available threads?
+		 */
+		if (jiffies - last_empty_jifs > 1 * HZ) {
+			/* unlocked list_empty() test is OK here */
+			if (list_empty(&pdflush_list)) {
+				/* unlocked test is OK here */
+				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
+					start_one_pdflush_thread();
+			}
+		}
+
+		spin_lock_irq(&pdflush_lock);
+		my_work->fn = NULL;
+
+		/*
+		 * Thread destruction: For how long has the sleepiest
+		 * thread slept?
+		 */
+		if (list_empty(&pdflush_list))
+			continue;
+		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
+			continue;
+		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
+		if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
+			/* Limit exit rate */
+			pdf->when_i_went_to_sleep = jiffies;
+			break;					/* exeunt */
+		}
+	}
+	nr_pdflush_threads--;
+	spin_unlock_irq(&pdflush_lock);
+	return 0;
+}
+
+/*
+ * Of course, my_work wants to be just a local in __pdflush().  It is
+ * separated out in this manner to hopefully prevent the compiler from
+ * performing unfortunate optimisations against the auto variables.  Because
+ * these are visible to other tasks and CPUs.  (No problem has actually
+ * been observed.  This is just paranoia).
+ */
+static int pdflush(void *dummy)
+{
+	struct pdflush_work my_work;
+
+	/*
+	 * pdflush can spend a lot of time doing encryption via dm-crypt.  We
+	 * don't want to do that at keventd's priority.
+	 */
+	set_user_nice(current, 0);
+	return __pdflush(&my_work);
+}
+
+/*
+ * Attempt to wake up a pdflush thread, and get it to do some work for you.
+ * Returns zero if it indeed managed to find a worker thread, and passed your
+ * payload to it.
+ */
+int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (fn == NULL)
+		BUG();		/* Hard to diagnose if it's deferred */
+
+	spin_lock_irqsave(&pdflush_lock, flags);
+	if (list_empty(&pdflush_list)) {
+		spin_unlock_irqrestore(&pdflush_lock, flags);
+		ret = -1;
+	} else {
+		struct pdflush_work *pdf;
+
+		pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
+		list_del_init(&pdf->list);
+		if (list_empty(&pdflush_list))
+			last_empty_jifs = jiffies;
+		pdf->fn = fn;
+		pdf->arg0 = arg0;
+		wake_up_process(pdf->who);
+		spin_unlock_irqrestore(&pdflush_lock, flags);
+	}
+	return ret;
+}
+
+static void start_one_pdflush_thread(void)
+{
+	kthread_run(pdflush, NULL, "pdflush");
+}
+
+static int __init pdflush_init(void)
+{
+	int i;
+
+	for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
+		start_one_pdflush_thread();
+	return 0;
+}
+
+module_init(pdflush_init);
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -0,0 +1,207 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
+ *
+ * This file is released under the GPL v2.
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004	Initial version
+ */
+
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * See lib/prio_tree.c for details on the general radix priority search tree
+ * code.
+ */
+
+/*
+ * The following #defines are mirrored from lib/prio_tree.c. They're only used
+ * for debugging, and should be removed (along with the debugging code using
+ * them) when switching also VMAs to the regular prio_tree code.
+ */
+
+#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
+#define VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define HEAP_INDEX(vma)   ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+/*
+ * Radix priority search tree for address_space->i_mmap
+ *
+ * For each vma that map a unique set of file pages i.e., unique [radix_index,
+ * heap_index] value, we have a corresponing priority search tree node. If
+ * multiple vmas have identical [radix_index, heap_index] value, then one of
+ * them is used as a tree node and others are stored in a vm_set list. The tree
+ * node points to the first vma (head) of the list using vm_set.head.
+ *
+ * prio_tree_root
+ *      |
+ *      A       vm_set.head
+ *     / \      /
+ *    L   R -> H-I-J-K-M-N-O-P-Q-S
+ *    ^   ^    <-- vm_set.list -->
+ *  tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.vm_set.parent is used for identifying
+ * tree nodes and list head nodes.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.vm_set.parent != NULL    ==> a tree node
+ *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
+ *      vma->shared.vm_set.head == NULL ==> no others map the same range
+ *
+ * vma->shared.vm_set.parent == NULL
+ * 	vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
+ * 	vma->shared.vm_set.head == NULL ==> a list node
+ */
+
+/*
+ * Add a new vma known to map the same set of pages as the old vma:
+ * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
+ * Note that it just happens to work correctly on i_mmap_nonlinear too.
+ */
+void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
+{
+	/* Leave these BUG_ONs till prio_tree patch stabilizes */
+	BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+	BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+	vma->shared.vm_set.head = NULL;
+	vma->shared.vm_set.parent = NULL;
+
+	if (!old->shared.vm_set.parent)
+		list_add(&vma->shared.vm_set.list,
+				&old->shared.vm_set.list);
+	else if (old->shared.vm_set.head)
+		list_add_tail(&vma->shared.vm_set.list,
+				&old->shared.vm_set.head->shared.vm_set.list);
+	else {
+		INIT_LIST_HEAD(&vma->shared.vm_set.list);
+		vma->shared.vm_set.head = old;
+		old->shared.vm_set.head = vma;
+	}
+}
+
+void vma_prio_tree_insert(struct vm_area_struct *vma,
+			  struct prio_tree_root *root)
+{
+	struct prio_tree_node *ptr;
+	struct vm_area_struct *old;
+
+	vma->shared.vm_set.head = NULL;
+
+	ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
+	if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
+		old = prio_tree_entry(ptr, struct vm_area_struct,
+					shared.prio_tree_node);
+		vma_prio_tree_add(vma, old);
+	}
+}
+
+void vma_prio_tree_remove(struct vm_area_struct *vma,
+			  struct prio_tree_root *root)
+{
+	struct vm_area_struct *node, *head, *new_head;
+
+	if (!vma->shared.vm_set.head) {
+		if (!vma->shared.vm_set.parent)
+			list_del_init(&vma->shared.vm_set.list);
+		else
+			raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
+	} else {
+		/* Leave this BUG_ON till prio_tree patch stabilizes */
+		BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
+		if (vma->shared.vm_set.parent) {
+			head = vma->shared.vm_set.head;
+			if (!list_empty(&head->shared.vm_set.list)) {
+				new_head = list_entry(
+					head->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+				list_del_init(&head->shared.vm_set.list);
+			} else
+				new_head = NULL;
+
+			raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
+					&head->shared.prio_tree_node);
+			head->shared.vm_set.head = new_head;
+			if (new_head)
+				new_head->shared.vm_set.head = head;
+
+		} else {
+			node = vma->shared.vm_set.head;
+			if (!list_empty(&vma->shared.vm_set.list)) {
+				new_head = list_entry(
+					vma->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+				list_del_init(&vma->shared.vm_set.list);
+				node->shared.vm_set.head = new_head;
+				new_head->shared.vm_set.head = node;
+			} else
+				node->shared.vm_set.head = NULL;
+		}
+	}
+}
+
+/*
+ * Helper function to enumerate vmas that map a given file page or a set of
+ * contiguous file pages. The function returns vmas that at least map a single
+ * page in the given range of contiguous file pages.
+ */
+struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+					struct prio_tree_iter *iter)
+{
+	struct prio_tree_node *ptr;
+	struct vm_area_struct *next;
+
+	if (!vma) {
+		/*
+		 * First call is with NULL vma
+		 */
+		ptr = prio_tree_next(iter);
+		if (ptr) {
+			next = prio_tree_entry(ptr, struct vm_area_struct,
+						shared.prio_tree_node);
+			prefetch(next->shared.vm_set.head);
+			return next;
+		} else
+			return NULL;
+	}
+
+	if (vma->shared.vm_set.parent) {
+		if (vma->shared.vm_set.head) {
+			next = vma->shared.vm_set.head;
+			prefetch(next->shared.vm_set.list.next);
+			return next;
+		}
+	} else {
+		next = list_entry(vma->shared.vm_set.list.next,
+				struct vm_area_struct, shared.vm_set.list);
+		if (!next->shared.vm_set.head) {
+			prefetch(next->shared.vm_set.list.next);
+			return next;
+		}
+	}
+
+	ptr = prio_tree_next(iter);
+	if (ptr) {
+		next = prio_tree_entry(ptr, struct vm_area_struct,
+					shared.prio_tree_node);
+		prefetch(next->shared.vm_set.head);
+		return next;
+	} else
+		return NULL;
+}
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -0,0 +1,557 @@
+/*
+ * mm/readahead.c - address_space-level file readahead.
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 09Apr2002	akpm@zip.com.au
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+
+void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
+struct backing_dev_info default_backing_dev_info = {
+	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
+	.state		= 0,
+	.capabilities	= BDI_CAP_MAP_COPY,
+	.unplug_io_fn	= default_unplug_io_fn,
+};
+EXPORT_SYMBOL_GPL(default_backing_dev_info);
+
+/*
+ * Initialise a struct file's readahead state.  Assumes that the caller has
+ * memset *ra to zero.
+ */
+void
+file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
+{
+	ra->ra_pages = mapping->backing_dev_info->ra_pages;
+	ra->prev_page = -1;
+}
+
+/*
+ * Return max readahead size for this inode in number-of-pages.
+ */
+static inline unsigned long get_max_readahead(struct file_ra_state *ra)
+{
+	return ra->ra_pages;
+}
+
+static inline unsigned long get_min_readahead(struct file_ra_state *ra)
+{
+	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+}
+
+static inline void ra_off(struct file_ra_state *ra)
+{
+	ra->start = 0;
+	ra->flags = 0;
+	ra->size = 0;
+	ra->ahead_start = 0;
+	ra->ahead_size = 0;
+	return;
+}
+
+/*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long newsize = roundup_pow_of_two(size);
+
+	if (newsize <= max / 64)
+		newsize = newsize * newsize;
+	else if (newsize <= max / 4)
+		newsize = max / 4;
+	else
+		newsize = max;
+	return newsize;
+}
+
+/*
+ * Set the new window size, this is called only when I/O is to be submitted,
+ * not for each call to readahead.  If a cache miss occured, reduce next I/O
+ * size, else increase depending on how close to max we are.
+ */
+static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
+{
+	unsigned long max = get_max_readahead(ra);
+	unsigned long min = get_min_readahead(ra);
+	unsigned long cur = ra->size;
+	unsigned long newsize;
+
+	if (ra->flags & RA_FLAG_MISS) {
+		ra->flags &= ~RA_FLAG_MISS;
+		newsize = max((cur - 2), min);
+	} else if (cur < max / 16) {
+		newsize = 4 * cur;
+	} else {
+		newsize = 2 * cur;
+	}
+	return min(newsize, max);
+}
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+
+/**
+ * read_cache_pages - populate an address space with some pages, and
+ * 			start reads against them.
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages.  These
+ *   pages have their ->index populated and are otherwise uninitialised.
+ * @filler: callback routine for filling a single page.
+ * @data: private data for the callback routine.
+ *
+ * Hides the details of the LRU cache etc from the filesystems.
+ */
+int read_cache_pages(struct address_space *mapping, struct list_head *pages,
+			int (*filler)(void *, struct page *), void *data)
+{
+	struct page *page;
+	struct pagevec lru_pvec;
+	int ret = 0;
+
+	pagevec_init(&lru_pvec, 0);
+
+	while (!list_empty(pages)) {
+		page = list_to_page(pages);
+		list_del(&page->lru);
+		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+			page_cache_release(page);
+			continue;
+		}
+		ret = filler(data, page);
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+		if (ret) {
+			while (!list_empty(pages)) {
+				struct page *victim;
+
+				victim = list_to_page(pages);
+				list_del(&victim->lru);
+				page_cache_release(victim);
+			}
+			break;
+		}
+	}
+	pagevec_lru_add(&lru_pvec);
+	return ret;
+}
+
+EXPORT_SYMBOL(read_cache_pages);
+
+static int read_pages(struct address_space *mapping, struct file *filp,
+		struct list_head *pages, unsigned nr_pages)
+{
+	unsigned page_idx;
+	struct pagevec lru_pvec;
+	int ret = 0;
+
+	if (mapping->a_ops->readpages) {
+		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+		goto out;
+	}
+
+	pagevec_init(&lru_pvec, 0);
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_to_page(pages);
+		list_del(&page->lru);
+		if (!add_to_page_cache(page, mapping,
+					page->index, GFP_KERNEL)) {
+			mapping->a_ops->readpage(filp, page);
+			if (!pagevec_add(&lru_pvec, page))
+				__pagevec_lru_add(&lru_pvec);
+		} else {
+			page_cache_release(page);
+		}
+	}
+	pagevec_lru_add(&lru_pvec);
+out:
+	return ret;
+}
+
+/*
+ * Readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ * start:	Page index at which we started the readahead
+ * size:	Number of pages in that read
+ *              Together, these form the "current window".
+ *              Together, start and size represent the `readahead window'.
+ * prev_page:   The page which the readahead algorithm most-recently inspected.
+ *              It is mainly used to detect sequential file reading.
+ *              If page_cache_readahead sees that it is again being called for
+ *              a page which it just looked at, it can return immediately without
+ *              making any state changes.
+ * ahead_start,
+ * ahead_size:  Together, these form the "ahead window".
+ * ra_pages:	The externally controlled max readahead for this fd.
+ *
+ * When readahead is in the off state (size == 0), readahead is disabled.
+ * In this state, prev_page is used to detect the resumption of sequential I/O.
+ *
+ * The readahead code manages two windows - the "current" and the "ahead"
+ * windows.  The intent is that while the application is walking the pages
+ * in the current window, I/O is underway on the ahead window.  When the
+ * current window is fully traversed, it is replaced by the ahead window
+ * and the ahead window is invalidated.  When this copying happens, the
+ * new current window's pages are probably still locked.  So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
+ *
+ * So:
+ *
+ *   ----|----------------|----------------|-----
+ *       ^start           ^start+size
+ *                        ^ahead_start     ^ahead_start+ahead_size
+ *
+ *         ^ When this page is read, we submit I/O for the
+ *           ahead window.
+ *
+ * A `readahead hit' occurs when a read request is made against a page which is
+ * the next sequential page. Ahead window calculations are done only when it
+ * is time to submit a new IO.  The code ramps up the size agressively at first,
+ * but slow down as it approaches max_readhead.
+ *
+ * Any seek/ramdom IO will result in readahead being turned off.  It will resume
+ * at the first sequential access.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead.  It is called only once for the entire I/O
+ * regardless of size unless readahead is unable to start enough I/O to satisfy
+ * the request (I/O request > max_readahead).
+ */
+
+/*
+ * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
+ * the pages first, then submits them all for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ *
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ *
+ * do_page_cache_readahead() returns -1 if it encountered request queue
+ * congestion.
+ */
+static int
+__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	unsigned long end_index;	/* The last page we want to read */
+	LIST_HEAD(page_pool);
+	int page_idx;
+	int ret = 0;
+	loff_t isize = i_size_read(inode);
+
+	if (isize == 0)
+		goto out;
+
+ 	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	/*
+	 * Preallocate as many pages as we will need.
+	 */
+	read_lock_irq(&mapping->tree_lock);
+	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
+		unsigned long page_offset = offset + page_idx;
+		
+		if (page_offset > end_index)
+			break;
+
+		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		if (page)
+			continue;
+
+		read_unlock_irq(&mapping->tree_lock);
+		page = page_cache_alloc_cold(mapping);
+		read_lock_irq(&mapping->tree_lock);
+		if (!page)
+			break;
+		page->index = page_offset;
+		list_add(&page->lru, &page_pool);
+		ret++;
+	}
+	read_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * Now start the IO.  We ignore I/O errors - if the page is not
+	 * uptodate then the caller will launch readpage again, and
+	 * will then handle the error.
+	 */
+	if (ret)
+		read_pages(mapping, filp, &page_pool, ret);
+	BUG_ON(!list_empty(&page_pool));
+out:
+	return ret;
+}
+
+/*
+ * Chunk the readahead into 2 megabyte units, so that we don't pin too much
+ * memory at once.
+ */
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+		unsigned long offset, unsigned long nr_to_read)
+{
+	int ret = 0;
+
+	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
+		return -EINVAL;
+
+	while (nr_to_read) {
+		int err;
+
+		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+
+		if (this_chunk > nr_to_read)
+			this_chunk = nr_to_read;
+		err = __do_page_cache_readahead(mapping, filp,
+						offset, this_chunk);
+		if (err < 0) {
+			ret = err;
+			break;
+		}
+		ret += err;
+		offset += this_chunk;
+		nr_to_read -= this_chunk;
+	}
+	return ret;
+}
+
+/*
+ * Check how effective readahead is being.  If the amount of started IO is
+ * less than expected then the file is partly or fully in pagecache and
+ * readahead isn't helping.
+ *
+ */
+static inline int check_ra_success(struct file_ra_state *ra,
+			unsigned long nr_to_read, unsigned long actual)
+{
+	if (actual == 0) {
+		ra->cache_hit += nr_to_read;
+		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
+			ra_off(ra);
+			ra->flags |= RA_FLAG_INCACHE;
+			return 0;
+		}
+	} else {
+		ra->cache_hit=0;
+	}
+	return 1;
+}
+
+/*
+ * This version skips the IO if the queue is read-congested, and will tell the
+ * block layer to abandon the readahead if request allocation would block.
+ *
+ * force_page_cache_readahead() will ignore queue congestion and will block on
+ * request queues.
+ */
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read)
+{
+	if (bdi_read_congested(mapping->backing_dev_info))
+		return -1;
+
+	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+}
+
+/*
+ * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
+ * is set wait till the read completes.  Otherwise attempt to read without
+ * blocking.
+ * Returns 1 meaning 'success' if read is succesfull without switching off
+ * readhaead mode. Otherwise return failure.
+ */
+static int
+blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read,
+			struct file_ra_state *ra, int block)
+{
+	int actual;
+
+	if (!block && bdi_read_congested(mapping->backing_dev_info))
+		return 0;
+
+	actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+
+	return check_ra_success(ra, nr_to_read, actual);
+}
+
+static int make_ahead_window(struct address_space *mapping, struct file *filp,
+				struct file_ra_state *ra, int force)
+{
+	int block, ret;
+
+	ra->ahead_size = get_next_ra_size(ra);
+	ra->ahead_start = ra->start + ra->size;
+
+	block = force || (ra->prev_page >= ra->ahead_start);
+	ret = blockable_page_cache_readahead(mapping, filp,
+			ra->ahead_start, ra->ahead_size, ra, block);
+
+	if (!ret && !force) {
+		/* A read failure in blocking mode, implies pages are
+		 * all cached. So we can safely assume we have taken
+		 * care of all the pages requested in this call.
+		 * A read failure in non-blocking mode, implies we are
+		 * reading more pages than requested in this call.  So
+		 * we safely assume we have taken care of all the pages
+		 * requested in this call.
+		 *
+		 * Just reset the ahead window in case we failed due to
+		 * congestion.  The ahead window will any way be closed
+		 * in case we failed due to excessive page cache hits.
+		 */
+		ra->ahead_start = 0;
+		ra->ahead_size = 0;
+	}
+
+	return ret;
+}
+
+/*
+ * page_cache_readahead is the main function.  If performs the adaptive
+ * readahead window size management and submits the readahead I/O.
+ */
+unsigned long
+page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
+		     struct file *filp, unsigned long offset,
+		     unsigned long req_size)
+{
+	unsigned long max, newsize;
+	int sequential;
+
+	/*
+	 * We avoid doing extra work and bogusly perturbing the readahead
+	 * window expansion logic.
+	 */
+	if (offset == ra->prev_page && --req_size)
+		++offset;
+
+	/* Note that prev_page == -1 if it is a first read */
+	sequential = (offset == ra->prev_page + 1);
+	ra->prev_page = offset;
+
+	max = get_max_readahead(ra);
+	newsize = min(req_size, max);
+
+	/* No readahead or sub-page sized read or file already in cache */
+	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
+		goto out;
+
+	ra->prev_page += newsize - 1;
+
+	/*
+	 * Special case - first read at start of file. We'll assume it's
+	 * a whole-file read and grow the window fast.  Or detect first
+	 * sequential access
+	 */
+	if (sequential && ra->size == 0) {
+		ra->size = get_init_ra_size(newsize, max);
+		ra->start = offset;
+		if (!blockable_page_cache_readahead(mapping, filp, offset,
+							 ra->size, ra, 1))
+			goto out;
+
+		/*
+		 * If the request size is larger than our max readahead, we
+		 * at least want to be sure that we get 2 IOs in flight and
+		 * we know that we will definitly need the new I/O.
+		 * once we do this, subsequent calls should be able to overlap
+		 * IOs,* thus preventing stalls. so issue the ahead window
+		 * immediately.
+		 */
+		if (req_size >= max)
+			make_ahead_window(mapping, filp, ra, 1);
+
+		goto out;
+	}
+
+	/*
+	 * Now handle the random case:
+	 * partial page reads and first access were handled above,
+	 * so this must be the next page otherwise it is random
+	 */
+	if (!sequential) {
+		ra_off(ra);
+		blockable_page_cache_readahead(mapping, filp, offset,
+				 newsize, ra, 1);
+		goto out;
+	}
+
+	/*
+	 * If we get here we are doing sequential IO and this was not the first
+	 * occurence (ie we have an existing window)
+	 */
+
+	if (ra->ahead_start == 0) {	 /* no ahead window yet */
+		if (!make_ahead_window(mapping, filp, ra, 0))
+			goto out;
+	}
+	/*
+	 * Already have an ahead window, check if we crossed into it.
+	 * If so, shift windows and issue a new ahead window.
+	 * Only return the #pages that are in the current window, so that
+	 * we get called back on the first page of the ahead window which
+	 * will allow us to submit more IO.
+	 */
+	if (ra->prev_page >= ra->ahead_start) {
+		ra->start = ra->ahead_start;
+		ra->size = ra->ahead_size;
+		make_ahead_window(mapping, filp, ra, 0);
+	}
+
+out:
+	return ra->prev_page + 1;
+}
+
+/*
+ * handle_ra_miss() is called when it is known that a page which should have
+ * been present in the pagecache (we just did some readahead there) was in fact
+ * not found.  This will happen if it was evicted by the VM (readahead
+ * thrashing)
+ *
+ * Turn on the cache miss flag in the RA struct, this will cause the RA code
+ * to reduce the RA size on the next read.
+ */
+void handle_ra_miss(struct address_space *mapping,
+		struct file_ra_state *ra, pgoff_t offset)
+{
+	ra->flags |= RA_FLAG_MISS;
+	ra->flags &= ~RA_FLAG_INCACHE;
+}
+
+/*
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
+ */
+unsigned long max_sane_readahead(unsigned long nr)
+{
+	unsigned long active;
+	unsigned long inactive;
+	unsigned long free;
+
+	__get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
+	return min(nr, (inactive + free) / 2);
+}
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -0,0 +1,862 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
+ * Released under the General Public License (GPL).
+ *
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
+ *
+ * Provides methods for unmapping each kind of mapped page:
+ * the anon methods track anonymous pages, and
+ * the file methods track pages belonging to an inode.
+ *
+ * Original design by Rik van Riel <riel@conectiva.com.br> 2001
+ * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
+ * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
+ * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ */
+
+/*
+ * Lock ordering in mm:
+ *
+ * inode->i_sem	(while writing or truncating, not reading or faulting)
+ *   inode->i_alloc_sem
+ *
+ * When a page fault occurs in writing from user to file, down_read
+ * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
+ * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
+ * taken together; in truncation, i_sem is taken outermost.
+ *
+ * mm->mmap_sem
+ *   page->flags PG_locked (lock_page)
+ *     mapping->i_mmap_lock
+ *       anon_vma->lock
+ *         mm->page_table_lock
+ *           zone->lru_lock (in mark_page_accessed)
+ *           swap_list_lock (in swap_free etc's swap_info_get)
+ *             mmlist_lock (in mmput, drain_mmlist and others)
+ *             swap_device_lock (in swap_duplicate, swap_info_get)
+ *             mapping->private_lock (in __set_page_dirty_buffers)
+ *             inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *               sb_lock (within inode_lock in fs/fs-writeback.c)
+ *               mapping->tree_lock (widely used, in set_page_dirty,
+ *                         in arch-dependent flush_dcache_mmap_lock,
+ *                         within inode_lock in __sync_single_inode)
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/rcupdate.h>
+
+#include <asm/tlbflush.h>
+
+//#define RMAP_DEBUG /* can be enabled only for debugging */
+
+kmem_cache_t *anon_vma_cachep;
+
+static inline void validate_anon_vma(struct vm_area_struct *find_vma)
+{
+#ifdef RMAP_DEBUG
+	struct anon_vma *anon_vma = find_vma->anon_vma;
+	struct vm_area_struct *vma;
+	unsigned int mapcount = 0;
+	int found = 0;
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		mapcount++;
+		BUG_ON(mapcount > 100000);
+		if (vma == find_vma)
+			found = 1;
+	}
+	BUG_ON(!found);
+#endif
+}
+
+/* This must be called under the mmap_sem. */
+int anon_vma_prepare(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	might_sleep();
+	if (unlikely(!anon_vma)) {
+		struct mm_struct *mm = vma->vm_mm;
+		struct anon_vma *allocated, *locked;
+
+		anon_vma = find_mergeable_anon_vma(vma);
+		if (anon_vma) {
+			allocated = NULL;
+			locked = anon_vma;
+			spin_lock(&locked->lock);
+		} else {
+			anon_vma = anon_vma_alloc();
+			if (unlikely(!anon_vma))
+				return -ENOMEM;
+			allocated = anon_vma;
+			locked = NULL;
+		}
+
+		/* page_table_lock to protect against threads */
+		spin_lock(&mm->page_table_lock);
+		if (likely(!vma->anon_vma)) {
+			vma->anon_vma = anon_vma;
+			list_add(&vma->anon_vma_node, &anon_vma->head);
+			allocated = NULL;
+		}
+		spin_unlock(&mm->page_table_lock);
+
+		if (locked)
+			spin_unlock(&locked->lock);
+		if (unlikely(allocated))
+			anon_vma_free(allocated);
+	}
+	return 0;
+}
+
+void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+{
+	BUG_ON(vma->anon_vma != next->anon_vma);
+	list_del(&next->anon_vma_node);
+}
+
+void __anon_vma_link(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	if (anon_vma) {
+		list_add(&vma->anon_vma_node, &anon_vma->head);
+		validate_anon_vma(vma);
+	}
+}
+
+void anon_vma_link(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	if (anon_vma) {
+		spin_lock(&anon_vma->lock);
+		list_add(&vma->anon_vma_node, &anon_vma->head);
+		validate_anon_vma(vma);
+		spin_unlock(&anon_vma->lock);
+	}
+}
+
+void anon_vma_unlink(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int empty;
+
+	if (!anon_vma)
+		return;
+
+	spin_lock(&anon_vma->lock);
+	validate_anon_vma(vma);
+	list_del(&vma->anon_vma_node);
+
+	/* We must garbage collect the anon_vma if it's empty */
+	empty = list_empty(&anon_vma->head);
+	spin_unlock(&anon_vma->lock);
+
+	if (empty)
+		anon_vma_free(anon_vma);
+}
+
+static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+						SLAB_CTOR_CONSTRUCTOR) {
+		struct anon_vma *anon_vma = data;
+
+		spin_lock_init(&anon_vma->lock);
+		INIT_LIST_HEAD(&anon_vma->head);
+	}
+}
+
+void __init anon_vma_init(void)
+{
+	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ */
+static struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+	struct anon_vma *anon_vma = NULL;
+	unsigned long anon_mapping;
+
+	rcu_read_lock();
+	anon_mapping = (unsigned long) page->mapping;
+	if (!(anon_mapping & PAGE_MAPPING_ANON))
+		goto out;
+	if (!page_mapped(page))
+		goto out;
+
+	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+out:
+	rcu_read_unlock();
+	return anon_vma;
+}
+
+/*
+ * At what user virtual address is page expected in vma?
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long address;
+
+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		/* page should be within any vma from prio_tree_next */
+		BUG_ON(!PageAnon(page));
+		return -EFAULT;
+	}
+	return address;
+}
+
+/*
+ * At what user virtual address is page expected in vma? checking that the
+ * page matches the vma: currently only used by unuse_process, on anon pages.
+ */
+unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	if (PageAnon(page)) {
+		if ((void *)vma->anon_vma !=
+		    (void *)page->mapping - PAGE_MAPPING_ANON)
+			return -EFAULT;
+	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+		if (vma->vm_file->f_mapping != page->mapping)
+			return -EFAULT;
+	} else
+		return -EFAULT;
+	return vma_address(page, vma);
+}
+
+/*
+ * Subfunctions of page_referenced: page_referenced_one called
+ * repeatedly from either page_referenced_anon or page_referenced_file.
+ */
+static int page_referenced_one(struct page *page,
+	struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int referenced = 0;
+
+	if (!get_mm_counter(mm, rss))
+		goto out;
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	spin_lock(&mm->page_table_lock);
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out_unlock;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out_unlock;
+
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap;
+
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap;
+
+	if (ptep_clear_flush_young(vma, address, pte))
+		referenced++;
+
+	if (mm != current->mm && !ignore_token && has_swap_token(mm))
+		referenced++;
+
+	(*mapcount)--;
+
+out_unmap:
+	pte_unmap(pte);
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+out:
+	return referenced;
+}
+
+static int page_referenced_anon(struct page *page, int ignore_token)
+{
+	unsigned int mapcount;
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	int referenced = 0;
+
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
+		return referenced;
+
+	mapcount = page_mapcount(page);
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		referenced += page_referenced_one(page, vma, &mapcount,
+							ignore_token);
+		if (!mapcount)
+			break;
+	}
+	spin_unlock(&anon_vma->lock);
+	return referenced;
+}
+
+/**
+ * page_referenced_file - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag.  This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds.  It returns the number
+ * of references it found.
+ *
+ * This function is only called from page_referenced for object-based pages.
+ */
+static int page_referenced_file(struct page *page, int ignore_token)
+{
+	unsigned int mapcount;
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int referenced = 0;
+
+	/*
+	 * The caller's checks on page->mapping and !PageAnon have made
+	 * sure that this is a file page: the check for page->mapping
+	 * excludes the case just before it gets set on an anon page.
+	 */
+	BUG_ON(PageAnon(page));
+
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_lock.
+	 */
+	BUG_ON(!PageLocked(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	/*
+	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * is more likely to be accurate if we note it after spinning.
+	 */
+	mapcount = page_mapcount(page);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
+				  == (VM_LOCKED|VM_MAYSHARE)) {
+			referenced++;
+			break;
+		}
+		referenced += page_referenced_one(page, vma, &mapcount,
+							ignore_token);
+		if (!mapcount)
+			break;
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return referenced;
+}
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ * @is_locked: caller holds lock on the page
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of ptes which referenced the page.
+ */
+int page_referenced(struct page *page, int is_locked, int ignore_token)
+{
+	int referenced = 0;
+
+	if (!swap_token_default_timeout)
+		ignore_token = 1;
+
+	if (page_test_and_clear_young(page))
+		referenced++;
+
+	if (TestClearPageReferenced(page))
+		referenced++;
+
+	if (page_mapped(page) && page->mapping) {
+		if (PageAnon(page))
+			referenced += page_referenced_anon(page, ignore_token);
+		else if (is_locked)
+			referenced += page_referenced_file(page, ignore_token);
+		else if (TestSetPageLocked(page))
+			referenced++;
+		else {
+			if (page->mapping)
+				referenced += page_referenced_file(page,
+								ignore_token);
+			unlock_page(page);
+		}
+	}
+	return referenced;
+}
+
+/**
+ * page_add_anon_rmap - add pte mapping to an anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	pgoff_t index;
+
+	BUG_ON(PageReserved(page));
+	BUG_ON(!anon_vma);
+
+	inc_mm_counter(vma->vm_mm, anon_rss);
+
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	index = (address - vma->vm_start) >> PAGE_SHIFT;
+	index += vma->vm_pgoff;
+	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+
+	if (atomic_inc_and_test(&page->_mapcount)) {
+		page->index = index;
+		page->mapping = (struct address_space *) anon_vma;
+		inc_page_state(nr_mapped);
+	}
+	/* else checking page index and mapping is racy */
+}
+
+/**
+ * page_add_file_rmap - add pte mapping to a file page
+ * @page: the page to add the mapping to
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_file_rmap(struct page *page)
+{
+	BUG_ON(PageAnon(page));
+	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
+		return;
+
+	if (atomic_inc_and_test(&page->_mapcount))
+		inc_page_state(nr_mapped);
+}
+
+/**
+ * page_remove_rmap - take down pte mapping from a page
+ * @page: page to remove mapping from
+ *
+ * Caller needs to hold the mm->page_table_lock.
+ */
+void page_remove_rmap(struct page *page)
+{
+	BUG_ON(PageReserved(page));
+
+	if (atomic_add_negative(-1, &page->_mapcount)) {
+		BUG_ON(page_mapcount(page) < 0);
+		/*
+		 * It would be tidy to reset the PageAnon mapping here,
+		 * but that might overwrite a racing page_add_anon_rmap
+		 * which increments mapcount after us but sets mapping
+		 * before us: so leave the reset to free_hot_cold_page,
+		 * and remember that it's only reliable while mapped.
+		 * Leaving it set also helps swapoff to reinstate ptes
+		 * faster for those pages still in swapcache.
+		 */
+		if (page_test_and_clear_dirty(page))
+			set_page_dirty(page);
+		dec_page_state(nr_mapped);
+	}
+}
+
+/*
+ * Subfunctions of try_to_unmap: try_to_unmap_one called
+ * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
+ */
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t pteval;
+	int ret = SWAP_AGAIN;
+
+	if (!get_mm_counter(mm, rss))
+		goto out;
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	spin_lock(&mm->page_table_lock);
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out_unlock;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out_unlock;
+
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap;
+
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap;
+
+	/*
+	 * If the page is mlock()d, we cannot swap it out.
+	 * If it's recently referenced (perhaps page_referenced
+	 * skipped over this mm) then we should reactivate it.
+	 */
+	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
+			ptep_clear_flush_young(vma, address, pte)) {
+		ret = SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	/*
+	 * Don't pull an anonymous page out from under get_user_pages.
+	 * GUP carefully breaks COW and raises page count (while holding
+	 * page_table_lock, as we have here) to make sure that the page
+	 * cannot be freed.  If we unmap that page here, a user write
+	 * access to the virtual address will bring back the page, but
+	 * its raised count will (ironically) be taken to mean it's not
+	 * an exclusive swap page, do_wp_page will replace it by a copy
+	 * page, and the user never get to see the data GUP was holding
+	 * the original page for.
+	 *
+	 * This test is also useful for when swapoff (unuse_process) has
+	 * to drop page lock: its reference to the page stops existing
+	 * ptes from being unmapped, so swapoff can make progress.
+	 */
+	if (PageSwapCache(page) &&
+	    page_count(page) != page_mapcount(page) + 2) {
+		ret = SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	/* Nuke the page table entry. */
+	flush_cache_page(vma, address, page_to_pfn(page));
+	pteval = ptep_clear_flush(vma, address, pte);
+
+	/* Move the dirty bit to the physical page now the pte is gone. */
+	if (pte_dirty(pteval))
+		set_page_dirty(page);
+
+	if (PageAnon(page)) {
+		swp_entry_t entry = { .val = page->private };
+		/*
+		 * Store the swap location in the pte.
+		 * See handle_pte_fault() ...
+		 */
+		BUG_ON(!PageSwapCache(page));
+		swap_duplicate(entry);
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+		BUG_ON(pte_file(*pte));
+		dec_mm_counter(mm, anon_rss);
+	}
+
+	inc_mm_counter(mm, rss);
+	page_remove_rmap(page);
+	page_cache_release(page);
+
+out_unmap:
+	pte_unmap(pte);
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+out:
+	return ret;
+}
+
+/*
+ * objrmap doesn't work for nonlinear VMAs because the assumption that
+ * offset-into-file correlates with offset-into-virtual-addresses does not hold.
+ * Consequently, given a particular page and its ->index, we cannot locate the
+ * ptes which are mapping that page without an exhaustive linear search.
+ *
+ * So what this code does is a mini "virtual scan" of each nonlinear VMA which
+ * maps the file to which the target page belongs.  The ->vm_private_data field
+ * holds the current cursor into that scan.  Successive searches will circulate
+ * around the vma's virtual address space.
+ *
+ * So as more replacement pressure is applied to the pages in a nonlinear VMA,
+ * more scanning pressure is placed against them as well.   Eventually pages
+ * will become fully unmapped and are eligible for eviction.
+ *
+ * For very sparsely populated VMAs this is a little inefficient - chances are
+ * there there won't be many ptes located within the scan cluster.  In this case
+ * maybe we could scan further - to the end of the pte page, perhaps.
+ */
+#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
+#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
+
+static void try_to_unmap_cluster(unsigned long cursor,
+	unsigned int *mapcount, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	pte_t pteval;
+	struct page *page;
+	unsigned long address;
+	unsigned long end;
+	unsigned long pfn;
+
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	spin_lock(&mm->page_table_lock);
+
+	address = (vma->vm_start + cursor) & CLUSTER_MASK;
+	end = address + CLUSTER_SIZE;
+	if (address < vma->vm_start)
+		address = vma->vm_start;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out_unlock;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out_unlock;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out_unlock;
+
+	for (pte = pte_offset_map(pmd, address);
+			address < end; pte++, address += PAGE_SIZE) {
+
+		if (!pte_present(*pte))
+			continue;
+
+		pfn = pte_pfn(*pte);
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		BUG_ON(PageAnon(page));
+		if (PageReserved(page))
+			continue;
+
+		if (ptep_clear_flush_young(vma, address, pte))
+			continue;
+
+		/* Nuke the page table entry. */
+		flush_cache_page(vma, address, pfn);
+		pteval = ptep_clear_flush(vma, address, pte);
+
+		/* If nonlinear, store the file page offset in the pte. */
+		if (page->index != linear_page_index(vma, address))
+			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
+
+		/* Move the dirty bit to the physical page now the pte is gone. */
+		if (pte_dirty(pteval))
+			set_page_dirty(page);
+
+		page_remove_rmap(page);
+		page_cache_release(page);
+		dec_mm_counter(mm, rss);
+		(*mapcount)--;
+	}
+
+	pte_unmap(pte);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+}
+
+static int try_to_unmap_anon(struct page *page)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	int ret = SWAP_AGAIN;
+
+	anon_vma = page_lock_anon_vma(page);
+	if (!anon_vma)
+		return ret;
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		ret = try_to_unmap_one(page, vma);
+		if (ret == SWAP_FAIL || !page_mapped(page))
+			break;
+	}
+	spin_unlock(&anon_vma->lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap_file - unmap file page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap for object-based pages.
+ */
+static int try_to_unmap_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = SWAP_AGAIN;
+	unsigned long cursor;
+	unsigned long max_nl_cursor = 0;
+	unsigned long max_nl_size = 0;
+	unsigned int mapcount;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		ret = try_to_unmap_one(page, vma);
+		if (ret == SWAP_FAIL || !page_mapped(page))
+			goto out;
+	}
+
+	if (list_empty(&mapping->i_mmap_nonlinear))
+		goto out;
+
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+		if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+			continue;
+		cursor = (unsigned long) vma->vm_private_data;
+		if (cursor > max_nl_cursor)
+			max_nl_cursor = cursor;
+		cursor = vma->vm_end - vma->vm_start;
+		if (cursor > max_nl_size)
+			max_nl_size = cursor;
+	}
+
+	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
+		ret = SWAP_FAIL;
+		goto out;
+	}
+
+	/*
+	 * We don't try to search for this page in the nonlinear vmas,
+	 * and page_referenced wouldn't have found it anyway.  Instead
+	 * just walk the nonlinear vmas trying to age and unmap some.
+	 * The mapcount of the page we came in with is irrelevant,
+	 * but even so use it as a guide to how hard we should try?
+	 */
+	mapcount = page_mapcount(page);
+	if (!mapcount)
+		goto out;
+	cond_resched_lock(&mapping->i_mmap_lock);
+
+	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
+	if (max_nl_cursor == 0)
+		max_nl_cursor = CLUSTER_SIZE;
+
+	do {
+		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+				continue;
+			cursor = (unsigned long) vma->vm_private_data;
+			while (get_mm_counter(vma->vm_mm, rss) &&
+				cursor < max_nl_cursor &&
+				cursor < vma->vm_end - vma->vm_start) {
+				try_to_unmap_cluster(cursor, &mapcount, vma);
+				cursor += CLUSTER_SIZE;
+				vma->vm_private_data = (void *) cursor;
+				if ((int)mapcount <= 0)
+					goto out;
+			}
+			vma->vm_private_data = (void *) max_nl_cursor;
+		}
+		cond_resched_lock(&mapping->i_mmap_lock);
+		max_nl_cursor += CLUSTER_SIZE;
+	} while (max_nl_cursor <= max_nl_size);
+
+	/*
+	 * Don't loop forever (perhaps all the remaining pages are
+	 * in locked vmas).  Reset cursor on all unreserved nonlinear
+	 * vmas, now forgetting on which ones it had fallen behind.
+	 */
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+		if (!(vma->vm_flags & VM_RESERVED))
+			vma->vm_private_data = NULL;
+	}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap - try to remove all page table mappings to a page
+ * @page: the page to get unmapped
+ *
+ * Tries to remove all the page table entries which are mapping this
+ * page, used in the pageout path.  Caller must hold the page lock.
+ * Return values are:
+ *
+ * SWAP_SUCCESS	- we succeeded in removing all mappings
+ * SWAP_AGAIN	- we missed a mapping, try again later
+ * SWAP_FAIL	- the page is unswappable
+ */
+int try_to_unmap(struct page *page)
+{
+	int ret;
+
+	BUG_ON(PageReserved(page));
+	BUG_ON(!PageLocked(page));
+
+	if (PageAnon(page))
+		ret = try_to_unmap_anon(page);
+	else
+		ret = try_to_unmap_file(page);
+
+	if (!page_mapped(page))
+		ret = SWAP_SUCCESS;
+	return ret;
+}
--- a/mm/shmem.c
+++ b/mm/shmem.c
--- a/mm/slab.c
+++ b/mm/slab.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -0,0 +1,485 @@
+/*
+ *  linux/mm/swap.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ */
+
+/*
+ * This file contains the default values for the opereation of the
+ * Linux VM subsystem. Fine-tuning documentation can be found in
+ * Documentation/sysctl/vm.txt.
+ * Started 18.12.91
+ * Swap aging added 23.2.95, Stephen Tweedie.
+ * Buffermem limits added 12.3.98, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm_inline.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page() */
+#include <linux/module.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+
+/* How many pages do we try to swap or page in/out together? */
+int page_cluster;
+
+#ifdef CONFIG_HUGETLB_PAGE
+
+void put_page(struct page *page)
+{
+	if (unlikely(PageCompound(page))) {
+		page = (struct page *)page->private;
+		if (put_page_testzero(page)) {
+			void (*dtor)(struct page *page);
+
+			dtor = (void (*)(struct page *))page[1].mapping;
+			(*dtor)(page);
+		}
+		return;
+	}
+	if (!PageReserved(page) && put_page_testzero(page))
+		__page_cache_release(page);
+}
+EXPORT_SYMBOL(put_page);
+#endif
+
+/*
+ * Writeback is about to end against a page which has been marked for immediate
+ * reclaim.  If it still appears to be reclaimable, move it to the tail of the
+ * inactive list.  The page still has PageWriteback set, which will pin it.
+ *
+ * We don't expect many pages to come through here, so don't bother batching
+ * things up.
+ *
+ * To avoid placing the page at the tail of the LRU while PG_writeback is still
+ * set, this function will clear PG_writeback before performing the page
+ * motion.  Do that inside the lru lock because once PG_writeback is cleared
+ * we may not touch the page.
+ *
+ * Returns zero if it cleared PG_writeback.
+ */
+int rotate_reclaimable_page(struct page *page)
+{
+	struct zone *zone;
+	unsigned long flags;
+
+	if (PageLocked(page))
+		return 1;
+	if (PageDirty(page))
+		return 1;
+	if (PageActive(page))
+		return 1;
+	if (!PageLRU(page))
+		return 1;
+
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lru_lock, flags);
+	if (PageLRU(page) && !PageActive(page)) {
+		list_del(&page->lru);
+		list_add_tail(&page->lru, &zone->inactive_list);
+		inc_page_state(pgrotated);
+	}
+	if (!test_clear_page_writeback(page))
+		BUG();
+	spin_unlock_irqrestore(&zone->lru_lock, flags);
+	return 0;
+}
+
+/*
+ * FIXME: speed this up?
+ */
+void fastcall activate_page(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	if (PageLRU(page) && !PageActive(page)) {
+		del_page_from_inactive_list(zone, page);
+		SetPageActive(page);
+		add_page_to_active_list(zone, page);
+		inc_page_state(pgactivate);
+	}
+	spin_unlock_irq(&zone->lru_lock);
+}
+
+/*
+ * Mark a page as having seen activity.
+ *
+ * inactive,unreferenced	->	inactive,referenced
+ * inactive,referenced		->	active,unreferenced
+ * active,unreferenced		->	active,referenced
+ */
+void fastcall mark_page_accessed(struct page *page)
+{
+	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+		activate_page(page);
+		ClearPageReferenced(page);
+	} else if (!PageReferenced(page)) {
+		SetPageReferenced(page);
+	}
+}
+
+EXPORT_SYMBOL(mark_page_accessed);
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+
+void fastcall lru_cache_add(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+	page_cache_get(page);
+	if (!pagevec_add(pvec, page))
+		__pagevec_lru_add(pvec);
+	put_cpu_var(lru_add_pvecs);
+}
+
+void fastcall lru_cache_add_active(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+
+	page_cache_get(page);
+	if (!pagevec_add(pvec, page))
+		__pagevec_lru_add_active(pvec);
+	put_cpu_var(lru_add_active_pvecs);
+}
+
+void lru_add_drain(void)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+
+	if (pagevec_count(pvec))
+		__pagevec_lru_add(pvec);
+	pvec = &__get_cpu_var(lru_add_active_pvecs);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_active(pvec);
+	put_cpu_var(lru_add_pvecs);
+}
+
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
+ */
+void fastcall __page_cache_release(struct page *page)
+{
+	unsigned long flags;
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irqsave(&zone->lru_lock, flags);
+	if (TestClearPageLRU(page))
+		del_page_from_lru(zone, page);
+	if (page_count(page) != 0)
+		page = NULL;
+	spin_unlock_irqrestore(&zone->lru_lock, flags);
+	if (page)
+		free_hot_page(page);
+}
+
+EXPORT_SYMBOL(__page_cache_release);
+
+/*
+ * Batched page_cache_release().  Decrement the reference count on all the
+ * passed pages.  If it fell to zero then remove the page from the LRU and
+ * free it.
+ *
+ * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
+ * for the remainder of the operation.
+ *
+ * The locking in this function is against shrink_cache(): we recheck the
+ * page count inside the lock to see whether shrink_cache grabbed the page
+ * via the LRU.  If it did, give up: shrink_cache will free it.
+ */
+void release_pages(struct page **pages, int nr, int cold)
+{
+	int i;
+	struct pagevec pages_to_free;
+	struct zone *zone = NULL;
+
+	pagevec_init(&pages_to_free, cold);
+	for (i = 0; i < nr; i++) {
+		struct page *page = pages[i];
+		struct zone *pagezone;
+
+		if (PageReserved(page) || !put_page_testzero(page))
+			continue;
+
+		pagezone = page_zone(page);
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestClearPageLRU(page))
+			del_page_from_lru(zone, page);
+		if (page_count(page) == 0) {
+			if (!pagevec_add(&pages_to_free, page)) {
+				spin_unlock_irq(&zone->lru_lock);
+				__pagevec_free(&pages_to_free);
+				pagevec_reinit(&pages_to_free);
+				zone = NULL;	/* No lock is held */
+			}
+		}
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+
+	pagevec_free(&pages_to_free);
+}
+
+/*
+ * The pages which we're about to release may be in the deferred lru-addition
+ * queues.  That would prevent them from really being freed right now.  That's
+ * OK from a correctness point of view but is inefficient - those pages may be
+ * cache-warm and we want to give them back to the page allocator ASAP.
+ *
+ * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
+ * and __pagevec_lru_add_active() call release_pages() directly to avoid
+ * mutual recursion.
+ */
+void __pagevec_release(struct pagevec *pvec)
+{
+	lru_add_drain();
+	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * pagevec_release() for pages which are known to not be on the LRU
+ *
+ * This function reinitialises the caller's pagevec.
+ */
+void __pagevec_release_nonlru(struct pagevec *pvec)
+{
+	int i;
+	struct pagevec pages_to_free;
+
+	pagevec_init(&pages_to_free, pvec->cold);
+	pages_to_free.cold = pvec->cold;
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		BUG_ON(PageLRU(page));
+		if (put_page_testzero(page))
+			pagevec_add(&pages_to_free, page);
+	}
+	pagevec_free(&pages_to_free);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * Add the passed pages to the LRU, then drop the caller's refcount
+ * on them.  Reinitialises the caller's pagevec.
+ */
+void __pagevec_lru_add(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		add_page_to_inactive_list(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+EXPORT_SYMBOL(__pagevec_lru_add);
+
+void __pagevec_lru_add_active(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		if (TestSetPageActive(page))
+			BUG();
+		add_page_to_active_list(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+/*
+ * Try to drop buffers from the pages in a pagevec
+ */
+void pagevec_strip(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		if (PagePrivate(page) && !TestSetPageLocked(page)) {
+			try_to_release_page(page, 0);
+			unlock_page(page);
+		}
+	}
+}
+
+/**
+ * pagevec_lookup - gang pagecache lookup
+ * @pvec:	Where the resulting pages are placed
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ *
+ * pagevec_lookup() will search for and return a group of up to @nr_pages pages
+ * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
+ * reference against the pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes.  There may be holes in the indices due to not-present pages.
+ *
+ * pagevec_lookup() returns the number of pages which were found.
+ */
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
+
+#ifdef CONFIG_SMP
+/*
+ * We tolerate a little inaccuracy to avoid ping-ponging the counter between
+ * CPUs
+ */
+#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)
+
+static DEFINE_PER_CPU(long, committed_space) = 0;
+
+void vm_acct_memory(long pages)
+{
+	long *local;
+
+	preempt_disable();
+	local = &__get_cpu_var(committed_space);
+	*local += pages;
+	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
+		atomic_add(*local, &vm_committed_space);
+		*local = 0;
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(vm_acct_memory);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void lru_drain_cache(unsigned int cpu)
+{
+	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+
+	/* CPU is dead, so no locking needed. */
+	if (pagevec_count(pvec))
+		__pagevec_lru_add(pvec);
+	pvec = &per_cpu(lru_add_active_pvecs, cpu);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_active(pvec);
+}
+
+/* Drop the CPU's cached committed space back into the central pool. */
+static int cpu_swap_callback(struct notifier_block *nfb,
+			     unsigned long action,
+			     void *hcpu)
+{
+	long *committed;
+
+	committed = &per_cpu(committed_space, (long)hcpu);
+	if (action == CPU_DEAD) {
+		atomic_add(*committed, &vm_committed_space);
+		*committed = 0;
+		lru_drain_cache((long)hcpu);
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+void percpu_counter_mod(struct percpu_counter *fbc, long amount)
+{
+	long count;
+	long *pcount;
+	int cpu = get_cpu();
+
+	pcount = per_cpu_ptr(fbc->counters, cpu);
+	count = *pcount + amount;
+	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+		spin_lock(&fbc->lock);
+		fbc->count += count;
+		spin_unlock(&fbc->lock);
+		count = 0;
+	}
+	*pcount = count;
+	put_cpu();
+}
+EXPORT_SYMBOL(percpu_counter_mod);
+#endif
+
+/*
+ * Perform any setup for the swap system
+ */
+void __init swap_setup(void)
+{
+	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+
+	/* Use a smaller cluster for small-memory machines */
+	if (megs < 16)
+		page_cluster = 2;
+	else
+		page_cluster = 3;
+	/*
+	 * Right now other parts of the system means that we
+	 * _really_ don't want to cluster much more
+	 */
+	hotcpu_notifier(cpu_swap_callback, 0);
+}
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -0,0 +1,382 @@
+/*
+ *  linux/mm/swap_state.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *
+ *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_list, to make sync_page look nicer, and to allow
+ * future use of radix_tree tags in the swap cache.
+ */
+static struct address_space_operations swap_aops = {
+	.writepage	= swap_writepage,
+	.sync_page	= block_sync_page,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info swap_backing_dev_info = {
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.unplug_io_fn	= swap_unplug_io_fn,
+};
+
+struct address_space swapper_space = {
+	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+	.tree_lock	= RW_LOCK_UNLOCKED,
+	.a_ops		= &swap_aops,
+	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
+	.backing_dev_info = &swap_backing_dev_info,
+};
+EXPORT_SYMBOL(swapper_space);
+
+#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
+
+static struct {
+	unsigned long add_total;
+	unsigned long del_total;
+	unsigned long find_success;
+	unsigned long find_total;
+	unsigned long noent_race;
+	unsigned long exist_race;
+} swap_cache_info;
+
+void show_swap_cache_info(void)
+{
+	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
+		swap_cache_info.add_total, swap_cache_info.del_total,
+		swap_cache_info.find_success, swap_cache_info.find_total,
+		swap_cache_info.noent_race, swap_cache_info.exist_race);
+	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
+}
+
+/*
+ * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * but sets SwapCache flag and private instead of mapping and index.
+ */
+static int __add_to_swap_cache(struct page *page,
+		swp_entry_t entry, int gfp_mask)
+{
+	int error;
+
+	BUG_ON(PageSwapCache(page));
+	BUG_ON(PagePrivate(page));
+	error = radix_tree_preload(gfp_mask);
+	if (!error) {
+		write_lock_irq(&swapper_space.tree_lock);
+		error = radix_tree_insert(&swapper_space.page_tree,
+						entry.val, page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			SetPageSwapCache(page);
+			page->private = entry.val;
+			total_swapcache_pages++;
+			pagecache_acct(1);
+		}
+		write_unlock_irq(&swapper_space.tree_lock);
+		radix_tree_preload_end();
+	}
+	return error;
+}
+
+static int add_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+	int error;
+
+	if (!swap_duplicate(entry)) {
+		INC_CACHE_INFO(noent_race);
+		return -ENOENT;
+	}
+	error = __add_to_swap_cache(page, entry, GFP_KERNEL);
+	/*
+	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
+	 */
+	if (error) {
+		swap_free(entry);
+		if (error == -EEXIST)
+			INC_CACHE_INFO(exist_race);
+		return error;
+	}
+	INC_CACHE_INFO(add_total);
+	return 0;
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ */
+void __delete_from_swap_cache(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageSwapCache(page));
+	BUG_ON(PageWriteback(page));
+
+	radix_tree_delete(&swapper_space.page_tree, page->private);
+	page->private = 0;
+	ClearPageSwapCache(page);
+	total_swapcache_pages--;
+	pagecache_acct(-1);
+	INC_CACHE_INFO(del_total);
+}
+
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ *
+ * Allocate swap space for the page and add the page to the
+ * swap cache.  Caller needs to hold the page lock. 
+ */
+int add_to_swap(struct page * page)
+{
+	swp_entry_t entry;
+	int pf_flags;
+	int err;
+
+	if (!PageLocked(page))
+		BUG();
+
+	for (;;) {
+		entry = get_swap_page();
+		if (!entry.val)
+			return 0;
+
+		/* Radix-tree node allocations are performing
+		 * GFP_ATOMIC allocations under PF_MEMALLOC.  
+		 * They can completely exhaust the page allocator.  
+		 *
+		 * So PF_MEMALLOC is dropped here.  This causes the slab 
+		 * allocations to fail earlier, so radix-tree nodes will 
+		 * then be allocated from the mempool reserves.
+		 *
+		 * We're still using __GFP_HIGH for radix-tree node
+		 * allocations, so some of the emergency pools are available,
+		 * just not all of them.
+		 */
+
+		pf_flags = current->flags;
+		current->flags &= ~PF_MEMALLOC;
+
+		/*
+		 * Add it to the swap cache and mark it dirty
+		 */
+		err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN);
+
+		if (pf_flags & PF_MEMALLOC)
+			current->flags |= PF_MEMALLOC;
+
+		switch (err) {
+		case 0:				/* Success */
+			SetPageUptodate(page);
+			SetPageDirty(page);
+			INC_CACHE_INFO(add_total);
+			return 1;
+		case -EEXIST:
+			/* Raced with "speculative" read_swap_cache_async */
+			INC_CACHE_INFO(exist_race);
+			swap_free(entry);
+			continue;
+		default:
+			/* -ENOMEM radix-tree allocation failure */
+			swap_free(entry);
+			return 0;
+		}
+	}
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache and locked.
+ * It will never put the page into the free list,
+ * the caller has a reference on the page.
+ */
+void delete_from_swap_cache(struct page *page)
+{
+	swp_entry_t entry;
+
+	BUG_ON(!PageSwapCache(page));
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+	BUG_ON(PagePrivate(page));
+  
+	entry.val = page->private;
+
+	write_lock_irq(&swapper_space.tree_lock);
+	__delete_from_swap_cache(page);
+	write_unlock_irq(&swapper_space.tree_lock);
+
+	swap_free(entry);
+	page_cache_release(page);
+}
+
+/*
+ * Strange swizzling function only for use by shmem_writepage
+ */
+int move_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
+	if (!err) {
+		remove_from_page_cache(page);
+		page_cache_release(page);	/* pagecache ref */
+		if (!swap_duplicate(entry))
+			BUG();
+		SetPageDirty(page);
+		INC_CACHE_INFO(add_total);
+	} else if (err == -EEXIST)
+		INC_CACHE_INFO(exist_race);
+	return err;
+}
+
+/*
+ * Strange swizzling function for shmem_getpage (and shmem_unuse)
+ */
+int move_from_swap_cache(struct page *page, unsigned long index,
+		struct address_space *mapping)
+{
+	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
+	if (!err) {
+		delete_from_swap_cache(page);
+		/* shift page from clean_pages to dirty_pages list */
+		ClearPageDirty(page);
+		set_page_dirty(page);
+	}
+	return err;
+}
+
+/* 
+ * If we are the only user, then try to free up the swap cache. 
+ * 
+ * Its ok to check for PageSwapCache without the page lock
+ * here because we are going to recheck again inside 
+ * exclusive_swap_page() _with_ the lock. 
+ * 					- Marcelo
+ */
+static inline void free_swap_cache(struct page *page)
+{
+	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
+		remove_exclusive_swap_page(page);
+		unlock_page(page);
+	}
+}
+
+/* 
+ * Perform a free_page(), also freeing any swap cache associated with
+ * this page if it is the last user of the page. Can not do a lock_page,
+ * as we are holding the page_table_lock spinlock.
+ */
+void free_page_and_swap_cache(struct page *page)
+{
+	free_swap_cache(page);
+	page_cache_release(page);
+}
+
+/*
+ * Passed an array of pages, drop them all from swapcache and then release
+ * them.  They are removed from the LRU and freed if this is their last use.
+ */
+void free_pages_and_swap_cache(struct page **pages, int nr)
+{
+	int chunk = 16;
+	struct page **pagep = pages;
+
+	lru_add_drain();
+	while (nr) {
+		int todo = min(chunk, nr);
+		int i;
+
+		for (i = 0; i < todo; i++)
+			free_swap_cache(pagep[i]);
+		release_pages(pagep, todo, 0);
+		pagep += todo;
+		nr -= todo;
+	}
+}
+
+/*
+ * Lookup a swap entry in the swap cache. A found page will be returned
+ * unlocked and with its refcount incremented - we rely on the kernel
+ * lock getting page table operations atomic even if we drop the page
+ * lock before returning.
+ */
+struct page * lookup_swap_cache(swp_entry_t entry)
+{
+	struct page *page;
+
+	page = find_get_page(&swapper_space, entry.val);
+
+	if (page)
+		INC_CACHE_INFO(find_success);
+
+	INC_CACHE_INFO(find_total);
+	return page;
+}
+
+/* 
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	struct page *found_page, *new_page = NULL;
+	int err;
+
+	do {
+		/*
+		 * First check the swap cache.  Since this is normally
+		 * called after lookup_swap_cache() failed, re-calling
+		 * that would confuse statistics.
+		 */
+		found_page = find_get_page(&swapper_space, entry.val);
+		if (found_page)
+			break;
+
+		/*
+		 * Get a new page to read into from swap.
+		 */
+		if (!new_page) {
+			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+			if (!new_page)
+				break;		/* Out of memory */
+		}
+
+		/*
+		 * Associate the page with swap entry in the swap cache.
+		 * May fail (-ENOENT) if swap entry has been freed since
+		 * our caller observed it.  May fail (-EEXIST) if there
+		 * is already a page associated with this entry in the
+		 * swap cache: added by a racing read_swap_cache_async,
+		 * or by try_to_swap_out (or shmem_writepage) re-using
+		 * the just freed swap entry for an existing page.
+		 * May fail (-ENOMEM) if radix-tree node allocation failed.
+		 */
+		err = add_to_swap_cache(new_page, entry);
+		if (!err) {
+			/*
+			 * Initiate read into locked page and return.
+			 */
+			lru_cache_add_active(new_page);
+			swap_readpage(NULL, new_page);
+			return new_page;
+		}
+	} while (err != -ENOENT && err != -ENOMEM);
+
+	if (new_page)
+		page_cache_release(new_page);
+	return found_page;
+}
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -0,0 +1,102 @@
+/*
+ * mm/thrash.c
+ *
+ * Copyright (C) 2004, Red Hat, Inc.
+ * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * Simple token based thrashing protection, using the algorithm
+ * described in:  http://www.cs.wm.edu/~sjiang/token.pdf
+ */
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+
+static DEFINE_SPINLOCK(swap_token_lock);
+static unsigned long swap_token_timeout;
+static unsigned long swap_token_check;
+struct mm_struct * swap_token_mm = &init_mm;
+
+#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
+#define SWAP_TOKEN_TIMEOUT	0
+/*
+ * Currently disabled; Needs further code to work at HZ * 300.
+ */
+unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
+
+/*
+ * Take the token away if the process had no page faults
+ * in the last interval, or if it has held the token for
+ * too long.
+ */
+#define SWAP_TOKEN_ENOUGH_RSS 1
+#define SWAP_TOKEN_TIMED_OUT 2
+static int should_release_swap_token(struct mm_struct *mm)
+{
+	int ret = 0;
+	if (!mm->recent_pagein)
+		ret = SWAP_TOKEN_ENOUGH_RSS;
+	else if (time_after(jiffies, swap_token_timeout))
+		ret = SWAP_TOKEN_TIMED_OUT;
+	mm->recent_pagein = 0;
+	return ret;
+}
+
+/*
+ * Try to grab the swapout protection token.  We only try to
+ * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
+ * SMP lock contention and to check that the process that held
+ * the token before is no longer thrashing.
+ */
+void grab_swap_token(void)
+{
+	struct mm_struct *mm;
+	int reason;
+
+	/* We have the token. Let others know we still need it. */
+	if (has_swap_token(current->mm)) {
+		current->mm->recent_pagein = 1;
+		return;
+	}
+
+	if (time_after(jiffies, swap_token_check)) {
+
+		/* Can't get swapout protection if we exceed our RSS limit. */
+		// if (current->mm->rss > current->mm->rlimit_rss)
+		//	return;
+
+		/* ... or if we recently held the token. */
+		if (time_before(jiffies, current->mm->swap_token_time))
+			return;
+
+		if (!spin_trylock(&swap_token_lock))
+			return;
+
+		swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+
+		mm = swap_token_mm;
+		if ((reason = should_release_swap_token(mm))) {
+			unsigned long eligible = jiffies;
+			if (reason == SWAP_TOKEN_TIMED_OUT) {
+				eligible += swap_token_default_timeout;
+			}
+			mm->swap_token_time = eligible;
+			swap_token_timeout = jiffies + swap_token_default_timeout;
+			swap_token_mm = current->mm;
+		}
+		spin_unlock(&swap_token_lock);
+	}
+	return;
+}
+
+/* Called on process exit. */
+void __put_swap_token(struct mm_struct *mm)
+{
+	spin_lock(&swap_token_lock);
+	if (likely(mm == swap_token_mm)) {
+		swap_token_mm = &init_mm;
+		swap_token_check = jiffies;
+	}
+	spin_unlock(&swap_token_lock);
+}
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -0,0 +1,122 @@
+/*
+ * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
+ *
+ * Matt Mackall <mpm@selenic.com> January, 2004
+ * derived from mm/shmem.c and fs/ramfs/inode.c
+ *
+ * This is intended for small system where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/ramfs.h>
+
+static struct file_system_type tmpfs_fs_type = {
+	.name		= "tmpfs",
+	.get_sb		= ramfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+static struct vfsmount *shm_mnt;
+
+static int __init init_tmpfs(void)
+{
+	register_filesystem(&tmpfs_fs_type);
+#ifdef CONFIG_TMPFS
+	devfs_mk_dir("shm");
+#endif
+	shm_mnt = kern_mount(&tmpfs_fs_type);
+	return 0;
+}
+module_init(init_tmpfs)
+
+/*
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ *
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ *
+ */
+struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+{
+	int error;
+	struct file *file;
+	struct inode *inode;
+	struct dentry *dentry, *root;
+	struct qstr this;
+
+	if (IS_ERR(shm_mnt))
+		return (void *)shm_mnt;
+
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0; /* will go */
+	root = shm_mnt->mnt_root;
+	dentry = d_alloc(root, &this);
+	if (!dentry)
+		goto put_memory;
+
+	error = -ENFILE;
+	file = get_empty_filp();
+	if (!file)
+		goto put_dentry;
+
+	error = -ENOSPC;
+	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	if (!inode)
+		goto close_file;
+
+	d_instantiate(dentry, inode);
+	inode->i_size = size;
+	inode->i_nlink = 0;	/* It is unlinked */
+	file->f_vfsmnt = mntget(shm_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_op = &ramfs_file_operations;
+	file->f_mode = FMODE_WRITE | FMODE_READ;
+	return file;
+
+close_file:
+	put_filp(file);
+put_dentry:
+	dput(dentry);
+put_memory:
+	return ERR_PTR(error);
+}
+
+/*
+ * shmem_zero_setup - setup a shared anonymous mapping
+ *
+ * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	struct file *file;
+	loff_t size = vma->vm_end - vma->vm_start;
+
+	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = file;
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+int shmem_unuse(swp_entry_t entry, struct page *page)
+{
+	return 0;
+}
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -0,0 +1,336 @@
+/*
+ * mm/truncate.c - code for taking down pages from address_spaces
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 10Sep2002	akpm@zip.com.au
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/buffer_head.h>	/* grr. try_to_release_page,
+				   block_invalidatepage */
+
+
+static int do_invalidatepage(struct page *page, unsigned long offset)
+{
+	int (*invalidatepage)(struct page *, unsigned long);
+	invalidatepage = page->mapping->a_ops->invalidatepage;
+	if (invalidatepage == NULL)
+		invalidatepage = block_invalidatepage;
+	return (*invalidatepage)(page, offset);
+}
+
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+	if (PagePrivate(page))
+		do_invalidatepage(page, partial);
+}
+
+/*
+ * If truncate cannot remove the fs-private metadata from the page, the page
+ * becomes anonymous.  It will be left on the LRU and may even be mapped into
+ * user pagetables if we're racing with filemap_nopage().
+ *
+ * We need to bale out if page->mapping is no longer equal to the original
+ * mapping.  This happens a) when the VM reclaimed the page while we waited on
+ * its lock, b) when a concurrent invalidate_inode_pages got there first and
+ * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+ */
+static void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping != mapping)
+		return;
+
+	if (PagePrivate(page))
+		do_invalidatepage(page, 0);
+
+	clear_page_dirty(page);
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	remove_from_page_cache(page);
+	page_cache_release(page);	/* pagecache ref */
+}
+
+/*
+ * This is for invalidate_inode_pages().  That function can be called at
+ * any time, and is not supposed to throw away dirty pages.  But pages can
+ * be marked dirty at any time too.  So we re-check the dirtiness inside
+ * ->tree_lock.  That provides exclusion against the __set_page_dirty
+ * functions.
+ *
+ * Returns non-zero if the page was successfully invalidated.
+ */
+static int
+invalidate_complete_page(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping != mapping)
+		return 0;
+
+	if (PagePrivate(page) && !try_to_release_page(page, 0))
+		return 0;
+
+	write_lock_irq(&mapping->tree_lock);
+	if (PageDirty(page)) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+
+	BUG_ON(PagePrivate(page));
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	ClearPageUptodate(page);
+	page_cache_release(page);	/* pagecache ref */
+	return 1;
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Truncate the page cache at a set offset, removing the pages that are beyond
+ * that offset (and zeroing out partial pages).
+ *
+ * Truncate takes two passes - the first pass is nonblocking.  It will not
+ * block on page locks and it will not block on writeback.  The second pass
+ * will wait.  This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * When looking at page->index outside the page lock we need to be careful to
+ * copy it into a local to avoid races (it could change at any time).
+ *
+ * We pass down the cache-hot hint to the page freeing code.  Even if the
+ * mapping is large, it is probably the case that the final pages are the most
+ * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+
+	if (mapping->nrpages == 0)
+		return;
+
+	pagevec_init(&pvec, 0);
+	next = start;
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index = page->index;
+
+			if (page_index > next)
+				next = page_index;
+			next++;
+			if (TestSetPageLocked(page))
+				continue;
+			if (PageWriteback(page)) {
+				unlock_page(page);
+				continue;
+			}
+			truncate_complete_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (partial) {
+		struct page *page = find_lock_page(mapping, start - 1);
+		if (page) {
+			wait_on_page_writeback(page);
+			truncate_partial_page(page, partial);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+
+	next = start;
+	for ( ; ; ) {
+		cond_resched();
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+			if (next == start)
+				break;
+			next = start;
+			continue;
+		}
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+			wait_on_page_writeback(page);
+			if (page->index > next)
+				next = page->index;
+			next++;
+			truncate_complete_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+	}
+}
+
+EXPORT_SYMBOL(truncate_inode_pages);
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t next = start;
+	unsigned long ret = 0;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while (next <= end &&
+			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			if (TestSetPageLocked(page)) {
+				next++;
+				continue;
+			}
+			if (page->index > next)
+				next = page->index;
+			next++;
+			if (PageDirty(page) || PageWriteback(page))
+				goto unlock;
+			if (page_mapped(page))
+				goto unlock;
+			ret += invalidate_complete_page(mapping, page);
+unlock:
+			unlock_page(page);
+			if (next > end)
+				break;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+
+unsigned long invalidate_inode_pages(struct address_space *mapping)
+{
+	return invalidate_mapping_pages(mapping, 0, ~0UL);
+}
+
+EXPORT_SYMBOL(invalidate_inode_pages);
+
+/**
+ * invalidate_inode_pages2_range - remove range of pages from an address_space
+ * @mapping - the address_space
+ * @start: the page offset 'from' which to invalidate
+ * @end: the page offset 'to' which to invalidate (inclusive)
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Returns -EIO if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2_range(struct address_space *mapping,
+				  pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+	int ret = 0;
+	int did_range_unmap = 0;
+	int wrapped = 0;
+
+	pagevec_init(&pvec, 0);
+	next = start;
+	while (next <= end && !ret && !wrapped &&
+		pagevec_lookup(&pvec, mapping, next,
+			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index;
+			int was_dirty;
+
+			lock_page(page);
+			if (page->mapping != mapping) {
+				unlock_page(page);
+				continue;
+			}
+			page_index = page->index;
+			next = page_index + 1;
+			if (next == 0)
+				wrapped = 1;
+			if (page_index > end) {
+				unlock_page(page);
+				break;
+			}
+			wait_on_page_writeback(page);
+			while (page_mapped(page)) {
+				if (!did_range_unmap) {
+					/*
+					 * Zap the rest of the file in one hit.
+					 */
+					unmap_mapping_range(mapping,
+					    page_index << PAGE_CACHE_SHIFT,
+					    (end - page_index + 1)
+							<< PAGE_CACHE_SHIFT,
+					    0);
+					did_range_unmap = 1;
+				} else {
+					/*
+					 * Just zap this page
+					 */
+					unmap_mapping_range(mapping,
+					  page_index << PAGE_CACHE_SHIFT,
+					  PAGE_CACHE_SIZE, 0);
+				}
+			}
+			was_dirty = test_clear_page_dirty(page);
+			if (!invalidate_complete_page(mapping, page)) {
+				if (was_dirty)
+					set_page_dirty(page);
+				ret = -EIO;
+			}
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
+
+/**
+ * invalidate_inode_pages2 - remove all pages from an address_space
+ * @mapping - the address_space
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Returns -EIO if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2(struct address_space *mapping)
+{
+	return invalidate_inode_pages2_range(mapping, 0, -1);
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -0,0 +1,588 @@
+/*
+ *  linux/mm/vmalloc.c
+ *
+ *  Copyright (C) 1993  Linus Torvalds
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
+ *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+
+
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
+						unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		vunmap_pte_range(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
+						unsigned long end)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		vunmap_pmd_range(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+}
+
+void unmap_vm_area(struct vm_struct *area)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long addr = (unsigned long) area->addr;
+	unsigned long end = addr + area->size;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_k(addr);
+	flush_cache_vunmap(addr, end);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		vunmap_pud_range(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+	flush_tlb_kernel_range((unsigned long) area->addr, end);
+}
+
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, pgprot_t prot, struct page ***pages)
+{
+	pte_t *pte;
+
+	pte = pte_alloc_kernel(&init_mm, pmd, addr);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		struct page *page = **pages;
+		WARN_ON(!pte_none(*pte));
+		if (!page)
+			return -ENOMEM;
+		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
+		(*pages)++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	return 0;
+}
+
+static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
+			unsigned long end, pgprot_t prot, struct page ***pages)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc(&init_mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (vmap_pte_range(pmd, addr, next, prot, pages))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+			unsigned long end, pgprot_t prot, struct page ***pages)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc(&init_mm, pgd, addr);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+		if (vmap_pmd_range(pud, addr, next, prot, pages))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long addr = (unsigned long) area->addr;
+	unsigned long end = addr + area->size - PAGE_SIZE;
+	int err;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_k(addr);
+	spin_lock(&init_mm.page_table_lock);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_pud_range(pgd, addr, next, prot, pages);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	spin_unlock(&init_mm.page_table_lock);
+	flush_cache_vmap((unsigned long) area->addr, end);
+	return err;
+}
+
+#define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
+
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+				unsigned long start, unsigned long end)
+{
+	struct vm_struct **p, *tmp, *area;
+	unsigned long align = 1;
+	unsigned long addr;
+
+	if (flags & VM_IOREMAP) {
+		int bit = fls(size);
+
+		if (bit > IOREMAP_MAX_ORDER)
+			bit = IOREMAP_MAX_ORDER;
+		else if (bit < PAGE_SHIFT)
+			bit = PAGE_SHIFT;
+
+		align = 1ul << bit;
+	}
+	addr = ALIGN(start, align);
+	size = PAGE_ALIGN(size);
+
+	area = kmalloc(sizeof(*area), GFP_KERNEL);
+	if (unlikely(!area))
+		return NULL;
+
+	if (unlikely(!size)) {
+		kfree (area);
+		return NULL;
+	}
+
+	/*
+	 * We always allocate a guard page.
+	 */
+	size += PAGE_SIZE;
+
+	write_lock(&vmlist_lock);
+	for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
+		if ((unsigned long)tmp->addr < addr) {
+			if((unsigned long)tmp->addr + tmp->size >= addr)
+				addr = ALIGN(tmp->size + 
+					     (unsigned long)tmp->addr, align);
+			continue;
+		}
+		if ((size + addr) < addr)
+			goto out;
+		if (size + addr <= (unsigned long)tmp->addr)
+			goto found;
+		addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
+		if (addr > end - size)
+			goto out;
+	}
+
+found:
+	area->next = *p;
+	*p = area;
+
+	area->flags = flags;
+	area->addr = (void *)addr;
+	area->size = size;
+	area->pages = NULL;
+	area->nr_pages = 0;
+	area->phys_addr = 0;
+	write_unlock(&vmlist_lock);
+
+	return area;
+
+out:
+	write_unlock(&vmlist_lock);
+	kfree(area);
+	if (printk_ratelimit())
+		printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
+	return NULL;
+}
+
+/**
+ *	get_vm_area  -  reserve a contingous kernel virtual area
+ *
+ *	@size:		size of the area
+ *	@flags:		%VM_IOREMAP for I/O mappings or VM_ALLOC
+ *
+ *	Search an area of @size in the kernel virtual mapping area,
+ *	and reserved it for out purposes.  Returns the area descriptor
+ *	on success or %NULL on failure.
+ */
+struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+{
+	return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
+}
+
+/**
+ *	remove_vm_area  -  find and remove a contingous kernel virtual area
+ *
+ *	@addr:		base address
+ *
+ *	Search for the kernel VM area starting at @addr, and remove it.
+ *	This function returns the found VM area, but using it is NOT safe
+ *	on SMP machines.
+ */
+struct vm_struct *remove_vm_area(void *addr)
+{
+	struct vm_struct **p, *tmp;
+
+	write_lock(&vmlist_lock);
+	for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
+		 if (tmp->addr == addr)
+			 goto found;
+	}
+	write_unlock(&vmlist_lock);
+	return NULL;
+
+found:
+	unmap_vm_area(tmp);
+	*p = tmp->next;
+	write_unlock(&vmlist_lock);
+
+	/*
+	 * Remove the guard page.
+	 */
+	tmp->size -= PAGE_SIZE;
+	return tmp;
+}
+
+void __vunmap(void *addr, int deallocate_pages)
+{
+	struct vm_struct *area;
+
+	if (!addr)
+		return;
+
+	if ((PAGE_SIZE-1) & (unsigned long)addr) {
+		printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+		WARN_ON(1);
+		return;
+	}
+
+	area = remove_vm_area(addr);
+	if (unlikely(!area)) {
+		printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+				addr);
+		WARN_ON(1);
+		return;
+	}
+
+	if (deallocate_pages) {
+		int i;
+
+		for (i = 0; i < area->nr_pages; i++) {
+			if (unlikely(!area->pages[i]))
+				BUG();
+			__free_page(area->pages[i]);
+		}
+
+		if (area->nr_pages > PAGE_SIZE/sizeof(struct page *))
+			vfree(area->pages);
+		else
+			kfree(area->pages);
+	}
+
+	kfree(area);
+	return;
+}
+
+/**
+ *	vfree  -  release memory allocated by vmalloc()
+ *
+ *	@addr:		memory base address
+ *
+ *	Free the virtually contiguous memory area starting at @addr, as
+ *	obtained from vmalloc(), vmalloc_32() or __vmalloc().
+ *
+ *	May not be called in interrupt context.
+ */
+void vfree(void *addr)
+{
+	BUG_ON(in_interrupt());
+	__vunmap(addr, 1);
+}
+
+EXPORT_SYMBOL(vfree);
+
+/**
+ *	vunmap  -  release virtual mapping obtained by vmap()
+ *
+ *	@addr:		memory base address
+ *
+ *	Free the virtually contiguous memory area starting at @addr,
+ *	which was created from the page array passed to vmap().
+ *
+ *	May not be called in interrupt context.
+ */
+void vunmap(void *addr)
+{
+	BUG_ON(in_interrupt());
+	__vunmap(addr, 0);
+}
+
+EXPORT_SYMBOL(vunmap);
+
+/**
+ *	vmap  -  map an array of pages into virtually contiguous space
+ *
+ *	@pages:		array of page pointers
+ *	@count:		number of pages to map
+ *	@flags:		vm_area->flags
+ *	@prot:		page protection for the mapping
+ *
+ *	Maps @count pages from @pages into contiguous kernel virtual
+ *	space.
+ */
+void *vmap(struct page **pages, unsigned int count,
+		unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+
+	if (count > num_physpages)
+		return NULL;
+
+	area = get_vm_area((count << PAGE_SHIFT), flags);
+	if (!area)
+		return NULL;
+	if (map_vm_area(area, prot, &pages)) {
+		vunmap(area->addr);
+		return NULL;
+	}
+
+	return area->addr;
+}
+
+EXPORT_SYMBOL(vmap);
+
+void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot)
+{
+	struct page **pages;
+	unsigned int nr_pages, array_size, i;
+
+	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
+	array_size = (nr_pages * sizeof(struct page *));
+
+	area->nr_pages = nr_pages;
+	/* Please note that the recursion is strictly bounded. */
+	if (array_size > PAGE_SIZE)
+		pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL);
+	else
+		pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+	area->pages = pages;
+	if (!area->pages) {
+		remove_vm_area(area->addr);
+		kfree(area);
+		return NULL;
+	}
+	memset(area->pages, 0, array_size);
+
+	for (i = 0; i < area->nr_pages; i++) {
+		area->pages[i] = alloc_page(gfp_mask);
+		if (unlikely(!area->pages[i])) {
+			/* Successfully allocated i pages, free them in __vunmap() */
+			area->nr_pages = i;
+			goto fail;
+		}
+	}
+
+	if (map_vm_area(area, prot, &pages))
+		goto fail;
+	return area->addr;
+
+fail:
+	vfree(area->addr);
+	return NULL;
+}
+
+/**
+ *	__vmalloc  -  allocate virtually contiguous memory
+ *
+ *	@size:		allocation size
+ *	@gfp_mask:	flags for the page level allocator
+ *	@prot:		protection mask for the allocated pages
+ *
+ *	Allocate enough pages to cover @size from the page level
+ *	allocator with @gfp_mask flags.  Map them into contiguous
+ *	kernel virtual space, using a pagetable protection of @prot.
+ */
+void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot)
+{
+	struct vm_struct *area;
+
+	size = PAGE_ALIGN(size);
+	if (!size || (size >> PAGE_SHIFT) > num_physpages)
+		return NULL;
+
+	area = get_vm_area(size, VM_ALLOC);
+	if (!area)
+		return NULL;
+
+	return __vmalloc_area(area, gfp_mask, prot);
+}
+
+EXPORT_SYMBOL(__vmalloc);
+
+/**
+ *	vmalloc  -  allocate virtually contiguous memory
+ *
+ *	@size:		allocation size
+ *
+ *	Allocate enough pages to cover @size from the page level
+ *	allocator and map them into contiguous kernel virtual space.
+ *
+ *	For tight cotrol over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+void *vmalloc(unsigned long size)
+{
+       return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+
+EXPORT_SYMBOL(vmalloc);
+
+/**
+ *	vmalloc_exec  -  allocate virtually contiguous, executable memory
+ *
+ *	@size:		allocation size
+ *
+ *	Kernel-internal function to allocate enough pages to cover @size
+ *	the page level allocator and map them into contiguous and
+ *	executable kernel virtual space.
+ *
+ *	For tight cotrol over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+void *vmalloc_exec(unsigned long size)
+{
+	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+}
+
+/**
+ *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
+ *
+ *	@size:		allocation size
+ *
+ *	Allocate enough 32bit PA addressable pages to cover @size from the
+ *	page level allocator and map them into contiguous kernel virtual space.
+ */
+void *vmalloc_32(unsigned long size)
+{
+	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+}
+
+EXPORT_SYMBOL(vmalloc_32);
+
+long vread(char *buf, char *addr, unsigned long count)
+{
+	struct vm_struct *tmp;
+	char *vaddr, *buf_start = buf;
+	unsigned long n;
+
+	/* Don't allow overflow */
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
+
+	read_lock(&vmlist_lock);
+	for (tmp = vmlist; tmp; tmp = tmp->next) {
+		vaddr = (char *) tmp->addr;
+		if (addr >= vaddr + tmp->size - PAGE_SIZE)
+			continue;
+		while (addr < vaddr) {
+			if (count == 0)
+				goto finished;
+			*buf = '\0';
+			buf++;
+			addr++;
+			count--;
+		}
+		n = vaddr + tmp->size - PAGE_SIZE - addr;
+		do {
+			if (count == 0)
+				goto finished;
+			*buf = *addr;
+			buf++;
+			addr++;
+			count--;
+		} while (--n > 0);
+	}
+finished:
+	read_unlock(&vmlist_lock);
+	return buf - buf_start;
+}
+
+long vwrite(char *buf, char *addr, unsigned long count)
+{
+	struct vm_struct *tmp;
+	char *vaddr, *buf_start = buf;
+	unsigned long n;
+
+	/* Don't allow overflow */
+	if ((unsigned long) addr + count < count)
+		count = -(unsigned long) addr;
+
+	read_lock(&vmlist_lock);
+	for (tmp = vmlist; tmp; tmp = tmp->next) {
+		vaddr = (char *) tmp->addr;
+		if (addr >= vaddr + tmp->size - PAGE_SIZE)
+			continue;
+		while (addr < vaddr) {
+			if (count == 0)
+				goto finished;
+			buf++;
+			addr++;
+			count--;
+		}
+		n = vaddr + tmp->size - PAGE_SIZE - addr;
+		do {
+			if (count == 0)
+				goto finished;
+			*addr = *buf;
+			buf++;
+			addr++;
+			count--;
+		} while (--n > 0);
+	}
+finished:
+	read_unlock(&vmlist_lock);
+	return buf - buf_start;
+}
--- a/mm/vmscan.c
+++ b/mm/vmscan.c