Merge commit 'v2.6.28-rc2' into core/locking

Conflicts:
	arch/um/include/asm/system.h
This commit is contained in:
Ingo Molnar
2008-10-28 16:54:49 +01:00
8444 changed files with 674387 additions and 211078 deletions

View File

@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT
# with gcc 3.4 and later.
#
config SPARSEMEM_STATIC
def_bool n
bool
#
# Architecture platforms which require a two level mem_section in SPARSEMEM
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME
depends on SPARSEMEM && !SPARSEMEM_STATIC
config SPARSEMEM_VMEMMAP_ENABLE
def_bool n
bool
config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
@@ -187,6 +187,9 @@ config RESOURCES_64BIT
help
This option allows memory and IO resources to be 64 bit.
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
config ZONE_DMA_FLAG
int
default "0" if !ZONE_DMA
@@ -206,5 +209,16 @@ config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
config UNEVICTABLE_LRU
bool "Add LRU list to track non-evictable pages"
default y
depends on MMU
help
Keeps unevictable pages off of the active and inactive pageout
lists, so kswapd will not waste CPU time or have its balancing
algorithms thrown off by scanning these pages. Selecting this
will use one page flag and increase the code size a little,
say Y unless you know what you are doing.
config MMU_NOTIFIER
bool

View File

@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o

View File

@@ -48,7 +48,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
if (unlikely(bootmem_debug)) \
printk(KERN_INFO \
"bootmem::%s " fmt, \
__FUNCTION__, ## args); \
__func__, ## args); \
})
static unsigned long __init bootmap_bytes(unsigned long pages)

View File

@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
/*
* Data-less bio, nothing to bounce
*/
if (bio_empty_barrier(*bio_orig))
if (!bio_has_data(*bio_orig))
return;
/*

View File

@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds
*
* 11Jan2003 akpm@digeo.com
* 11Jan2003 Andrew Morton
* Initial version.
*/

View File

@@ -33,6 +33,7 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include "internal.h"
/*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
mem_cgroup_uncharge_cache_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
mem_cgroup_uncharge_cache_page(page);
/*
* Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0)
lru_cache_add(page);
int ret;
/*
* Splice_read and readahead add shmem/tmpfs pages into the page cache
* before shmem_readpage has a chance to mark them as SwapBacked: they
* need to go on the active_anon lru below, and mem_cgroup_cache_charge
* (called in add_to_page_cache) needs to know where they're going too.
*/
if (mapping_cap_swap_backed(mapping))
SetPageSwapBacked(page);
ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0) {
if (page_is_file_cache(page))
lru_cache_add_file(page);
else
lru_cache_add_active_anon(page);
}
return ret;
}
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
* mechananism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
* The first mb is necessary to safely close the critical section opened by the
* test_and_set_bit() to lock the page; the second mb is necessary to enforce
* ordering between the clear_bit and the read of the waitqueue (to avoid SMP
* races with a parallel wait_on_page_locked()).
* The mb is necessary to enforce ordering between the clear_bit and the read
* of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
*/
void unlock_page(struct page *page)
{
smp_mb__before_clear_bit();
if (!test_and_clear_bit(PG_locked, &page->flags))
BUG();
smp_mb__after_clear_bit();
VM_BUG_ON(!PageLocked(page));
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -1100,8 +1113,9 @@ page_ok:
page_not_up_to_date:
/* Get exclusive access to the page ... */
if (lock_page_killable(page))
goto readpage_eio;
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
@@ -1130,8 +1144,9 @@ readpage:
}
if (!PageUptodate(page)) {
if (lock_page_killable(page))
goto readpage_eio;
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
@@ -1143,15 +1158,14 @@ readpage:
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
goto readpage_eio;
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}
goto page_ok;
readpage_eio:
error = -EIO;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
@@ -1186,8 +1200,7 @@ out:
ra->prev_pos |= prev_offset;
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
if (filp)
file_accessed(filp);
file_accessed(filp);
}
int file_read_actor(read_descriptor_t *desc, struct page *page,

View File

@@ -21,6 +21,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include "internal.h"
static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
spin_unlock(&mapping->i_mmap_lock);
}
if (vma->vm_flags & VM_LOCKED) {
/*
* drop PG_Mlocked flag for over-mapped range
*/
unsigned int saved_flags = vma->vm_flags;
munlock_vma_pages_range(vma, start, start + size);
vma->vm_flags = saved_flags;
}
mmu_notifier_invalidate_range_start(mm, start, start + size);
err = populate_range(mm, vma, start, size, pgoff);
mmu_notifier_invalidate_range_end(mm, start, start + size);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
has_write_lock = 0;
if (vma->vm_flags & VM_LOCKED) {
/*
* might be mapping previously unmapped range of file
*/
mlock_vma_pages_range(vma, start, start + size);
} else {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
has_write_lock = 0;
}
make_pages_present(start, start+size);
}
make_pages_present(start, start+size);
}
/*
@@ -240,4 +258,3 @@ out:
return err;
}

View File

@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
static void flush_all_zero_pkmaps(void)
{
int i;
int need_flush = 0;
flush_cache_kmaps();
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
&pkmap_page_table[i]);
set_page_address(page, NULL);
need_flush = 1;
}
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
if (need_flush)
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
/**

View File

@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
@@ -262,7 +263,7 @@ struct resv_map {
struct list_head regions;
};
struct resv_map *resv_map_alloc(void)
static struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
if (!resv_map)
@@ -274,7 +275,7 @@ struct resv_map *resv_map_alloc(void)
return resv_map;
}
void resv_map_release(struct kref *ref)
static void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
@@ -289,7 +290,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
return 0;
return NULL;
}
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -1455,15 +1456,15 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
#endif /* CONFIG_SYSCTL */
int hugetlb_report_meminfo(char *buf)
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h = &default_hstate;
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"HugePages_Surp: %5lu\n"
"Hugepagesize: %5lu kB\n",
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"HugePages_Surp: %5lu\n"
"Hugepagesize: %8lu kB\n",
h->nr_huge_pages,
h->free_huge_pages,
h->resv_huge_pages,
@@ -1747,10 +1748,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
int unmap_ref_private(struct mm_struct *mm,
struct vm_area_struct *vma,
struct page *page,
unsigned long address)
static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page, unsigned long address)
{
struct vm_area_struct *iter_vma;
struct address_space *mapping;
@@ -2008,7 +2007,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
goto out_unlock;
goto out_mutex;
}
ret = 0;
@@ -2024,7 +2023,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (write_access && !pte_write(entry)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto out_unlock;
goto out_mutex;
}
if (!(vma->vm_flags & VM_SHARED))
@@ -2034,10 +2033,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (likely(pte_same(entry, huge_ptep_get(ptep))))
if (write_access && !pte_write(entry))
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_page_table_lock;
if (write_access) {
if (!pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
pagecache_page);
goto out_page_table_lock;
}
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
update_mmu_cache(vma, address, entry);
out_page_table_lock:
spin_unlock(&mm->page_table_lock);
if (pagecache_page) {
@@ -2045,7 +2057,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(pagecache_page);
}
out_unlock:
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
@@ -2060,6 +2072,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
return NULL;
}
static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
{
if (!ptep || write || shared)
return 0;
else
return huge_pte_none(huge_ptep_get(ptep));
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i,
@@ -2069,6 +2089,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
int remainder = *length;
struct hstate *h = hstate_vma(vma);
int zeropage_ok = 0;
int shared = vma->vm_flags & VM_SHARED;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
@@ -2081,8 +2103,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* first, for the page indexing below to work.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
if (huge_zeropage_ok(pte, write, shared))
zeropage_ok = 1;
if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
if (!pte ||
(huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
(write && !pte_write(huge_ptep_get(pte)))) {
int ret;
@@ -2102,8 +2127,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
get_page(page);
pages[i] = page + pfn_offset;
if (zeropage_ok)
pages[i] = ZERO_PAGE(0);
else
pages[i] = page + pfn_offset;
get_page(pages[i]);
}
if (vmas)

View File

@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
/*
* in mm/vmscan.c:
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
extern void __free_pages_bootmem(struct page *page, unsigned int order);
/*
@@ -52,6 +61,120 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
}
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* unevictable_migrate_page() called only from migrate_page_copy() to
* migrate unevictable flag to new page.
* Note that the old page has been isolated from the LRU lists at this
* point so we don't need to worry about LRU statistics.
*/
static inline void unevictable_migrate_page(struct page *new, struct page *old)
{
if (TestClearPageUnevictable(old))
SetPageUnevictable(new);
}
#else
static inline void unevictable_migrate_page(struct page *new, struct page *old)
{
}
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* Called only in fault path via page_evictable() for a new page
* to determine if it's being mapped into a LOCKED vma.
* If so, mark page as mlocked.
*/
static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
{
VM_BUG_ON(PageLRU(page));
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
return 0;
if (!TestSetPageMlocked(page)) {
inc_zone_page_state(page, NR_MLOCK);
count_vm_event(UNEVICTABLE_PGMLOCKED);
}
return 1;
}
/*
* must be called with vma's mmap_sem held for read, and page locked.
*/
extern void mlock_vma_page(struct page *page);
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
* we want to unconditionally remove a page from the pagecache -- e.g.,
* on truncation or freeing.
*
* It is legal to call this function for any page, mlocked or not.
* If called for a page that is still mapped by mlocked vmas, all we do
* is revert to lazy LRU behaviour -- semantics are not broken.
*/
extern void __clear_page_mlock(struct page *page);
static inline void clear_page_mlock(struct page *page)
{
if (unlikely(TestClearPageMlocked(page)))
__clear_page_mlock(page);
}
/*
* mlock_migrate_page - called only from migrate_page_copy() to
* migrate the Mlocked page flag; update statistics.
*/
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
local_irq_save(flags);
__dec_zone_page_state(page, NR_MLOCK);
SetPageMlocked(newpage);
__inc_zone_page_state(newpage, NR_MLOCK);
local_irq_restore(flags);
}
}
/*
* free_page_mlock() -- clean up attempts to free and mlocked() page.
* Page should not be on lru, so no need to fix that up.
* free_pages_check() will verify...
*/
static inline void free_page_mlock(struct page *page)
{
if (unlikely(TestClearPageMlocked(page))) {
unsigned long flags;
local_irq_save(flags);
__dec_zone_page_state(page, NR_MLOCK);
__count_vm_event(UNEVICTABLE_MLOCKFREED);
local_irq_restore(flags);
}
}
#else /* CONFIG_UNEVICTABLE_LRU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
return 0;
}
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
static inline void free_page_mlock(struct page *page) { }
#endif /* CONFIG_UNEVICTABLE_LRU */
/*
* FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
* so all functions starting at paging_init should be marked __init
@@ -120,4 +243,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas);
#endif

View File

@@ -32,11 +32,12 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
static struct kmem_cache *page_cgroup_cache __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
/*
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
/*
* For accounting under irq disable, no need for increment preempt count.
*/
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
enum mem_cgroup_stat_index idx, int val)
{
int cpu = smp_processor_id();
stat->cpustat[cpu].count[idx] += val;
stat->count[idx] += val;
}
static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
/*
* per-zone information in memory controller.
*/
enum mem_cgroup_zstat_index {
MEM_CGROUP_ZSTAT_ACTIVE,
MEM_CGROUP_ZSTAT_INACTIVE,
NR_MEM_CGROUP_ZSTAT,
};
struct mem_cgroup_per_zone {
/*
* spin_lock to protect the per cgroup LRU
*/
spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
unsigned long count[NR_MEM_CGROUP_ZSTAT];
struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
};
static struct mem_cgroup init_mem_cgroup;
/*
* We use the lower bit of the page->page_cgroup pointer as a bit spin
* lock. We need to ensure that page->page_cgroup is at least two
* byte aligned (based on comments from Nick Piggin). But since
* bit_spin_lock doesn't actually set that lock bit in a non-debug
* uniprocessor kernel, we should avoid setting it here too.
*/
#define PAGE_CGROUP_LOCK_BIT 0x0
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
#else
#define PAGE_CGROUP_LOCK 0x0
#endif
/*
* A page_cgroup page is associated with every page descriptor. The
* page_cgroup helps us identify information about the cgroup
*/
struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct page *page;
struct mem_cgroup *mem_cgroup;
int flags;
};
#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
static int page_cgroup_nid(struct page_cgroup *pc)
{
return page_to_nid(pc->page);
}
static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
return page_zonenum(pc->page);
}
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
NR_CHARGE_TYPE,
};
/* only for here (for easy reading.) */
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
#define PCGF_LOCK (1UL << PCG_LOCK)
#define PCGF_FILE (1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
0, /* FORCE */
};
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
*/
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
bool charge)
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
{
int val = (charge)? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat;
struct mem_cgroup_stat_cpu *cpustat;
VM_BUG_ON(!irqs_disabled());
if (flags & PAGE_CGROUP_FLAG_CACHE)
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
cpustat = &stat->cpustat[smp_processor_id()];
if (PageCgroupCache(pc))
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
else
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
if (charge)
__mem_cgroup_stat_add_safe(stat,
__mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
else
__mem_cgroup_stat_add_safe(stat,
__mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
}
static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
enum mem_cgroup_zstat_index idx)
enum lru_list idx)
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
return NULL;
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
static inline int page_cgroup_locked(struct page *page)
{
return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
VM_BUG_ON(!page_cgroup_locked(page));
page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
}
struct page_cgroup *page_get_page_cgroup(struct page *page)
{
return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}
static void lock_page_cgroup(struct page *page)
{
bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
static int try_lock_page_cgroup(struct page *page)
{
return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
static void unlock_page_cgroup(struct page *page)
{
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
struct page_cgroup *pc)
{
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
int lru = LRU_BASE;
if (from)
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
else
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
if (PageCgroupUnevictable(pc))
lru = LRU_UNEVICTABLE;
else {
if (PageCgroupActive(pc))
lru += LRU_ACTIVE;
if (PageCgroupFile(pc))
lru += LRU_FILE;
}
mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
list_del(&pc->lru);
}
static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
struct page_cgroup *pc)
{
int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
int lru = LRU_BASE;
if (!to) {
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
list_add(&pc->lru, &mz->inactive_list);
} else {
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
list_add(&pc->lru, &mz->active_list);
if (PageCgroupUnevictable(pc))
lru = LRU_UNEVICTABLE;
else {
if (PageCgroupActive(pc))
lru += LRU_ACTIVE;
if (PageCgroupFile(pc))
lru += LRU_FILE;
}
mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
MEM_CGROUP_ZSTAT(mz, lru) += 1;
list_add(&pc->lru, &mz->lists[lru]);
mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
}
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
{
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
int active = PageCgroupActive(pc);
int file = PageCgroupFile(pc);
int unevictable = PageCgroupUnevictable(pc);
enum lru_list from = unevictable ? LRU_UNEVICTABLE :
(LRU_FILE * !!file + !!active);
if (from)
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
else
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
if (lru == from)
return;
if (active) {
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
list_move(&pc->lru, &mz->active_list);
MEM_CGROUP_ZSTAT(mz, from) -= 1;
/*
* However this is done under mz->lru_lock, another flags, which
* are not related to LRU, will be modified from out-of-lock.
* We have to use atomic set/clear flags.
*/
if (is_unevictable_lru(lru)) {
ClearPageCgroupActive(pc);
SetPageCgroupUnevictable(pc);
} else {
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
list_move(&pc->lru, &mz->inactive_list);
if (is_active_lru(lru))
SetPageCgroupActive(pc);
else
ClearPageCgroupActive(pc);
ClearPageCgroupUnevictable(pc);
}
MEM_CGROUP_ZSTAT(mz, lru) += 1;
list_move(&pc->lru, &mz->lists[lru]);
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -348,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
/*
* This routine assumes that the appropriate zone's lru lock is already held
*/
void mem_cgroup_move_lists(struct page *page, bool active)
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
@@ -364,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
* safely get to page_cgroup without it, so just try_lock it:
* mem_cgroup_isolate_pages allows for page left on wrong list.
*/
if (!try_lock_page_cgroup(page))
pc = lookup_page_cgroup(page);
if (!trylock_page_cgroup(pc))
return;
pc = page_get_page_cgroup(page);
if (pc) {
if (pc && PageCgroupUsed(pc)) {
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_move_lists(pc, active);
__mem_cgroup_move_lists(pc, lru);
spin_unlock_irqrestore(&mz->lru_lock, flags);
}
unlock_page_cgroup(page);
unlock_page_cgroup(pc);
}
/*
@@ -394,21 +367,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
return (int)((rss * 100L) / total);
}
/*
* This function is called from vmscan.c. In page reclaiming loop. balance
* between active and inactive list is calculated. For memory controller
* page reclaiming, we should use using mem_cgroup's imbalance rather than
* zone's global lru imbalance.
*/
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
unsigned long active, inactive;
/* active and inactive are the number of pages. 'long' is ok.*/
active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
return (long) (active / (inactive + 1));
}
/*
* prev_priority control...this will be used in memory reclaim path.
*/
@@ -436,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
* (see include/linux/mmzone.h)
*/
long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
struct zone *zone, int priority)
long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
int priority, enum lru_list lru)
{
long nr_active;
long nr_pages;
int nid = zone->zone_pgdat->node_id;
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
return (nr_active >> priority);
}
nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
struct zone *zone, int priority)
{
long nr_inactive;
int nid = zone->zone_pgdat->node_id;
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
return (nr_inactive >> priority);
return (nr_pages >> priority);
}
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -465,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
unsigned long *scanned, int order,
int mode, struct zone *z,
struct mem_cgroup *mem_cont,
int active)
int active, int file)
{
unsigned long nr_taken = 0;
struct page *page;
@@ -476,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
int nid = z->zone_pgdat->node_id;
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * !!file + !!active;
BUG_ON(!mem_cont);
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
if (active)
src = &mz->active_list;
else
src = &mz->inactive_list;
src = &mz->lists[lru];
spin_lock(&mz->lru_lock);
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
if (scan >= nr_to_scan)
break;
if (unlikely(!PageCgroupUsed(pc)))
continue;
page = pc->page;
if (unlikely(!PageLRU(page)))
continue;
if (PageActive(page) && !active) {
__mem_cgroup_move_lists(pc, true);
continue;
}
if (!PageActive(page) && active) {
__mem_cgroup_move_lists(pc, false);
/*
* TODO: play better with lumpy reclaim, grabbing anything.
*/
if (PageUnevictable(page) ||
(PageActive(page) && !active) ||
(!PageActive(page) && active)) {
__mem_cgroup_move_lists(pc, page_lru(page));
continue;
}
scan++;
list_move(&pc->lru, &pc_list);
if (__isolate_lru_page(page, mode) == 0) {
if (__isolate_lru_page(page, mode, file) == 0) {
list_move(&page->lru, dst);
nr_taken++;
}
@@ -532,23 +479,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
unsigned long flags;
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup_per_zone *mz;
unsigned long flags;
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
if (unlikely(pc == NULL))
goto err;
pc = lookup_page_cgroup(page);
/* can happen at boot */
if (unlikely(!pc))
return 0;
prefetchw(pc);
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
if (likely(!memcg)) {
rcu_read_lock();
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!mem)) {
rcu_read_unlock();
return 0;
}
/*
* For every charge from the cgroup, increment reference count
*/
@@ -559,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
css_get(&memcg->css);
}
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
if (!(gfp_mask & __GFP_WAIT))
goto out;
@@ -582,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
}
}
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
goto done;
}
pc->mem_cgroup = mem;
pc->page = page;
/*
* If a page is accounted as a page cache, insert to inactive list.
* If anon, insert to active list.
*/
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
pc->flags = PAGE_CGROUP_FLAG_CACHE;
else
pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
lock_page_cgroup(page);
if (unlikely(page_get_page_cgroup(page))) {
unlock_page_cgroup(page);
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
kmem_cache_free(page_cgroup_cache, pc);
goto done;
}
page_assign_page_cgroup(page, pc);
pc->flags = pcg_default_flags[ctype];
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_add_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(pc);
unlock_page_cgroup(page);
done:
return 0;
out:
css_put(&mem->css);
kmem_cache_free(page_cgroup_cache, pc);
err:
return -ENOMEM;
}
@@ -622,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
if (mem_cgroup_subsys.disabled)
return 0;
if (PageCompound(page))
return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
@@ -643,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
{
if (mem_cgroup_subsys.disabled)
return 0;
if (PageCompound(page))
return 0;
/*
* Corner case handling. This is called from add_to_page_cache()
* in usual. But some FS (shmem) precharges this page before calling it
@@ -656,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
if (pc) {
VM_BUG_ON(pc->page != page);
VM_BUG_ON(!pc->mem_cgroup);
unlock_page_cgroup(page);
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
unlock_page_cgroup(pc);
return 0;
}
unlock_page_cgroup(page);
unlock_page_cgroup(pc);
}
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
else
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}
/*
@@ -691,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
/*
* Check if our page_cgroup is valid
*/
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
if (unlikely(!pc))
goto unlock;
pc = lookup_page_cgroup(page);
if (unlikely(!pc || !PageCgroupUsed(pc)))
return;
VM_BUG_ON(pc->page != page);
if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
&& ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
|| page_mapped(page)))
goto unlock;
lock_page_cgroup(pc);
if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
|| !PageCgroupUsed(pc)) {
/* This happens at race in zap_pte_range() and do_swap_page()*/
unlock_page_cgroup(pc);
return;
}
ClearPageCgroupUsed(pc);
mem = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_remove_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(pc);
page_assign_page_cgroup(page, NULL);
unlock_page_cgroup(page);
mem = pc->mem_cgroup;
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
kmem_cache_free(page_cgroup_cache, pc);
return;
unlock:
unlock_page_cgroup(page);
}
void mem_cgroup_uncharge_page(struct page *page)
{
/* early check. */
if (page_mapped(page))
return;
if (page->mapping && !PageAnon(page))
return;
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
void mem_cgroup_uncharge_cache_page(struct page *page)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
@@ -745,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
if (mem_cgroup_subsys.disabled)
return 0;
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
if (pc) {
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
css_get(&mem->css);
if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
if (PageCgroupCache(pc)) {
if (page_is_file_cache(page))
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
}
}
unlock_page_cgroup(page);
unlock_page_cgroup(pc);
if (mem) {
ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
ctype, mem);
@@ -778,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
*/
if (!newpage->mapping)
__mem_cgroup_uncharge_common(newpage,
MEM_CGROUP_CHARGE_TYPE_FORCE);
MEM_CGROUP_CHARGE_TYPE_FORCE);
else if (PageAnon(newpage))
mem_cgroup_uncharge_page(newpage);
}
@@ -801,11 +761,16 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
rcu_read_lock();
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!mem)) {
rcu_read_unlock();
return 0;
}
css_get(&mem->css);
rcu_read_unlock();
do {
progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
progress += res_counter_check_under_limit(&mem->res);
} while (!progress && --retry);
css_put(&mem->css);
@@ -845,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
#define FORCE_UNCHARGE_BATCH (128)
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
int active)
enum lru_list lru)
{
struct page_cgroup *pc;
struct page *page;
@@ -853,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
unsigned long flags;
struct list_head *list;
if (active)
list = &mz->active_list;
else
list = &mz->inactive_list;
list = &mz->lists[lru];
spin_lock_irqsave(&mz->lru_lock, flags);
while (!list_empty(list)) {
pc = list_entry(list->prev, struct page_cgroup, lru);
page = pc->page;
if (!PageCgroupUsed(pc))
break;
get_page(page);
spin_unlock_irqrestore(&mz->lru_lock, flags);
/*
@@ -876,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
count = FORCE_UNCHARGE_BATCH;
cond_resched();
}
} else
cond_resched();
} else {
spin_lock_irqsave(&mz->lru_lock, flags);
break;
}
spin_lock_irqsave(&mz->lru_lock, flags);
}
spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -901,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
while (mem->res.usage > 0) {
if (atomic_read(&mem->css.cgroup->count) > 0)
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
for_each_node_state(node, N_POSSIBLE)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct mem_cgroup_per_zone *mz;
enum lru_list l;
mz = mem_cgroup_zoneinfo(mem, node, zid);
/* drop all page_cgroup in active_list */
mem_cgroup_force_empty_list(mem, mz, 1);
/* drop all page_cgroup in inactive_list */
mem_cgroup_force_empty_list(mem, mz, 0);
for_each_lru(l)
mem_cgroup_force_empty_list(mem, mz, l);
}
cond_resched();
}
ret = 0;
out:
@@ -994,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
}
/* showing # of active pages */
{
unsigned long active, inactive;
unsigned long active_anon, inactive_anon;
unsigned long active_file, inactive_file;
unsigned long unevictable;
inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
LRU_INACTIVE_ANON);
active_anon = mem_cgroup_get_all_zonestat(mem_cont,
LRU_ACTIVE_ANON);
inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
LRU_INACTIVE_FILE);
active_file = mem_cgroup_get_all_zonestat(mem_cont,
LRU_ACTIVE_FILE);
unevictable = mem_cgroup_get_all_zonestat(mem_cont,
LRU_UNEVICTABLE);
cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
inactive = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
cb->fill(cb, "active", (active) * PAGE_SIZE);
cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
}
@@ -1044,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
enum lru_list l;
int zone, tmp = node;
/*
* This routine is called against possible nodes.
@@ -1064,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
INIT_LIST_HEAD(&mz->active_list);
INIT_LIST_HEAD(&mz->inactive_list);
spin_lock_init(&mz->lru_lock);
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
}
return 0;
}
@@ -1107,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
if (unlikely((cont->parent) == NULL)) {
mem = &init_mem_cgroup;
page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
} else {
mem = mem_cgroup_alloc();
if (!mem)

View File

@@ -1129,12 +1129,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
return !vma->vm_ops || !vma->vm_ops->fault;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
unsigned int vm_flags;
unsigned int vm_flags = 0;
int write = !!(flags & GUP_FLAGS_WRITE);
int force = !!(flags & GUP_FLAGS_FORCE);
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
if (len <= 0)
return 0;
@@ -1158,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (write) /* user gate pages are read-only */
/* user gate pages are read-only */
if (!ignore && write)
return i ? : -EFAULT;
if (pg > TASK_SIZE)
pgd = pgd_offset_k(pg);
@@ -1190,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
continue;
}
if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
|| !(vm_flags & vma->vm_flags))
if (!vma ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
(!ignore && !(vm_flags & vma->vm_flags)))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} while (len);
return i;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
int flags = 0;
if (write)
flags |= GUP_FLAGS_WRITE;
if (force)
flags |= GUP_FLAGS_FORCE;
return __get_user_pages(tsk, mm,
start, len, flags,
pages, vmas);
}
EXPORT_SYMBOL(get_user_pages);
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1296,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
if (retval)
goto out;
retval = -EINVAL;
if (PageAnon(page))
goto out_uncharge;
goto out;
retval = -ENOMEM;
flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out_uncharge;
goto out;
retval = -EBUSY;
if (!pte_none(*pte))
goto out_unlock;
@@ -1323,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
out_uncharge:
mem_cgroup_uncharge_page(page);
out:
return retval;
}
@@ -1858,6 +1877,15 @@ gotten:
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED) {
lock_page(old_page); /* for LRU manipulation */
clear_page_mlock(old_page);
unlock_page(old_page);
}
cow_user_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
@@ -1886,11 +1914,13 @@ gotten:
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
SetPageSwapBacked(new_page);
lru_cache_add_active_or_unevictable(new_page, vma);
page_add_new_anon_rmap(new_page, vma, address);
//TODO: is this safe? do_anonymous_page() does it this way.
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2288,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(PGMAJFAULT);
}
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
ret = VM_FAULT_OOM;
goto out;
}
mark_page_accessed(page);
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
unlock_page(page);
goto out;
}
/*
* Back out if somebody else already faulted in this pte.
*/
@@ -2324,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_add_anon_rmap(page, vma, address);
swap_free(entry);
if (vm_swap_full())
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
remove_exclusive_swap_page(page);
unlock_page(page);
@@ -2382,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
SetPageSwapBacked(page);
lru_cache_add_active_or_unevictable(page, vma);
page_add_new_anon_rmap(page, vma, address);
set_pte_at(mm, address, page_table, entry);
@@ -2423,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
pte_t entry;
int anon = 0;
int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
@@ -2463,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_OOM;
goto out;
}
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
page_cache_release(page);
goto out;
}
charged = 1;
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED)
clear_page_mlock(vmf.page);
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
@@ -2497,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
goto out;
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
@@ -2520,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
page_add_new_anon_rmap(page, vma, address);
inc_mm_counter(mm, anon_rss);
SetPageSwapBacked(page);
lru_cache_add_active_or_unevictable(page, vma);
page_add_new_anon_rmap(page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);
@@ -2533,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(dirty_page);
}
}
//TODO: is this safe? do_anonymous_page() does it this way.
set_pte_at(mm, address, page_table, entry);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
} else {
mem_cgroup_uncharge_page(page);
if (charged)
mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
@@ -2772,19 +2815,9 @@ int make_pages_present(unsigned long addr, unsigned long end)
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
if (ret < 0) {
/*
SUS require strange return value to mlock
- invalid addr generate to ENOMEM.
- out of memory should generate EAGAIN.
*/
if (ret == -EFAULT)
ret = -ENOMEM;
else if (ret == -ENOMEM)
ret = -EAGAIN;
if (ret < 0)
return ret;
}
return ret == len ? 0 : -ENOMEM;
return ret == len ? 0 : -EFAULT;
}
#if !defined(__HAVE_ARCH_GATE_AREA)

View File

@@ -26,6 +26,7 @@
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <asm/tlbflush.h>
@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
release_mem_region(pfn << PAGE_SHIFT,
PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
@@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* We can skip free pages. And we can only deal with pages on
* LRU.
*/
ret = isolate_lru_page(page, &source);
ret = isolate_lru_page(page);
if (!ret) { /* Success */
list_add_tail(&page->lru, &source);
move_pages--;
} else {
/* Becasue we don't have big zone->lock. we should
@@ -849,10 +851,19 @@ failed_removal:
return ret;
}
int remove_memory(u64 start, u64 size)
{
unsigned long start_pfn, end_pfn;
start_pfn = PFN_DOWN(start);
end_pfn = start_pfn + PFN_DOWN(size);
return offline_pages(start_pfn, end_pfn, 120 * HZ);
}
#else
int remove_memory(u64 start, u64 size)
{
return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);

View File

@@ -93,6 +93,8 @@
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
/*
* Avoid migrating a page that is shared with others.
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
isolate_lru_page(page, pagelist);
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (!isolate_lru_page(page)) {
list_add_tail(&page->lru, pagelist);
}
}
}
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
@@ -2197,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
if (PageSwapCache(page))
md->swapcache++;
if (PageActive(page))
if (PageActive(page) || PageUnevictable(page))
md->active++;
if (PageWriteback(page))

View File

@@ -36,36 +36,6 @@
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
/*
* Isolate one page from the LRU lists. If successful put it onto
* the indicated list with elevated page count.
*
* Result:
* -EBUSY: page not on LRU list
* 0: page removed from LRU list and added to the specified list.
*/
int isolate_lru_page(struct page *page, struct list_head *pagelist)
{
int ret = -EBUSY;
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && get_page_unless_zero(page)) {
ret = 0;
ClearPageLRU(page);
if (PageActive(page))
del_page_from_active_list(zone, page);
else
del_page_from_inactive_list(zone, page);
list_add_tail(&page->lru, pagelist);
}
spin_unlock_irq(&zone->lru_lock);
}
return ret;
}
/*
* migrate_prep() needs to be called before we start compiling a list of pages
* to be migrated using isolate_lru_page().
@@ -83,23 +53,9 @@ int migrate_prep(void)
return 0;
}
static inline void move_to_lru(struct page *page)
{
if (PageActive(page)) {
/*
* lru_cache_add_active checks that
* the PG_active bit is off.
*/
ClearPageActive(page);
lru_cache_add_active(page);
} else {
lru_cache_add(page);
}
put_page(page);
}
/*
* Add isolated pages on the list back to the LRU.
* Add isolated pages on the list back to the LRU under page lock
* to avoid leaking evictable pages back onto unevictable list.
*
* returns the number of pages put back.
*/
@@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
list_for_each_entry_safe(page, page2, l, lru) {
list_del(&page->lru);
move_to_lru(page);
putback_lru_page(page);
count++;
}
return count;
@@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
__inc_zone_page_state(newpage, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!PageSwapCache(newpage))
mem_cgroup_uncharge_cache_page(page);
return 0;
}
@@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
*/
static void migrate_page_copy(struct page *newpage, struct page *page)
{
int anon;
copy_highpage(newpage, page);
if (PageError(page))
@@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
SetPageReferenced(newpage);
if (PageUptodate(page))
SetPageUptodate(newpage);
if (PageActive(page))
if (TestClearPageActive(page)) {
VM_BUG_ON(PageUnevictable(page));
SetPageActive(newpage);
} else
unevictable_migrate_page(newpage, page);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
@@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
mlock_migrate_page(newpage, page);
#ifdef CONFIG_SWAP
ClearPageSwapCache(page);
#endif
ClearPageActive(page);
ClearPagePrivate(page);
set_page_private(page, 0);
/* page->mapping contains a flag for PageAnon() */
anon = PageAnon(page);
page->mapping = NULL;
if (!anon) /* This page was removed from radix-tree. */
mem_cgroup_uncharge_cache_page(page);
/*
* If any waiters have accumulated on the new page then
* wake them up.
@@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping,
*
* The new page will have replaced the old page if this function
* is successful.
*
* Return value:
* < 0 - error code
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page)
{
@@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page)
/* Prepare mapping for the new page.*/
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
SetPageSwapBacked(newpage);
mapping = page_mapping(page);
if (!mapping)
@@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (!newpage)
return -ENOMEM;
if (page_count(page) == 1)
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
goto move_newpage;
}
charge = mem_cgroup_prepare_migration(page, newpage);
if (charge == -ENOMEM) {
@@ -730,7 +702,6 @@ rcu_unlock:
rcu_read_unlock();
unlock:
unlock_page(page);
if (rc != -EAGAIN) {
@@ -741,17 +712,19 @@ unlock:
* restored.
*/
list_del(&page->lru);
move_to_lru(page);
putback_lru_page(page);
}
move_newpage:
if (!charge)
mem_cgroup_end_migration(newpage);
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
*/
move_to_lru(newpage);
putback_lru_page(newpage);
if (result) {
if (rc)
*result = rc;
@@ -858,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
* Move a set of pages as indicated in the pm array. The addr
* field must be set to the virtual address of the page to be moved
* and the node number must contain a valid target node.
* The pm array ends with node = MAX_NUMNODES.
*/
static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
int migrate_all)
static int do_move_page_to_node_array(struct mm_struct *mm,
struct page_to_node *pm,
int migrate_all)
{
int err;
struct page_to_node *pp;
@@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
!migrate_all)
goto put_and_set;
err = isolate_lru_page(page, &pagelist);
err = isolate_lru_page(page);
if (!err)
list_add_tail(&page->lru, &pagelist);
put_and_set:
/*
* Either remove the duplicate refcount from
@@ -926,36 +903,118 @@ set_status:
pp->status = err;
}
err = 0;
if (!list_empty(&pagelist))
err = migrate_pages(&pagelist, new_page_node,
(unsigned long)pm);
else
err = -ENOENT;
up_read(&mm->mmap_sem);
return err;
}
/*
* Determine the nodes of a list of pages. The addr in the pm array
* must have been set to the virtual address of which we want to determine
* the node number.
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
*/
static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
unsigned long nr_pages,
const void __user * __user *pages,
const int __user *nodes,
int __user *status, int flags)
{
down_read(&mm->mmap_sem);
struct page_to_node *pm = NULL;
nodemask_t task_nodes;
int err = 0;
int i;
for ( ; pm->node != MAX_NUMNODES; pm++) {
struct vm_area_struct *vma;
struct page *page;
int err;
task_nodes = cpuset_mems_allowed(task);
/* Limit nr_pages so that the multiplication may not overflow */
if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
err = -E2BIG;
goto out;
}
pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
if (!pm) {
err = -ENOMEM;
goto out;
}
/*
* Get parameters from user space and initialize the pm
* array. Return various errors if the user did something wrong.
*/
for (i = 0; i < nr_pages; i++) {
const void __user *p;
err = -EFAULT;
vma = find_vma(mm, pm->addr);
if (get_user(p, pages + i))
goto out_pm;
pm[i].addr = (unsigned long)p;
if (nodes) {
int node;
if (get_user(node, nodes + i))
goto out_pm;
err = -ENODEV;
if (!node_state(node, N_HIGH_MEMORY))
goto out_pm;
err = -EACCES;
if (!node_isset(node, task_nodes))
goto out_pm;
pm[i].node = node;
} else
pm[i].node = 0; /* anything to not match MAX_NUMNODES */
}
/* End marker */
pm[nr_pages].node = MAX_NUMNODES;
err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
if (err >= 0)
/* Return status information */
for (i = 0; i < nr_pages; i++)
if (put_user(pm[i].status, status + i))
err = -EFAULT;
out_pm:
vfree(pm);
out:
return err;
}
/*
* Determine the nodes of an array of pages and store it in an array of status.
*/
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
const void __user * __user *pages,
int __user *status)
{
unsigned long i;
int err;
down_read(&mm->mmap_sem);
for (i = 0; i < nr_pages; i++) {
const void __user *p;
unsigned long addr;
struct vm_area_struct *vma;
struct page *page;
err = -EFAULT;
if (get_user(p, pages+i))
goto out;
addr = (unsigned long) p;
vma = find_vma(mm, addr);
if (!vma)
goto set_status;
page = follow_page(vma, pm->addr, 0);
page = follow_page(vma, addr, 0);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
err = page_to_nid(page);
set_status:
pm->status = err;
put_user(err, status+i);
}
err = 0;
out:
up_read(&mm->mmap_sem);
return 0;
return err;
}
/*
@@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
const int __user *nodes,
int __user *status, int flags)
{
int err = 0;
int i;
struct task_struct *task;
nodemask_t task_nodes;
struct mm_struct *mm;
struct page_to_node *pm = NULL;
int err;
/* Check flags */
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
(current->uid != task->suid) && (current->uid != task->uid) &&
!capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out2;
goto out;
}
err = security_task_movememory(task);
if (err)
goto out2;
goto out;
task_nodes = cpuset_mems_allowed(task);
/* Limit nr_pages so that the multiplication may not overflow */
if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
err = -E2BIG;
goto out2;
if (nodes) {
err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
flags);
} else {
err = do_pages_stat(mm, nr_pages, pages, status);
}
pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
if (!pm) {
err = -ENOMEM;
goto out2;
}
/*
* Get parameters from user space and initialize the pm
* array. Return various errors if the user did something wrong.
*/
for (i = 0; i < nr_pages; i++) {
const void __user *p;
err = -EFAULT;
if (get_user(p, pages + i))
goto out;
pm[i].addr = (unsigned long)p;
if (nodes) {
int node;
if (get_user(node, nodes + i))
goto out;
err = -ENODEV;
if (!node_state(node, N_HIGH_MEMORY))
goto out;
err = -EACCES;
if (!node_isset(node, task_nodes))
goto out;
pm[i].node = node;
} else
pm[i].node = 0; /* anything to not match MAX_NUMNODES */
}
/* End marker */
pm[nr_pages].node = MAX_NUMNODES;
if (nodes)
err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
else
err = do_pages_stat(mm, pm);
if (err >= 0)
/* Return status information */
for (i = 0; i < nr_pages; i++)
if (put_user(pm[i].status, status + i))
err = -EFAULT;
out:
vfree(pm);
out2:
mmput(mm);
return err;
}

View File

@@ -8,10 +8,18 @@
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include "internal.h"
int can_do_mlock(void)
{
@@ -23,17 +31,381 @@ int can_do_mlock(void)
}
EXPORT_SYMBOL(can_do_mlock);
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* Mlocked pages are marked with PageMlocked() flag for efficient testing
* in vmscan and, possibly, the fault path; and to support semi-accurate
* statistics.
*
* An mlocked page [PageMlocked(page)] is unevictable. As such, it will
* be placed on the LRU "unevictable" list, rather than the [in]active lists.
* The unevictable list is an LRU sibling list to the [in]active lists.
* PageUnevictable is set to indicate the unevictable state.
*
* When lazy mlocking via vmscan, it is important to ensure that the
* vma's VM_LOCKED status is not concurrently being modified, otherwise we
* may have mlocked a page that is being munlocked. So lazy mlock must take
* the mmap_sem for read, and verify that the vma really is locked
* (see mm/rmap.c).
*/
/*
* LRU accounting for clear_page_mlock()
*/
void __clear_page_mlock(struct page *page)
{
VM_BUG_ON(!PageLocked(page));
if (!page->mapping) { /* truncated ? */
return;
}
dec_zone_page_state(page, NR_MLOCK);
count_vm_event(UNEVICTABLE_PGCLEARED);
if (!isolate_lru_page(page)) {
putback_lru_page(page);
} else {
/*
* Page not on the LRU yet. Flush all pagevecs and retry.
*/
lru_add_drain_all();
if (!isolate_lru_page(page))
putback_lru_page(page);
else if (PageUnevictable(page))
count_vm_event(UNEVICTABLE_PGSTRANDED);
}
}
/*
* Mark page as mlocked if not already.
* If page on LRU, isolate and putback to move to unevictable list.
*/
void mlock_vma_page(struct page *page)
{
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
inc_zone_page_state(page, NR_MLOCK);
count_vm_event(UNEVICTABLE_PGMLOCKED);
if (!isolate_lru_page(page))
putback_lru_page(page);
}
}
/*
* called from munlock()/munmap() path with page supposedly on the LRU.
*
* Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
* [in try_to_munlock()] and then attempt to isolate the page. We must
* isolate the page to keep others from messing with its unevictable
* and mlocked state while trying to munlock. However, we pre-clear the
* mlocked state anyway as we might lose the isolation race and we might
* not get another chance to clear PageMlocked. If we successfully
* isolate the page and try_to_munlock() detects other VM_LOCKED vmas
* mapping the page, it will restore the PageMlocked state, unless the page
* is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
* perhaps redundantly.
* If we lose the isolation race, and the page is mapped by other VM_LOCKED
* vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
* either of which will restore the PageMlocked state by calling
* mlock_vma_page() above, if it can grab the vma's mmap sem.
*/
static void munlock_vma_page(struct page *page)
{
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page)) {
dec_zone_page_state(page, NR_MLOCK);
if (!isolate_lru_page(page)) {
int ret = try_to_munlock(page);
/*
* did try_to_unlock() succeed or punt?
*/
if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
putback_lru_page(page);
} else {
/*
* We lost the race. let try_to_unmap() deal
* with it. At least we get the page state and
* mlock stats right. However, page is still on
* the noreclaim list. We'll fix that up when
* the page is eventually freed or we scan the
* noreclaim list.
*/
if (PageUnevictable(page))
count_vm_event(UNEVICTABLE_PGSTRANDED);
else
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
}
}
}
/**
* __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
* @vma: target vma
* @start: start address
* @end: end address
* @mlock: 0 indicate munlock, otherwise mlock.
*
* If @mlock == 0, unlock an mlocked range;
* else mlock the range of pages. This takes care of making the pages present ,
* too.
*
* return 0 on success, negative error code on error.
*
* vma->vm_mm->mmap_sem must be held for at least read.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int mlock)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
struct page *pages[16]; /* 16 gives a reasonable batch */
int nr_pages = (end - start) / PAGE_SIZE;
int ret;
int gup_flags = 0;
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK);
VM_BUG_ON(start < vma->vm_start);
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
(atomic_read(&mm->mm_users) != 0));
/*
* mlock: don't page populate if page has PROT_NONE permission.
* munlock: the pages always do munlock althrough
* its has PROT_NONE permission.
*/
if (!mlock)
gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
if (vma->vm_flags & VM_WRITE)
gup_flags |= GUP_FLAGS_WRITE;
lru_add_drain_all(); /* push cached pages to LRU */
while (nr_pages > 0) {
int i;
cond_resched();
/*
* get_user_pages makes pages present if we are
* setting mlock. and this extra reference count will
* disable migration of this page. However, page may
* still be truncated out from under us.
*/
ret = __get_user_pages(current, mm, addr,
min_t(int, nr_pages, ARRAY_SIZE(pages)),
gup_flags, pages, NULL);
/*
* This can happen for, e.g., VM_NONLINEAR regions before
* a page has been allocated and mapped at a given offset,
* or for addresses that map beyond end of a file.
* We'll mlock the the pages if/when they get faulted in.
*/
if (ret < 0)
break;
if (ret == 0) {
/*
* We know the vma is there, so the only time
* we cannot get a single page should be an
* error (ret < 0) case.
*/
WARN_ON(1);
break;
}
lru_add_drain(); /* push cached pages to LRU */
for (i = 0; i < ret; i++) {
struct page *page = pages[i];
lock_page(page);
/*
* Because we lock page here and migration is blocked
* by the elevated reference, we need only check for
* page truncation (file-cache only).
*/
if (page->mapping) {
if (mlock)
mlock_vma_page(page);
else
munlock_vma_page(page);
}
unlock_page(page);
put_page(page); /* ref from get_user_pages() */
/*
* here we assume that get_user_pages() has given us
* a list of virtually contiguous pages.
*/
addr += PAGE_SIZE; /* for next get_user_pages() */
nr_pages--;
}
ret = 0;
}
lru_add_drain_all(); /* to update stats */
return ret; /* count entire vma as locked_vm */
}
/*
* convert get_user_pages() return value to posix mlock() error
*/
static int __mlock_posix_error_return(long retval)
{
if (retval == -EFAULT)
retval = -ENOMEM;
else if (retval == -ENOMEM)
retval = -EAGAIN;
return retval;
}
#else /* CONFIG_UNEVICTABLE_LRU */
/*
* Just make pages present if VM_LOCKED. No-op if unlocking.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int mlock)
{
if (mlock && (vma->vm_flags & VM_LOCKED))
return make_pages_present(start, end);
return 0;
}
static inline int __mlock_posix_error_return(long retval)
{
return 0;
}
#endif /* CONFIG_UNEVICTABLE_LRU */
/**
* mlock_vma_pages_range() - mlock pages in specified vma range.
* @vma - the vma containing the specfied address range
* @start - starting address in @vma to mlock
* @end - end address [+1] in @vma to mlock
*
* For mmap()/mremap()/expansion of mlocked vma.
*
* return 0 on success for "normal" vmas.
*
* return number of pages [> 0] to be removed from locked_vm on success
* of "special" vmas.
*
* return negative error if vma spanning @start-@range disappears while
* mmap semaphore is dropped. Unlikely?
*/
long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
int nr_pages = (end - start) / PAGE_SIZE;
BUG_ON(!(vma->vm_flags & VM_LOCKED));
/*
* filter unlockable vmas
*/
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
goto no_mlock;
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
long error;
downgrade_write(&mm->mmap_sem);
error = __mlock_vma_pages_range(vma, start, end, 1);
up_read(&mm->mmap_sem);
/* vma can change or disappear */
down_write(&mm->mmap_sem);
vma = find_vma(mm, start);
/* non-NULL vma must contain @start, but need to check @end */
if (!vma || end > vma->vm_end)
return -ENOMEM;
return 0; /* hide other errors from mmap(), et al */
}
/*
* User mapped kernel pages or huge pages:
* make these pages present to populate the ptes, but
* fall thru' to reset VM_LOCKED--no need to unlock, and
* return nr_pages so these don't get counted against task's
* locked limit. huge pages are already counted against
* locked vm limit.
*/
make_pages_present(start, end);
no_mlock:
vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
return nr_pages; /* error or pages NOT mlocked */
}
/*
* munlock_vma_pages_range() - munlock all pages in the vma range.'
* @vma - vma containing range to be munlock()ed.
* @start - start address in @vma of the range
* @end - end of range in @vma.
*
* For mremap(), munmap() and exit().
*
* Called with @vma VM_LOCKED.
*
* Returns with VM_LOCKED cleared. Callers must be prepared to
* deal with this.
*
* We don't save and restore VM_LOCKED here because pages are
* still on lru. In unmap path, pages might be scanned by reclaim
* and re-mlocked by try_to_{munlock|unmap} before we unmap and
* free them. This will result in freeing mlocked pages.
*/
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
vma->vm_flags &= ~VM_LOCKED;
__mlock_vma_pages_range(vma, start, end, 0);
}
/*
* mlock_fixup - handle mlock[all]/munlock[all] requests.
*
* Filters out "special" vmas -- VM_LOCKED never gets set for these, and
* munlock is a no-op. However, for some special vmas, we go ahead and
* populate the ptes via make_pages_present().
*
* For vmas that pass the filters, merge/split as appropriate.
*/
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, unsigned int newflags)
{
struct mm_struct * mm = vma->vm_mm;
struct mm_struct *mm = vma->vm_mm;
pgoff_t pgoff;
int pages;
int nr_pages;
int ret = 0;
int lock = newflags & VM_LOCKED;
if (newflags == vma->vm_flags) {
*prev = vma;
goto out;
if (newflags == vma->vm_flags ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out; /* don't set VM_LOCKED, don't count */
if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current)) {
if (lock)
make_pages_present(start, end);
goto out; /* don't set VM_LOCKED, don't count */
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto success;
}
*prev = vma;
if (start != vma->vm_start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
@@ -59,25 +429,62 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
}
success:
/*
* Keep track of amount of locked VM.
*/
nr_pages = (end - start) >> PAGE_SHIFT;
if (!lock)
nr_pages = -nr_pages;
mm->locked_vm += nr_pages;
/*
* vm_flags is protected by the mmap_sem held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, make_pages_present below will bring it back.
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
vma->vm_flags = newflags;
/*
* Keep track of amount of locked VM.
*/
pages = (end - start) >> PAGE_SHIFT;
if (newflags & VM_LOCKED) {
pages = -pages;
if (!(newflags & VM_IO))
ret = make_pages_present(start, end);
if (lock) {
/*
* mmap_sem is currently held for write. Downgrade the write
* lock to a read lock so that other faults, mmap scans, ...
* while we fault in all pages.
*/
downgrade_write(&mm->mmap_sem);
ret = __mlock_vma_pages_range(vma, start, end, 1);
/*
* Need to reacquire mmap sem in write mode, as our callers
* expect this. We have no support for atomically upgrading
* a sem to write, so we need to check for ranges while sem
* is unlocked.
*/
up_read(&mm->mmap_sem);
/* vma can change or disappear */
down_write(&mm->mmap_sem);
*prev = find_vma(mm, start);
/* non-NULL *prev must contain @start, but need to check @end */
if (!(*prev) || end > (*prev)->vm_end)
ret = -ENOMEM;
else if (ret > 0) {
mm->locked_vm -= ret;
ret = 0;
} else
ret = __mlock_posix_error_return(ret); /* translate if needed */
} else {
/*
* TODO: for unlocking, pages will already be resident, so
* we don't need to wait for allocations/reclaim/pagein, ...
* However, unlocking a very large region can still take a
* while. Should we downgrade the semaphore for both lock
* AND unlock ?
*/
__mlock_vma_pages_range(vma, start, end, 0);
}
mm->locked_vm -= pages;
out:
*prev = vma;
return ret;
}

View File

@@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}
static inline void __vma_link_file(struct vm_area_struct *vma)
static void __vma_link_file(struct vm_area_struct *vma)
{
struct file * file;
@@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
@@ -972,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
return -EPERM;
vm_flags |= VM_LOCKED;
}
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
@@ -1139,10 +1138,12 @@ munmap_back:
* The VM_SHARED test is necessary because shmem_zero_setup
* will create the file object for a shared anonymous map below.
*/
if (!file && !(vm_flags & VM_SHARED) &&
vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, NULL, pgoff, NULL))
goto out;
if (!file && !(vm_flags & VM_SHARED)) {
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, NULL, pgoff, NULL);
if (vma)
goto out;
}
/*
* Determine the object being mapped and call the appropriate
@@ -1224,10 +1225,14 @@ out:
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
/*
* makes pages present; downgrades, drops, reacquires mmap_sem
*/
long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
if (nr_pages < 0)
return nr_pages; /* vma gone! */
mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
return addr;
@@ -1586,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
#ifndef CONFIG_IA64
static inline
static
#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
@@ -1636,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
static inline int expand_downwards(struct vm_area_struct *vma,
static int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
@@ -1698,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
if (!prev || expand_stack(prev, addr))
if (expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
make_pages_present(addr, prev->vm_end);
if (prev->vm_flags & VM_LOCKED) {
if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
return NULL; /* vma gone! */
}
return prev;
}
#else
@@ -1727,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
make_pages_present(addr, start);
if (vma->vm_flags & VM_LOCKED) {
if (mlock_vma_pages_range(vma, addr, start) < 0)
return NULL; /* vma gone! */
}
return vma;
}
#endif
@@ -1747,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
long nrpages = vma_pages(vma);
mm->total_vm -= nrpages;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
@@ -1913,6 +1920,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
}
vma = prev? prev->vm_next: mm->mmap;
/*
* unlock any mlock()ed ranges before detaching vmas
*/
if (mm->locked_vm) {
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
if (tmp->vm_flags & VM_LOCKED) {
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
tmp = tmp->vm_next;
}
}
/*
* Remove the vma's, and unmap the actual pages
*/
@@ -2025,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
if (vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL))
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL);
if (vma)
goto out;
/*
@@ -2048,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
if (!mlock_vma_pages_range(vma, addr, addr + len))
mm->locked_vm += (len >> PAGE_SHIFT);
}
return addr;
}
@@ -2060,7 +2082,7 @@ EXPORT_SYMBOL(do_brk);
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
struct vm_area_struct *vma = mm->mmap;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2068,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm)
arch_exit_mmap(mm);
mmu_notifier_release(mm);
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
if (vma->vm_flags & VM_LOCKED)
munlock_vma_pages_all(vma);
vma = vma->vm_next;
}
}
vma = mm->mmap;
lru_add_drain();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);

View File

@@ -24,6 +24,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include "internal.h"
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
if (new_len > old_len)
make_pages_present(new_addr + old_len,
new_addr + new_len);
mlock_vma_pages_range(new_vma, new_addr + old_len,
new_addr + new_len);
}
return new_addr;
@@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
make_pages_present(addr + old_len,
mlock_vma_pages_range(vma, addr + old_len,
addr + new_len);
}
ret = addr;

View File

@@ -34,6 +34,8 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"
void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
return PAGE_SIZE << compound_order(page);
}
/*
* get a list of pages in an address range belonging to the specified process
* and indicate the VMA that covers each page
* - this is potentially dodgy as we may end incrementing the page count of a
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
int i;
int write = !!(flags & GUP_FLAGS_WRITE);
int force = !!(flags & GUP_FLAGS_FORCE);
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
/* calculate required read or write permissions.
* - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
/* protect what we can, including chardevs */
if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
!(vm_flags & vma->vm_flags))
(!ignore && !(vm_flags & vma->vm_flags)))
goto finish_or_fault;
if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
finish_or_fault:
return i ? : -EFAULT;
}
/*
* get a list of pages in an address range belonging to the specified process
* and indicate the VMA that covers each page
* - this is potentially dodgy as we may end incrementing the page count of a
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
int flags = 0;
if (write)
flags |= GUP_FLAGS_WRITE;
if (force)
flags |= GUP_FLAGS_FORCE;
return __get_user_pages(tsk, mm,
start, len, flags,
pages, vmas);
}
EXPORT_SYMBOL(get_user_pages);
DEFINE_RWLOCK(vmlist_lock);

View File

@@ -7,7 +7,7 @@
* Contains functions related to writing back dirty pages at the
* address_space level.
*
* 10Apr2002 akpm@zip.com.au
* 10Apr2002 Andrew Morton
* Initial version
*/
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
struct zone *z =
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
x += zone_page_state(z, NR_FREE_PAGES)
+ zone_page_state(z, NR_INACTIVE)
+ zone_page_state(z, NR_ACTIVE);
x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
}
/*
* Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
{
unsigned long x;
x = global_page_state(NR_FREE_PAGES)
+ global_page_state(NR_INACTIVE)
+ global_page_state(NR_ACTIVE);
x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
long nr_to_write = wbc->nr_to_write;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -939,7 +936,7 @@ retry:
unlock_page(page);
ret = 0;
}
if (ret || (--(wbc->nr_to_write) <= 0))
if (ret || (--nr_to_write <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -958,11 +955,12 @@ retry:
index = 0;
goto retry;
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
if (!wbc->no_nrwrite_index_update) {
if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
mapping->writeback_index = index;
wbc->nr_to_write = nr_to_write;
}
if (wbc->range_cont)
wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);

View File

@@ -44,7 +44,7 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <asm/tlbflush.h>
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
static void bad_page(struct page *page)
{
void *pc = page_get_page_cgroup(page);
printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
current->comm, page, (int)(2*sizeof(unsigned long)),
(unsigned long)page->flags, page->mapping,
page_mapcount(page), page_count(page));
if (pc) {
printk(KERN_EMERG "cgroup:%p\n", pc);
page_reset_bad_cgroup(page);
}
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
KERN_EMERG "Backtrace:\n");
dump_stack();
@@ -268,13 +263,14 @@ void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
set_compound_page_dtor(page, free_compound_page);
set_compound_order(page, order);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
for (i = 1; i < nr_pages; i++, p++) {
if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
p = pfn_to_page(page_to_pfn(page) + i);
__SetPageTail(p);
p->first_page = page;
}
@@ -284,6 +280,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
if (unlikely(compound_order(page) != order))
bad_page(page);
@@ -291,8 +288,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (unlikely(!PageHead(page)))
bad_page(page);
__ClearPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
for (i = 1; i < nr_pages; i++, p++) {
if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
p = pfn_to_page(page_to_pfn(page) + i);
if (unlikely(!PageTail(p) |
(p->first_page != page)))
@@ -451,14 +449,16 @@ static inline void __free_one_page(struct page *page,
static inline int free_pages_check(struct page *page)
{
free_page_mlock(page);
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
if (PageSwapBacked(page))
__ClearPageSwapBacked(page);
/*
* For now, we report if PG_reserved was found set, but do not
* clear it, and do not free the page. But we shall soon need
@@ -597,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
bad_page(page);
@@ -611,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
#ifdef CONFIG_UNEVICTABLE_LRU
| 1 << PG_mlocked
#endif
);
set_page_private(page, 0);
set_page_refcounted(page);
@@ -1859,10 +1862,21 @@ void show_free_areas(void)
}
}
printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
" inactive_file:%lu"
//TODO: check/adjust line lengths
#ifdef CONFIG_UNEVICTABLE_LRU
" unevictable:%lu"
#endif
" dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
global_page_state(NR_ACTIVE),
global_page_state(NR_INACTIVE),
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_INACTIVE_FILE),
#ifdef CONFIG_UNEVICTABLE_LRU
global_page_state(NR_UNEVICTABLE),
#endif
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
@@ -1885,8 +1899,13 @@ void show_free_areas(void)
" min:%lukB"
" low:%lukB"
" high:%lukB"
" active:%lukB"
" inactive:%lukB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
#ifdef CONFIG_UNEVICTABLE_LRU
" unevictable:%lukB"
#endif
" present:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
@@ -1896,8 +1915,13 @@ void show_free_areas(void)
K(zone->pages_min),
K(zone->pages_low),
K(zone->pages_high),
K(zone_page_state(zone, NR_ACTIVE)),
K(zone_page_state(zone, NR_INACTIVE)),
K(zone_page_state(zone, NR_ACTIVE_ANON)),
K(zone_page_state(zone, NR_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ACTIVE_FILE)),
K(zone_page_state(zone, NR_INACTIVE_FILE)),
#ifdef CONFIG_UNEVICTABLE_LRU
K(zone_page_state(zone, NR_UNEVICTABLE)),
#endif
K(zone->present_pages),
zone->pages_scanned,
(zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -3407,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
enum lru_list l;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3425,8 +3451,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
mminit_dprintk(MMINIT_TRACE, "memmap_init",
"%s zone: %lu pages used for memmap\n",
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
@@ -3436,8 +3462,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
/* Account for reserved pages */
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
mminit_dprintk(MMINIT_TRACE, "memmap_init",
"%s zone: %lu pages reserved\n",
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
@@ -3462,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->prev_priority = DEF_PRIORITY;
zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
zone->nr_scan_inactive = 0;
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
zone->lru[l].nr_scan = 0;
}
zone->recent_rotated[0] = 0;
zone->recent_rotated[1] = 0;
zone->recent_scanned[0] = 0;
zone->recent_scanned[1] = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
if (!size)
@@ -3949,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat)
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long nid;
enum zone_type i;
int i;
/* Sort early_node_map as initialisation assumes it is sorted */
sort_node_map();
@@ -4207,7 +4236,7 @@ void setup_per_zone_pages_min(void)
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lru_lock, flags);
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone->present_pages;
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
@@ -4239,13 +4268,53 @@ void setup_per_zone_pages_min(void)
zone->pages_low = zone->pages_min + (tmp >> 2);
zone->pages_high = zone->pages_min + (tmp >> 1);
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(&zone->lock, flags);
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
/**
* setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
* to be referenced again before it is swapped out.
*
* The inactive_anon ratio is the target ratio of ACTIVE_ANON to
* INACTIVE_ANON pages on this zone's LRU, maintained by the
* pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
* the anonymous pages are kept on the inactive list.
*
* total target max
* memory ratio inactive anon
* -------------------------------------
* 10MB 1 5MB
* 100MB 1 50MB
* 1GB 3 250MB
* 10GB 10 0.9GB
* 100GB 31 3GB
* 1TB 101 10GB
* 10TB 320 32GB
*/
void setup_per_zone_inactive_ratio(void)
{
struct zone *zone;
for_each_zone(zone) {
unsigned int gb, ratio;
/* Zone size in gigabytes */
gb = zone->present_pages >> (30 - PAGE_SHIFT);
ratio = int_sqrt(10 * gb);
if (!ratio)
ratio = 1;
zone->inactive_ratio = ratio;
}
}
/*
* Initialise min_free_kbytes.
*
@@ -4283,6 +4352,7 @@ static int __init init_per_zone_pages_min(void)
min_free_kbytes = 65536;
setup_per_zone_pages_min();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
}
module_init(init_per_zone_pages_min)

256
mm/page_cgroup.c Normal file
View File

@@ -0,0 +1,256 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
pc->flags = 0;
pc->mem_cgroup = NULL;
pc->page = pfn_to_page(pfn);
}
static unsigned long total_usage;
#if !defined(CONFIG_SPARSEMEM)
void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
pgdat->node_page_cgroup = NULL;
}
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long offset;
struct page_cgroup *base;
base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
if (unlikely(!base))
return NULL;
offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
return base + offset;
}
static int __init alloc_node_page_cgroup(int nid)
{
struct page_cgroup *base, *pc;
unsigned long table_size;
unsigned long start_pfn, nr_pages, index;
start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
table_size = sizeof(struct page_cgroup) * nr_pages;
base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!base)
return -ENOMEM;
for (index = 0; index < nr_pages; index++) {
pc = base + index;
__init_page_cgroup(pc, start_pfn + index);
}
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
return 0;
}
void __init page_cgroup_init(void)
{
int nid, fail;
if (mem_cgroup_subsys.disabled)
return;
for_each_online_node(nid) {
fail = alloc_node_page_cgroup(nid);
if (fail)
goto fail;
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
printk(KERN_INFO "please try cgroup_disable=memory option if you"
" don't want\n");
return;
fail:
printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
panic("Out of memory");
}
#else /* CONFIG_FLAT_NODE_MEM_MAP */
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
return section->page_cgroup + pfn;
}
int __meminit init_section_page_cgroup(unsigned long pfn)
{
struct mem_section *section;
struct page_cgroup *base, *pc;
unsigned long table_size;
int nid, index;
section = __pfn_to_section(pfn);
if (section->page_cgroup)
return 0;
nid = page_to_nid(pfn_to_page(pfn));
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
if (slab_is_available()) {
base = kmalloc_node(table_size, GFP_KERNEL, nid);
if (!base)
base = vmalloc_node(table_size, nid);
} else {
base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
}
if (!base) {
printk(KERN_ERR "page cgroup allocation failure\n");
return -ENOMEM;
}
for (index = 0; index < PAGES_PER_SECTION; index++) {
pc = base + index;
__init_page_cgroup(pc, pfn + index);
}
section = __pfn_to_section(pfn);
section->page_cgroup = base - pfn;
total_usage += table_size;
return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
struct mem_section *ms;
struct page_cgroup *base;
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_cgroup)
return;
base = ms->page_cgroup + pfn;
if (is_vmalloc_addr(base)) {
vfree(base);
ms->page_cgroup = NULL;
} else {
struct page *page = virt_to_page(base);
if (!PageReserved(page)) { /* Is bootmem ? */
kfree(base);
ms->page_cgroup = NULL;
}
}
}
int online_page_cgroup(unsigned long start_pfn,
unsigned long nr_pages,
int nid)
{
unsigned long start, end, pfn;
int fail = 0;
start = start_pfn & (PAGES_PER_SECTION - 1);
end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
if (!pfn_present(pfn))
continue;
fail = init_section_page_cgroup(pfn);
}
if (!fail)
return 0;
/* rollback */
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
return -ENOMEM;
}
int offline_page_cgroup(unsigned long start_pfn,
unsigned long nr_pages, int nid)
{
unsigned long start, end, pfn;
start = start_pfn & (PAGES_PER_SECTION - 1);
end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
return 0;
}
static int page_cgroup_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
struct memory_notify *mn = arg;
int ret = 0;
switch (action) {
case MEM_GOING_ONLINE:
ret = online_page_cgroup(mn->start_pfn,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
case MEM_OFFLINE:
offline_page_cgroup(mn->start_pfn,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
case MEM_CANCEL_OFFLINE:
break;
}
ret = notifier_from_errno(ret);
return ret;
}
#endif
void __init page_cgroup_init(void)
{
unsigned long pfn;
int fail = 0;
if (mem_cgroup_subsys.disabled)
return;
for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
if (!pfn_present(pfn))
continue;
fail = init_section_page_cgroup(pfn);
}
if (fail) {
printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
panic("Out of memory");
} else {
hotplug_memory_notifier(page_cgroup_callback, 0);
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
" want\n");
}
void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
return;
}
#endif

View File

@@ -114,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
int ret;
pfn = start_pfn;
/*
@@ -131,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
if (pfn < end_pfn)
return -EBUSY;
/* Check all pages are free or Marked as ISOLATED */
if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
return 0;
return -EBUSY;
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}

View File

@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds.
*
* 09Apr2002 akpm@zip.com.au
* 09Apr2002 Andrew Morton
* Initial version
* 29Feb2004 kaos@sgi.com
* Move worker thread creation to kthread to avoid chewing

View File

@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds
*
* 09Apr2002 akpm@zip.com.au
* 09Apr2002 Andrew Morton
* Initial version.
*/
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
*/
unsigned long max_sane_readahead(unsigned long nr)
{
return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}

319
mm/rmap.c
View File

@@ -53,9 +53,47 @@
#include <asm/tlbflush.h>
struct kmem_cache *anon_vma_cachep;
#include "internal.h"
/* This must be called under the mmap_sem. */
static struct kmem_cache *anon_vma_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
kmem_cache_free(anon_vma_cachep, anon_vma);
}
/**
* anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
*
* This makes sure the memory mapping described by 'vma' has
* an 'anon_vma' attached to it, so that we can associate the
* anonymous pages mapped into it with that anon_vma.
*
* The common case will be that we already have one, but if
* if not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
* optimistically looked up an anon_vma in page_lock_anon_vma()
* and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
*
* As a result, we need to do proper anon_vma locking even
* for the new allocation. At the same time, we do not want
* to do any locking for the common case of already having
* an anon_vma.
*
* This must be called with the mmap_sem held for reading.
*/
int anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
@@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
might_sleep();
if (unlikely(!anon_vma)) {
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated, *locked;
struct anon_vma *allocated;
anon_vma = find_mergeable_anon_vma(vma);
if (anon_vma) {
allocated = NULL;
locked = anon_vma;
spin_lock(&locked->lock);
} else {
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
return -ENOMEM;
allocated = anon_vma;
locked = NULL;
}
spin_lock(&anon_vma->lock);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
@@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
}
spin_unlock(&mm->page_table_lock);
if (locked)
spin_unlock(&locked->lock);
spin_unlock(&anon_vma->lock);
if (unlikely(allocated))
anon_vma_free(allocated);
}
@@ -157,7 +191,7 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
static struct anon_vma *page_lock_anon_vma(struct page *page)
struct anon_vma *page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma;
unsigned long anon_mapping;
@@ -177,7 +211,7 @@ out:
return NULL;
}
static void page_unlock_anon_vma(struct anon_vma *anon_vma)
void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
@@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
return NULL;
}
/**
* page_mapped_in_vma - check whether a page is really mapped in a VMA
* @page: the page to test
* @vma: the VMA to test
*
* Returns 1 if the page is mapped into the page tables of the VMA, 0
* if the page is not mapped into the page tables of this VMA. Only
* valid for normal file or anonymous VMAs.
*/
static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
address = vma_address(page, vma);
if (address == -EFAULT) /* out of vma range */
return 0;
pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
if (!pte) /* the page is not in this mm */
return 0;
pte_unmap_unlock(pte, ptl);
return 1;
}
/*
* Subfunctions of page_referenced: page_referenced_one called
* repeatedly from either page_referenced_anon or page_referenced_file.
@@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page,
if (!pte)
goto out;
/*
* Don't want to elevate referenced for mlocked page that gets this far,
* in order that it progresses to try_to_unmap and is moved to the
* unevictable list.
*/
if (vma->vm_flags & VM_LOCKED) {
referenced++;
*mapcount = 1; /* break early from loop */
} else if (ptep_clear_flush_young_notify(vma, address, pte))
goto out_unmap;
}
if (ptep_clear_flush_young_notify(vma, address, pte))
referenced++;
/* Pretend the page is referenced if the task has the
@@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page,
rwsem_is_locked(&mm->mmap_sem))
referenced++;
out_unmap:
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
out:
@@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
== (VM_LOCKED|VM_MAYSHARE)) {
referenced++;
break;
}
referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
break;
@@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
page_clear_dirty(page);
set_page_dirty(page);
}
mem_cgroup_uncharge_page(page);
if (PageAnon(page))
mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page,
PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
/*
@@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
*/
if (!migration && ((vma->vm_flags & VM_LOCKED) ||
(ptep_clear_flush_young_notify(vma, address, pte)))) {
ret = SWAP_FAIL;
goto out_unmap;
}
if (!migration) {
if (vma->vm_flags & VM_LOCKED) {
ret = SWAP_MLOCK;
goto out_unmap;
}
if (ptep_clear_flush_young_notify(vma, address, pte)) {
ret = SWAP_FAIL;
goto out_unmap;
}
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
@@ -802,12 +870,17 @@ out:
* For very sparsely populated VMAs this is a little inefficient - chances are
* there there won't be many ptes located within the scan cluster. In this case
* maybe we could scan further - to the end of the pte page, perhaps.
*
* Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
* acquire it without blocking. If vma locked, mlock the pages in the cluster,
* rather than unmapping them. If we encounter the "check_page" that vmscan is
* trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
*/
#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
static void try_to_unmap_cluster(unsigned long cursor,
unsigned int *mapcount, struct vm_area_struct *vma)
static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
struct vm_area_struct *vma, struct page *check_page)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
struct page *page;
unsigned long address;
unsigned long end;
int ret = SWAP_AGAIN;
int locked_vma = 0;
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
@@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return;
return ret;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return;
return ret;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return;
return ret;
/*
* MLOCK_PAGES => feature is configured.
* if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
* keep the sem while scanning the cluster for mlocking pages.
*/
if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
locked_vma = (vma->vm_flags & VM_LOCKED);
if (!locked_vma)
up_read(&vma->vm_mm->mmap_sem); /* don't need it */
}
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
mlock_vma_page(page); /* no-op if already mlocked */
if (page == check_page)
ret = SWAP_MLOCK;
continue; /* don't unmap */
}
if (ptep_clear_flush_young_notify(vma, address, pte))
continue;
@@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
if (locked_vma)
up_read(&vma->vm_mm->mmap_sem);
return ret;
}
static int try_to_unmap_anon(struct page *page, int migration)
/*
* common handling for pages mapped in VM_LOCKED vmas
*/
static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
{
int mlocked = 0;
if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
if (vma->vm_flags & VM_LOCKED) {
mlock_vma_page(page);
mlocked++; /* really mlocked the page */
}
up_read(&vma->vm_mm->mmap_sem);
}
return mlocked;
}
/**
* try_to_unmap_anon - unmap or unlock anonymous page using the object-based
* rmap method
* @page: the page to unmap/unlock
* @unlock: request for unlock rather than unmap [unlikely]
* @migration: unmapping for migration - ignored if @unlock
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
* This function is only called from try_to_unmap/try_to_munlock for
* anonymous pages.
* When called from try_to_munlock(), the mmap_sem of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* 'LOCKED.
*/
static int try_to_unmap_anon(struct page *page, int unlock, int migration)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
unsigned int mlocked = 0;
int ret = SWAP_AGAIN;
if (MLOCK_PAGES && unlikely(unlock))
ret = SWAP_SUCCESS; /* default for try_to_munlock() */
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
break;
if (MLOCK_PAGES && unlikely(unlock)) {
if (!((vma->vm_flags & VM_LOCKED) &&
page_mapped_in_vma(page, vma)))
continue; /* must visit all unlocked vmas */
ret = SWAP_MLOCK; /* saw at least one mlocked vma */
} else {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
break;
}
if (ret == SWAP_MLOCK) {
mlocked = try_to_mlock_page(page, vma);
if (mlocked)
break; /* stop if actually mlocked page */
}
}
page_unlock_anon_vma(anon_vma);
if (mlocked)
ret = SWAP_MLOCK; /* actually mlocked the page */
else if (ret == SWAP_MLOCK)
ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
return ret;
}
/**
* try_to_unmap_file - unmap file page using the object-based rmap method
* @page: the page to unmap
* @migration: migration flag
* try_to_unmap_file - unmap/unlock file page using the object-based rmap method
* @page: the page to unmap/unlock
* @unlock: request for unlock rather than unmap [unlikely]
* @migration: unmapping for migration - ignored if @unlock
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
* This function is only called from try_to_unmap for object-based pages.
* This function is only called from try_to_unmap/try_to_munlock for
* object-based pages.
* When called from try_to_munlock(), the mmap_sem of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* 'LOCKED.
*/
static int try_to_unmap_file(struct page *page, int migration)
static int try_to_unmap_file(struct page *page, int unlock, int migration)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
unsigned int mapcount;
unsigned int mlocked = 0;
if (MLOCK_PAGES && unlikely(unlock))
ret = SWAP_SUCCESS; /* default for try_to_munlock() */
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
goto out;
if (MLOCK_PAGES && unlikely(unlock)) {
if (!(vma->vm_flags & VM_LOCKED))
continue; /* must visit all vmas */
ret = SWAP_MLOCK;
} else {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
goto out;
}
if (ret == SWAP_MLOCK) {
mlocked = try_to_mlock_page(page, vma);
if (mlocked)
break; /* stop if actually mlocked page */
}
}
if (mlocked)
goto out;
if (list_empty(&mapping->i_mmap_nonlinear))
goto out;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
if ((vma->vm_flags & VM_LOCKED) && !migration)
if (MLOCK_PAGES && unlikely(unlock)) {
if (!(vma->vm_flags & VM_LOCKED))
continue; /* must visit all vmas */
ret = SWAP_MLOCK; /* leave mlocked == 0 */
goto out; /* no need to look further */
}
if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
continue;
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
@@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
max_nl_size = cursor;
}
if (max_nl_size == 0) { /* any nonlinears locked or reserved */
if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
ret = SWAP_FAIL;
goto out;
}
@@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
if ((vma->vm_flags & VM_LOCKED) && !migration)
if (!MLOCK_PAGES && !migration &&
(vma->vm_flags & VM_LOCKED))
continue;
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
try_to_unmap_cluster(cursor, &mapcount, vma);
ret = try_to_unmap_cluster(cursor, &mapcount,
vma, page);
if (ret == SWAP_MLOCK)
mlocked = 2; /* to return below */
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
@@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
vma->vm_private_data = NULL;
out:
spin_unlock(&mapping->i_mmap_lock);
if (mlocked)
ret = SWAP_MLOCK; /* actually mlocked the page */
else if (ret == SWAP_MLOCK)
ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
return ret;
}
@@ -1002,6 +1192,7 @@ out:
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a mapping, try again later
* SWAP_FAIL - the page is unswappable
* SWAP_MLOCK - page is mlocked.
*/
int try_to_unmap(struct page *page, int migration)
{
@@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
BUG_ON(!PageLocked(page));
if (PageAnon(page))
ret = try_to_unmap_anon(page, migration);
ret = try_to_unmap_anon(page, 0, migration);
else
ret = try_to_unmap_file(page, migration);
if (!page_mapped(page))
ret = try_to_unmap_file(page, 0, migration);
if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
}
#ifdef CONFIG_UNEVICTABLE_LRU
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
*
* Called from munlock code. Checks all of the VMAs mapping the page
* to make sure nobody else has this page mlocked. The page will be
* returned with PG_mlocked cleared if no other vmas have it mlocked.
*
* Return values are:
*
* SWAP_SUCCESS - no vma's holding page mlocked.
* SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
* SWAP_MLOCK - page is now mlocked.
*/
int try_to_munlock(struct page *page)
{
VM_BUG_ON(!PageLocked(page) || PageLRU(page));
if (PageAnon(page))
return try_to_unmap_anon(page, 1, 0);
else
return try_to_unmap_file(page, 1, 0);
}
#endif

View File

@@ -50,14 +50,12 @@
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>
/* This magic number is used in glibc for posix shared memory */
#define TMPFS_MAGIC 0x01021994
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
@@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
.unplug_io_fn = default_unplug_io_fn,
};
@@ -1369,6 +1367,7 @@ repeat:
error = -ENOMEM;
goto failed;
}
SetPageSwapBacked(filepage);
/* Precharge page while we can wait, compensate after */
error = mem_cgroup_cache_charge(filepage, current->mm,
@@ -1478,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
if (!lock && (info->flags & VM_LOCKED) && user) {
user_shm_unlock(inode->i_size, user);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
scan_mapping_unevictable_pages(file->f_mapping);
}
retval = 0;
out_nomem:
spin_unlock(&info->lock);
return retval;
@@ -2582,6 +2585,7 @@ put_memory:
shmem_unacct_size(flags, size);
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
* shmem_zero_setup - setup a shared anonymous mapping

View File

@@ -95,6 +95,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
@@ -4258,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p)
* + further values on SMP and with statistics enabled
*/
const struct seq_operations slabinfo_op = {
static const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
@@ -4315,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
return res;
}
static int slabinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &slabinfo_op);
}
static const struct file_operations proc_slabinfo_operations = {
.open = slabinfo_open,
.read = seq_read,
.write = slabinfo_write,
.llseek = seq_lseek,
.release = seq_release,
};
#ifdef CONFIG_DEBUG_SLAB_LEAK
static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4443,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p)
return 0;
}
const struct seq_operations slabstats_op = {
static const struct seq_operations slabstats_op = {
.start = leaks_start,
.next = s_next,
.stop = s_stop,
.show = leaks_show,
};
static int slabstats_open(struct inode *inode, struct file *file)
{
unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
int ret = -ENOMEM;
if (n) {
ret = seq_open(file, &slabstats_op);
if (!ret) {
struct seq_file *m = file->private_data;
*n = PAGE_SIZE / (2 * sizeof(unsigned long));
m->private = n;
n = NULL;
}
kfree(n);
}
return ret;
}
static const struct file_operations proc_slabstats_operations = {
.open = slabstats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
#endif
static int __init slab_proc_init(void)
{
proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
return 0;
}
module_init(slab_proc_init);
#endif
/**

View File

@@ -514,9 +514,11 @@ size_t ksize(const void *block)
return 0;
sp = (struct slob_page *)virt_to_page(block);
if (slob_page(sp))
return ((slob_t *)block - 1)->units + SLOB_UNIT;
else
if (slob_page(sp)) {
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
} else
return sp->page.private;
}

View File

@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -4417,14 +4418,6 @@ __initcall(slab_sysfs_init);
* The /proc/slabinfo ABI
*/
#ifdef CONFIG_SLABINFO
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
return -EINVAL;
}
static void print_slabinfo_header(struct seq_file *m)
{
seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4492,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p)
return 0;
}
const struct seq_operations slabinfo_op = {
static const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
static int slabinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &slabinfo_op);
}
static const struct file_operations proc_slabinfo_operations = {
.open = slabinfo_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init slab_proc_init(void)
{
proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
return 0;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */

184
mm/swap.c
View File

@@ -31,11 +31,12 @@
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include "internal.h"
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
/*
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
zone = pagezone;
spin_lock(&zone->lru_lock);
}
if (PageLRU(page) && !PageActive(page)) {
list_move_tail(&page->lru, &zone->inactive_list);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int lru = page_is_file_cache(page);
list_move_tail(&page->lru, &zone->lru[lru].list);
pgmoved++;
}
}
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
void rotate_reclaimable_page(struct page *page)
{
if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
PageLRU(page)) {
!PageUnevictable(page) && PageLRU(page)) {
struct pagevec *pvec;
unsigned long flags;
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(zone, page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int file = page_is_file_cache(page);
int lru = LRU_BASE + file;
del_page_from_lru_list(zone, page, lru);
SetPageActive(page);
add_page_to_active_list(zone, page);
lru += LRU_ACTIVE;
add_page_to_lru_list(zone, page, lru);
__count_vm_event(PGACTIVATE);
mem_cgroup_move_lists(page, true);
mem_cgroup_move_lists(page, lru);
zone->recent_rotated[!!file]++;
zone->recent_scanned[!!file]++;
}
spin_unlock_irq(&zone->lru_lock);
}
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
*/
void mark_page_accessed(struct page *page)
{
if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page) && PageLRU(page)) {
activate_page(page);
ClearPageReferenced(page);
} else if (!PageReferenced(page)) {
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
EXPORT_SYMBOL(mark_page_accessed);
/**
* lru_cache_add: add a page to the page lists
* @page: the page to add
*/
void lru_cache_add(struct page *page)
void __lru_cache_add(struct page *page, enum lru_list lru)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
page_cache_get(page);
if (!pagevec_add(pvec, page))
__pagevec_lru_add(pvec);
____pagevec_lru_add(pvec, lru);
put_cpu_var(lru_add_pvecs);
}
void lru_cache_add_active(struct page *page)
/**
* lru_cache_add_lru - add a page to a page list
* @page: the page to be added to the LRU.
* @lru: the LRU list to which the page is added.
*/
void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
if (PageActive(page)) {
VM_BUG_ON(PageUnevictable(page));
ClearPageActive(page);
} else if (PageUnevictable(page)) {
VM_BUG_ON(PageActive(page));
ClearPageUnevictable(page);
}
page_cache_get(page);
if (!pagevec_add(pvec, page))
__pagevec_lru_add_active(pvec);
put_cpu_var(lru_add_active_pvecs);
VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
__lru_cache_add(page, lru);
}
/**
* add_page_to_unevictable_list - add a page to the unevictable list
* @page: the page to be added to the unevictable list
*
* Add page directly to its zone's unevictable list. To avoid races with
* tasks that might be making the page evictable, through eg. munlock,
* munmap or exit, while it's not on the lru, we want to add the page
* while it's locked or otherwise "invisible" to other tasks. This is
* difficult to do when using the pagevec cache, so bypass that.
*/
void add_page_to_unevictable_list(struct page *page)
{
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
SetPageUnevictable(page);
SetPageLRU(page);
add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
spin_unlock_irq(&zone->lru_lock);
}
/**
* lru_cache_add_active_or_unevictable
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
* place @page on active or unevictable LRU list, depending on
* page_evictable(). Note that if the page is not evictable,
* it goes directly back onto it's zone's unevictable list. It does
* NOT use a per cpu pagevec.
*/
void lru_cache_add_active_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
else
add_page_to_unevictable_list(page);
}
/*
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
*/
static void drain_cpu_pagevecs(int cpu)
{
struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
struct pagevec *pvec;
int lru;
pvec = &per_cpu(lru_add_pvecs, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
pvec = &per_cpu(lru_add_active_pvecs, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add_active(pvec);
for_each_lru(lru) {
pvec = &pvecs[lru - LRU_BASE];
if (pagevec_count(pvec))
____pagevec_lru_add(pvec, lru);
}
pvec = &per_cpu(lru_rotate_pvecs, cpu);
if (pagevec_count(pvec)) {
@@ -244,7 +299,7 @@ void lru_add_drain(void)
put_cpu();
}
#ifdef CONFIG_NUMA
#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
@@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irqrestore(&zone->lru_lock,
@@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
void __pagevec_lru_add(struct pagevec *pvec)
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
int i;
struct zone *zone = NULL;
VM_BUG_ON(is_unevictable_lru(lru));
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -395,38 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
add_page_to_inactive_list(zone, page);
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_lru_add);
void __pagevec_lru_add_active(struct pagevec *pvec)
{
int i;
struct zone *zone = NULL;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
VM_BUG_ON(PageActive(page));
SetPageActive(page);
add_page_to_active_list(zone, page);
VM_BUG_ON(PageUnevictable(page));
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
if (is_active_lru(lru))
SetPageActive(page);
add_page_to_lru_list(zone, page, lru);
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
@@ -434,6 +466,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
pagevec_reinit(pvec);
}
EXPORT_SYMBOL(____pagevec_lru_add);
/*
* Try to drop buffers from the pages in a pagevec
*/
@@ -452,6 +486,30 @@ void pagevec_strip(struct pagevec *pvec)
}
}
/**
* pagevec_swap_free - try to free swap space from the pages in a pagevec
* @pvec: pagevec with swapcache pages to free the swap space of
*
* The caller needs to hold an extra reference to each page and
* not hold the page lock on the pages. This function uses a
* trylock on the page lock so it may not always free the swap
* space associated with a page.
*/
void pagevec_swap_free(struct pagevec *pvec)
{
int i;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
if (PageSwapCache(page) && trylock_page(page)) {
if (PageSwapCache(page))
remove_exclusive_swap_page_ref(page);
unlock_page(page);
}
}
}
/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed

View File

@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
};
static struct backing_dev_info swap_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
.unplug_io_fn = swap_unplug_io_fn,
};
@@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
BUG_ON(!PageLocked(page));
BUG_ON(PageSwapCache(page));
BUG_ON(PagePrivate(page));
BUG_ON(!PageSwapBacked(page));
error = radix_tree_preload(gfp_mask);
if (!error) {
page_cache_get(page);
@@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* re-using the just freed swap entry for an existing page.
* May fail (-ENOMEM) if radix-tree node allocation failed.
*/
set_page_locked(new_page);
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
/*
* Initiate read into locked page and return.
*/
lru_cache_add_active(new_page);
lru_cache_add_anon(new_page);
swap_readpage(NULL, new_page);
return new_page;
}
clear_page_locked(new_page);
ClearPageSwapBacked(new_page);
__clear_page_locked(new_page);
swap_free(entry);
} while (err != -ENOMEM);

View File

@@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page)
* Work out if there are any other processes sharing this
* swap cache page. Free it if you can. Return success.
*/
int remove_exclusive_swap_page(struct page *page)
static int remove_exclusive_swap_page_count(struct page *page, int count)
{
int retval;
struct swap_info_struct * p;
@@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
return 0;
if (PageWriteback(page))
return 0;
if (page_count(page) != 2) /* 2: us + cache */
if (page_count(page) != count) /* us + cache + ptes */
return 0;
entry.val = page_private(page);
@@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page)
if (p->swap_map[swp_offset(entry)] == 1) {
/* Recheck the page count with the swapcache lock held.. */
spin_lock_irq(&swapper_space.tree_lock);
if ((page_count(page) == 2) && !PageWriteback(page)) {
if ((page_count(page) == count) && !PageWriteback(page)) {
__delete_from_swap_cache(page);
SetPageDirty(page);
retval = 1;
@@ -387,6 +387,25 @@ int remove_exclusive_swap_page(struct page *page)
return retval;
}
/*
* Most of the time the page should have two references: one for the
* process and one for the swap cache.
*/
int remove_exclusive_swap_page(struct page *page)
{
return remove_exclusive_swap_page_count(page, 2);
}
/*
* The pageout code holds an extra reference to the page. That raises
* the reference count to test for to 2 for a page that is only in the
* swap cache plus 1 for each process that maps the page.
*/
int remove_exclusive_swap_page_ref(struct page *page)
{
return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
}
/*
* Free the swap entry like above, but also try to
* free the page cache entry if it is the last user.
@@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
if (p) {
if (swap_entry_free(p, swp_offset(entry)) == 1) {
page = find_get_page(&swapper_space, entry.val);
if (page && unlikely(!trylock_page(page))) {
if (page && !trylock_page(page)) {
page_cache_release(page);
page = NULL;
}

View File

@@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (!dentry)
goto put_memory;
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto put_dentry;
d_instantiate(dentry, inode);
error = -ENFILE;
file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
&ramfs_file_operations);
file = get_empty_filp();
if (!file)
goto put_dentry;
inode->i_nlink = 0; /* It is unlinked */
/* notify everyone as to the change of file size */
error = do_truncate(dentry, size, 0, file);
if (error < 0)
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto close_file;
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
&ramfs_file_operations);
#ifndef CONFIG_MMU
error = ramfs_nommu_expand_for_mapping(inode, size);
if (error)
goto close_file;
#endif
return file;
close_file:
put_filp(file);
return ERR_PTR(error);
put_dentry:
dput(dentry);
put_memory:
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
* shmem_zero_setup - setup a shared anonymous mapping

View File

@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds
*
* 10Sep2002 akpm@zip.com.au
* 10Sep2002 Andrew Morton
* Initial version.
*/
@@ -18,6 +18,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include "internal.h"
/**
@@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
cancel_dirty_page(page, PAGE_CACHE_SIZE);
clear_page_mlock(page);
remove_from_page_cache(page);
ClearPageMappedToDisk(page);
page_cache_release(page); /* pagecache ref */
@@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (PagePrivate(page) && !try_to_release_page(page, 0))
return 0;
clear_page_mlock(page);
ret = remove_mapping(mapping, page);
return ret;
@@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (PageDirty(page))
goto failed;
clear_page_mlock(page);
BUG_ON(PagePrivate(page));
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,7 @@
* Copyright (C) 2006 Silicon Graphics, Inc.,
* Christoph Lameter <christoph@lameter.com>
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -384,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
#endif
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
static char * const migratetype_names[MIGRATE_TYPES] = {
@@ -581,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
return 0;
}
const struct seq_operations fragmentation_op = {
static const struct seq_operations fragmentation_op = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = frag_show,
};
const struct seq_operations pagetypeinfo_op = {
static int fragmentation_open(struct inode *inode, struct file *file)
{
return seq_open(file, &fragmentation_op);
}
static const struct file_operations fragmentation_file_operations = {
.open = fragmentation_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct seq_operations pagetypeinfo_op = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = pagetypeinfo_show,
};
static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &pagetypeinfo_op);
}
static const struct file_operations pagetypeinfo_file_ops = {
.open = pagetypeinfo_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
@@ -619,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = {
static const char * const vmstat_text[] = {
/* Zoned VM counters */
"nr_free_pages",
"nr_inactive",
"nr_active",
"nr_inactive_anon",
"nr_active_anon",
"nr_inactive_file",
"nr_active_file",
#ifdef CONFIG_UNEVICTABLE_LRU
"nr_unevictable",
"nr_mlock",
#endif
"nr_anon_pages",
"nr_mapped",
"nr_file_pages",
@@ -675,6 +705,16 @@ static const char * const vmstat_text[] = {
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
"unevictable_pgs_culled",
"unevictable_pgs_scanned",
"unevictable_pgs_rescued",
"unevictable_pgs_mlocked",
"unevictable_pgs_munlocked",
"unevictable_pgs_cleared",
"unevictable_pgs_stranded",
"unevictable_pgs_mlockfreed",
#endif
#endif
};
@@ -688,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
"\n scanned %lu (a: %lu i: %lu)"
"\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
"\n spanned %lu"
"\n present %lu",
zone_page_state(zone, NR_FREE_PAGES),
@@ -696,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone->pages_low,
zone->pages_high,
zone->pages_scanned,
zone->nr_scan_active, zone->nr_scan_inactive,
zone->lru[LRU_ACTIVE_ANON].nr_scan,
zone->lru[LRU_INACTIVE_ANON].nr_scan,
zone->lru[LRU_ACTIVE_FILE].nr_scan,
zone->lru[LRU_INACTIVE_FILE].nr_scan,
zone->spanned_pages,
zone->present_pages);
@@ -733,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m,
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
"\n start_pfn: %lu",
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
zone_is_all_unreclaimable(zone),
zone->prev_priority,
zone->zone_start_pfn);
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');
}
@@ -750,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
return 0;
}
const struct seq_operations zoneinfo_op = {
static const struct seq_operations zoneinfo_op = {
.start = frag_start, /* iterate over all zones. The same as in
* fragmentation. */
.next = frag_next,
@@ -758,6 +803,18 @@ const struct seq_operations zoneinfo_op = {
.show = zoneinfo_show,
};
static int zoneinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &zoneinfo_op);
}
static const struct file_operations proc_zoneinfo_file_operations = {
.open = zoneinfo_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
@@ -813,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg)
m->private = NULL;
}
const struct seq_operations vmstat_op = {
static const struct seq_operations vmstat_op = {
.start = vmstat_start,
.next = vmstat_next,
.stop = vmstat_stop,
.show = vmstat_show,
};
static int vmstat_open(struct inode *inode, struct file *file)
{
return seq_open(file, &vmstat_op);
}
static const struct file_operations proc_vmstat_file_operations = {
.open = vmstat_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
@@ -877,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
static struct notifier_block __cpuinitdata vmstat_notifier =
{ &vmstat_cpuup_callback, NULL, 0 };
#endif
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
int cpu;
refresh_zone_stat_thresholds();
@@ -887,7 +957,13 @@ static int __init setup_vmstat(void)
for_each_online_cpu(cpu)
start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
return 0;
}
module_init(setup_vmstat)
#endif