Merge commit 'v2.6.28-rc2' into core/locking
Conflicts: arch/um/include/asm/system.h
mm/Kconfig (18 changes)
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT
# with gcc 3.4 and later.
#
config SPARSEMEM_STATIC
def_bool n
bool

#
# Architecture platforms which require a two level mem_section in SPARSEMEM
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME
depends on SPARSEMEM && !SPARSEMEM_STATIC

config SPARSEMEM_VMEMMAP_ENABLE
def_bool n
bool

config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
@@ -187,6 +187,9 @@ config RESOURCES_64BIT
help
This option allows memory and IO resources to be 64 bit.

config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT

config ZONE_DMA_FLAG
int
default "0" if !ZONE_DMA
@@ -206,5 +209,16 @@ config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS

config UNEVICTABLE_LRU
bool "Add LRU list to track non-evictable pages"
default y
depends on MMU
help
Keeps unevictable pages off of the active and inactive pageout
lists, so kswapd will not waste CPU time or have its balancing
algorithms thrown off by scanning these pages. Selecting this
will use one page flag and increase the code size a little,
say Y unless you know what you are doing.

config MMU_NOTIFIER
bool
mm/Makefile

@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o

obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
mm/bootmem.c

@@ -48,7 +48,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
if (unlikely(bootmem_debug)) \
printk(KERN_INFO \
"bootmem::%s " fmt, \
__FUNCTION__, ## args); \
__func__, ## args); \
})

static unsigned long __init bootmap_bytes(unsigned long pages)
mm/bounce.c

@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
/*
* Data-less bio, nothing to bounce
*/
if (bio_empty_barrier(*bio_orig))
if (!bio_has_data(*bio_orig))
return;

/*
mm/fadvise.c

@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds
*
* 11Jan2003 akpm@digeo.com
* 11Jan2003 Andrew Morton
* Initial version.
*/
mm/filemap.c (55 changes)
@@ -33,6 +33,7 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include "internal.h"

/*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;

mem_cgroup_uncharge_cache_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
mem_cgroup_uncharge_cache_page(page);

/*
* Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0)
lru_cache_add(page);
int ret;

/*
* Splice_read and readahead add shmem/tmpfs pages into the page cache
* before shmem_readpage has a chance to mark them as SwapBacked: they
* need to go on the active_anon lru below, and mem_cgroup_cache_charge
* (called in add_to_page_cache) needs to know where they're going too.
*/
if (mapping_cap_swap_backed(mapping))
SetPageSwapBacked(page);

ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0) {
if (page_is_file_cache(page))
lru_cache_add_file(page);
else
lru_cache_add_active_anon(page);
}
return ret;
}

@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
* mechananism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
* The first mb is necessary to safely close the critical section opened by the
* test_and_set_bit() to lock the page; the second mb is necessary to enforce
* ordering between the clear_bit and the read of the waitqueue (to avoid SMP
* races with a parallel wait_on_page_locked()).
* The mb is necessary to enforce ordering between the clear_bit and the read
* of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
*/
void unlock_page(struct page *page)
{
smp_mb__before_clear_bit();
if (!test_and_clear_bit(PG_locked, &page->flags))
BUG();
smp_mb__after_clear_bit();
VM_BUG_ON(!PageLocked(page));
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

@@ -1100,8 +1113,9 @@ page_ok:

page_not_up_to_date:
/* Get exclusive access to the page ... */
if (lock_page_killable(page))
goto readpage_eio;
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;

page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
@@ -1130,8 +1144,9 @@ readpage:
}

if (!PageUptodate(page)) {
if (lock_page_killable(page))
goto readpage_eio;
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
@@ -1143,15 +1158,14 @@ readpage:
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
goto readpage_eio;
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}

goto page_ok;

readpage_eio:
error = -EIO;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
@@ -1186,8 +1200,7 @@ out:
ra->prev_pos |= prev_offset;

*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
if (filp)
file_accessed(filp);
file_accessed(filp);
}

int file_read_actor(read_descriptor_t *desc, struct page *page,
mm/fremap.c (27 changes)
@@ -21,6 +21,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
spin_unlock(&mapping->i_mmap_lock);
}

if (vma->vm_flags & VM_LOCKED) {
/*
* drop PG_Mlocked flag for over-mapped range
*/
unsigned int saved_flags = vma->vm_flags;
munlock_vma_pages_range(vma, start, start + size);
vma->vm_flags = saved_flags;
}

mmu_notifier_invalidate_range_start(mm, start, start + size);
err = populate_range(mm, vma, start, size, pgoff);
mmu_notifier_invalidate_range_end(mm, start, start + size);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
has_write_lock = 0;
if (vma->vm_flags & VM_LOCKED) {
/*
* might be mapping previously unmapped range of file
*/
mlock_vma_pages_range(vma, start, start + size);
} else {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
has_write_lock = 0;
}
make_pages_present(start, start+size);
}
make_pages_present(start, start+size);
}

/*
@@ -240,4 +258,3 @@ out:

return err;
}
mm/highmem.c

@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
static void flush_all_zero_pkmaps(void)
{
int i;
int need_flush = 0;

flush_cache_kmaps();

@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
&pkmap_page_table[i]);

set_page_address(page, NULL);
need_flush = 1;
}
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
if (need_flush)
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}

/**
mm/hugetlb.c (72 changes)
@@ -7,6 +7,7 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
@@ -262,7 +263,7 @@ struct resv_map {
|
||||
struct list_head regions;
|
||||
};
|
||||
|
||||
struct resv_map *resv_map_alloc(void)
|
||||
static struct resv_map *resv_map_alloc(void)
|
||||
{
|
||||
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
|
||||
if (!resv_map)
|
||||
@@ -274,7 +275,7 @@ struct resv_map *resv_map_alloc(void)
|
||||
return resv_map;
|
||||
}
|
||||
|
||||
void resv_map_release(struct kref *ref)
|
||||
static void resv_map_release(struct kref *ref)
|
||||
{
|
||||
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
|
||||
|
||||
@@ -289,7 +290,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
|
||||
if (!(vma->vm_flags & VM_SHARED))
|
||||
return (struct resv_map *)(get_vma_private_data(vma) &
|
||||
~HPAGE_RESV_MASK);
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
|
||||
@@ -1455,15 +1456,15 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
|
||||
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
int hugetlb_report_meminfo(char *buf)
|
||||
void hugetlb_report_meminfo(struct seq_file *m)
|
||||
{
|
||||
struct hstate *h = &default_hstate;
|
||||
return sprintf(buf,
|
||||
"HugePages_Total: %5lu\n"
|
||||
"HugePages_Free: %5lu\n"
|
||||
"HugePages_Rsvd: %5lu\n"
|
||||
"HugePages_Surp: %5lu\n"
|
||||
"Hugepagesize: %5lu kB\n",
|
||||
seq_printf(m,
|
||||
"HugePages_Total: %5lu\n"
|
||||
"HugePages_Free: %5lu\n"
|
||||
"HugePages_Rsvd: %5lu\n"
|
||||
"HugePages_Surp: %5lu\n"
|
||||
"Hugepagesize: %8lu kB\n",
|
||||
h->nr_huge_pages,
|
||||
h->free_huge_pages,
|
||||
h->resv_huge_pages,
|
||||
@@ -1747,10 +1748,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
||||
* from other VMAs and let the children be SIGKILLed if they are faulting the
|
||||
* same region.
|
||||
*/
|
||||
int unmap_ref_private(struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
struct page *page,
|
||||
unsigned long address)
|
||||
static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page *page, unsigned long address)
|
||||
{
|
||||
struct vm_area_struct *iter_vma;
|
||||
struct address_space *mapping;
|
||||
@@ -2008,7 +2007,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
entry = huge_ptep_get(ptep);
|
||||
if (huge_pte_none(entry)) {
|
||||
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
|
||||
goto out_unlock;
|
||||
goto out_mutex;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
@@ -2024,7 +2023,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
if (write_access && !pte_write(entry)) {
|
||||
if (vma_needs_reservation(h, vma, address) < 0) {
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out_unlock;
|
||||
goto out_mutex;
|
||||
}
|
||||
|
||||
if (!(vma->vm_flags & VM_SHARED))
|
||||
@@ -2034,10 +2033,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
/* Check for a racing update before calling hugetlb_cow */
|
||||
if (likely(pte_same(entry, huge_ptep_get(ptep))))
|
||||
if (write_access && !pte_write(entry))
|
||||
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
|
||||
goto out_page_table_lock;
|
||||
|
||||
|
||||
if (write_access) {
|
||||
if (!pte_write(entry)) {
|
||||
ret = hugetlb_cow(mm, vma, address, ptep, entry,
|
||||
pagecache_page);
|
||||
goto out_page_table_lock;
|
||||
}
|
||||
entry = pte_mkdirty(entry);
|
||||
}
|
||||
entry = pte_mkyoung(entry);
|
||||
if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
|
||||
update_mmu_cache(vma, address, entry);
|
||||
|
||||
out_page_table_lock:
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
|
||||
if (pagecache_page) {
|
||||
@@ -2045,7 +2057,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
put_page(pagecache_page);
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
out_mutex:
|
||||
mutex_unlock(&hugetlb_instantiation_mutex);
|
||||
|
||||
return ret;
|
||||
@@ -2060,6 +2072,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
|
||||
{
|
||||
if (!ptep || write || shared)
|
||||
return 0;
|
||||
else
|
||||
return huge_pte_none(huge_ptep_get(ptep));
|
||||
}
|
||||
|
||||
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
unsigned long *position, int *length, int i,
|
||||
@@ -2069,6 +2089,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long vaddr = *position;
|
||||
int remainder = *length;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
int zeropage_ok = 0;
|
||||
int shared = vma->vm_flags & VM_SHARED;
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
while (vaddr < vma->vm_end && remainder) {
|
||||
@@ -2081,8 +2103,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* first, for the page indexing below to work.
|
||||
*/
|
||||
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
|
||||
if (huge_zeropage_ok(pte, write, shared))
|
||||
zeropage_ok = 1;
|
||||
|
||||
if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
|
||||
if (!pte ||
|
||||
(huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
|
||||
(write && !pte_write(huge_ptep_get(pte)))) {
|
||||
int ret;
|
||||
|
||||
@@ -2102,8 +2127,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
page = pte_page(huge_ptep_get(pte));
|
||||
same_page:
|
||||
if (pages) {
|
||||
get_page(page);
|
||||
pages[i] = page + pfn_offset;
|
||||
if (zeropage_ok)
|
||||
pages[i] = ZERO_PAGE(0);
|
||||
else
|
||||
pages[i] = page + pfn_offset;
|
||||
get_page(pages[i]);
|
||||
}
|
||||
|
||||
if (vmas)
|
||||
|
mm/internal.h (131 changes)
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
|
||||
atomic_dec(&page->_count);
|
||||
}
|
||||
|
||||
/*
|
||||
* in mm/vmscan.c:
|
||||
*/
|
||||
extern int isolate_lru_page(struct page *page);
|
||||
extern void putback_lru_page(struct page *page);
|
||||
|
||||
/*
|
||||
* in mm/page_alloc.c
|
||||
*/
|
||||
extern void __free_pages_bootmem(struct page *page, unsigned int order);
|
||||
|
||||
/*
|
||||
@@ -52,6 +61,120 @@ static inline unsigned long page_order(struct page *page)
|
||||
return page_private(page);
|
||||
}
|
||||
|
||||
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end);
|
||||
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end);
|
||||
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
|
||||
{
|
||||
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
/*
|
||||
* unevictable_migrate_page() called only from migrate_page_copy() to
|
||||
* migrate unevictable flag to new page.
|
||||
* Note that the old page has been isolated from the LRU lists at this
|
||||
* point so we don't need to worry about LRU statistics.
|
||||
*/
|
||||
static inline void unevictable_migrate_page(struct page *new, struct page *old)
|
||||
{
|
||||
if (TestClearPageUnevictable(old))
|
||||
SetPageUnevictable(new);
|
||||
}
|
||||
#else
|
||||
static inline void unevictable_migrate_page(struct page *new, struct page *old)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
/*
|
||||
* Called only in fault path via page_evictable() for a new page
|
||||
* to determine if it's being mapped into a LOCKED vma.
|
||||
* If so, mark page as mlocked.
|
||||
*/
|
||||
static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
|
||||
{
|
||||
VM_BUG_ON(PageLRU(page));
|
||||
|
||||
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
|
||||
return 0;
|
||||
|
||||
if (!TestSetPageMlocked(page)) {
|
||||
inc_zone_page_state(page, NR_MLOCK);
|
||||
count_vm_event(UNEVICTABLE_PGMLOCKED);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* must be called with vma's mmap_sem held for read, and page locked.
|
||||
*/
|
||||
extern void mlock_vma_page(struct page *page);
|
||||
|
||||
/*
|
||||
* Clear the page's PageMlocked(). This can be useful in a situation where
|
||||
* we want to unconditionally remove a page from the pagecache -- e.g.,
|
||||
* on truncation or freeing.
|
||||
*
|
||||
* It is legal to call this function for any page, mlocked or not.
|
||||
* If called for a page that is still mapped by mlocked vmas, all we do
|
||||
* is revert to lazy LRU behaviour -- semantics are not broken.
|
||||
*/
|
||||
extern void __clear_page_mlock(struct page *page);
|
||||
static inline void clear_page_mlock(struct page *page)
|
||||
{
|
||||
if (unlikely(TestClearPageMlocked(page)))
|
||||
__clear_page_mlock(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* mlock_migrate_page - called only from migrate_page_copy() to
|
||||
* migrate the Mlocked page flag; update statistics.
|
||||
*/
|
||||
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
|
||||
{
|
||||
if (TestClearPageMlocked(page)) {
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
__dec_zone_page_state(page, NR_MLOCK);
|
||||
SetPageMlocked(newpage);
|
||||
__inc_zone_page_state(newpage, NR_MLOCK);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* free_page_mlock() -- clean up attempts to free and mlocked() page.
|
||||
* Page should not be on lru, so no need to fix that up.
|
||||
* free_pages_check() will verify...
|
||||
*/
|
||||
static inline void free_page_mlock(struct page *page)
|
||||
{
|
||||
if (unlikely(TestClearPageMlocked(page))) {
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
__dec_zone_page_state(page, NR_MLOCK);
|
||||
__count_vm_event(UNEVICTABLE_MLOCKFREED);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
}
|
||||
|
||||
#else /* CONFIG_UNEVICTABLE_LRU */
|
||||
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void clear_page_mlock(struct page *page) { }
|
||||
static inline void mlock_vma_page(struct page *page) { }
|
||||
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
|
||||
static inline void free_page_mlock(struct page *page) { }
|
||||
|
||||
#endif /* CONFIG_UNEVICTABLE_LRU */
|
||||
|
||||
/*
|
||||
* FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
|
||||
* so all functions starting at paging_init should be marked __init
|
||||
@@ -120,4 +243,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
|
||||
}
|
||||
#endif /* CONFIG_SPARSEMEM */
|
||||
|
||||
#define GUP_FLAGS_WRITE 0x1
|
||||
#define GUP_FLAGS_FORCE 0x2
|
||||
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
|
||||
|
||||
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int flags,
|
||||
struct page **pages, struct vm_area_struct **vmas);
|
||||
|
||||
#endif
|
||||
|
mm/memcontrol.c (477 changes)
@@ -32,11 +32,12 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/page_cgroup.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
|
||||
static struct kmem_cache *page_cgroup_cache __read_mostly;
|
||||
#define MEM_CGROUP_RECLAIM_RETRIES 5
|
||||
|
||||
/*
|
||||
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
|
||||
/*
|
||||
* For accounting under irq disable, no need for increment preempt count.
|
||||
*/
|
||||
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
|
||||
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
|
||||
enum mem_cgroup_stat_index idx, int val)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
stat->cpustat[cpu].count[idx] += val;
|
||||
stat->count[idx] += val;
|
||||
}
|
||||
|
||||
static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
|
||||
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
|
||||
/*
|
||||
* per-zone information in memory controller.
|
||||
*/
|
||||
|
||||
enum mem_cgroup_zstat_index {
|
||||
MEM_CGROUP_ZSTAT_ACTIVE,
|
||||
MEM_CGROUP_ZSTAT_INACTIVE,
|
||||
|
||||
NR_MEM_CGROUP_ZSTAT,
|
||||
};
|
||||
|
||||
struct mem_cgroup_per_zone {
|
||||
/*
|
||||
* spin_lock to protect the per cgroup LRU
|
||||
*/
|
||||
spinlock_t lru_lock;
|
||||
struct list_head active_list;
|
||||
struct list_head inactive_list;
|
||||
unsigned long count[NR_MEM_CGROUP_ZSTAT];
|
||||
struct list_head lists[NR_LRU_LISTS];
|
||||
unsigned long count[NR_LRU_LISTS];
|
||||
};
|
||||
/* Macro for accessing counter */
|
||||
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
|
||||
@@ -144,69 +135,52 @@ struct mem_cgroup {
|
||||
};
|
||||
static struct mem_cgroup init_mem_cgroup;
|
||||
|
||||
/*
|
||||
* We use the lower bit of the page->page_cgroup pointer as a bit spin
|
||||
* lock. We need to ensure that page->page_cgroup is at least two
|
||||
* byte aligned (based on comments from Nick Piggin). But since
|
||||
* bit_spin_lock doesn't actually set that lock bit in a non-debug
|
||||
* uniprocessor kernel, we should avoid setting it here too.
|
||||
*/
|
||||
#define PAGE_CGROUP_LOCK_BIT 0x0
|
||||
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
|
||||
#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
|
||||
#else
|
||||
#define PAGE_CGROUP_LOCK 0x0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* A page_cgroup page is associated with every page descriptor. The
|
||||
* page_cgroup helps us identify information about the cgroup
|
||||
*/
|
||||
struct page_cgroup {
|
||||
struct list_head lru; /* per cgroup LRU list */
|
||||
struct page *page;
|
||||
struct mem_cgroup *mem_cgroup;
|
||||
int flags;
|
||||
};
|
||||
#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
|
||||
#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
|
||||
|
||||
static int page_cgroup_nid(struct page_cgroup *pc)
|
||||
{
|
||||
return page_to_nid(pc->page);
|
||||
}
|
||||
|
||||
static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
|
||||
{
|
||||
return page_zonenum(pc->page);
|
||||
}
|
||||
|
||||
enum charge_type {
|
||||
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
|
||||
MEM_CGROUP_CHARGE_TYPE_MAPPED,
|
||||
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
|
||||
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
|
||||
NR_CHARGE_TYPE,
|
||||
};
|
||||
|
||||
/* only for here (for easy reading.) */
|
||||
#define PCGF_CACHE (1UL << PCG_CACHE)
|
||||
#define PCGF_USED (1UL << PCG_USED)
|
||||
#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
|
||||
#define PCGF_LOCK (1UL << PCG_LOCK)
|
||||
#define PCGF_FILE (1UL << PCG_FILE)
|
||||
static const unsigned long
|
||||
pcg_default_flags[NR_CHARGE_TYPE] = {
|
||||
PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
|
||||
PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
|
||||
PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
|
||||
0, /* FORCE */
|
||||
};
|
||||
|
||||
/*
|
||||
* Always modified under lru lock. Then, not necessary to preempt_disable()
|
||||
*/
|
||||
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
|
||||
bool charge)
|
||||
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
|
||||
struct page_cgroup *pc,
|
||||
bool charge)
|
||||
{
|
||||
int val = (charge)? 1 : -1;
|
||||
struct mem_cgroup_stat *stat = &mem->stat;
|
||||
struct mem_cgroup_stat_cpu *cpustat;
|
||||
|
||||
VM_BUG_ON(!irqs_disabled());
|
||||
if (flags & PAGE_CGROUP_FLAG_CACHE)
|
||||
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
|
||||
|
||||
cpustat = &stat->cpustat[smp_processor_id()];
|
||||
if (PageCgroupCache(pc))
|
||||
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
|
||||
else
|
||||
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
|
||||
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
|
||||
|
||||
if (charge)
|
||||
__mem_cgroup_stat_add_safe(stat,
|
||||
__mem_cgroup_stat_add_safe(cpustat,
|
||||
MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
|
||||
else
|
||||
__mem_cgroup_stat_add_safe(stat,
|
||||
__mem_cgroup_stat_add_safe(cpustat,
|
||||
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
|
||||
}
|
||||
|
||||
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
|
||||
}
|
||||
|
||||
static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
|
||||
enum mem_cgroup_zstat_index idx)
|
||||
enum lru_list idx)
|
||||
{
|
||||
int nid, zid;
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
|
||||
|
||||
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
||||
{
|
||||
/*
|
||||
* mm_update_next_owner() may clear mm->owner to NULL
|
||||
* if it races with swapoff, page migration, etc.
|
||||
* So this can be called with p == NULL.
|
||||
*/
|
||||
if (unlikely(!p))
|
||||
return NULL;
|
||||
|
||||
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
|
||||
struct mem_cgroup, css);
|
||||
}
|
||||
|
||||
static inline int page_cgroup_locked(struct page *page)
|
||||
{
|
||||
return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
|
||||
}
|
||||
|
||||
static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
|
||||
{
|
||||
VM_BUG_ON(!page_cgroup_locked(page));
|
||||
page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
|
||||
}
|
||||
|
||||
struct page_cgroup *page_get_page_cgroup(struct page *page)
|
||||
{
|
||||
return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
|
||||
}
|
||||
|
||||
static void lock_page_cgroup(struct page *page)
|
||||
{
|
||||
bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
|
||||
}
|
||||
|
||||
static int try_lock_page_cgroup(struct page *page)
|
||||
{
|
||||
return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
|
||||
}
|
||||
|
||||
static void unlock_page_cgroup(struct page *page)
|
||||
{
|
||||
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
|
||||
}
|
||||
|
||||
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
|
||||
struct page_cgroup *pc)
|
||||
{
|
||||
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
|
||||
int lru = LRU_BASE;
|
||||
|
||||
if (from)
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
|
||||
else
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
|
||||
if (PageCgroupUnevictable(pc))
|
||||
lru = LRU_UNEVICTABLE;
|
||||
else {
|
||||
if (PageCgroupActive(pc))
|
||||
lru += LRU_ACTIVE;
|
||||
if (PageCgroupFile(pc))
|
||||
lru += LRU_FILE;
|
||||
}
|
||||
|
||||
mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
|
||||
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
|
||||
|
||||
mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
|
||||
list_del(&pc->lru);
|
||||
}
|
||||
|
||||
static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
|
||||
struct page_cgroup *pc)
|
||||
{
|
||||
int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
|
||||
int lru = LRU_BASE;
|
||||
|
||||
if (!to) {
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
|
||||
list_add(&pc->lru, &mz->inactive_list);
|
||||
} else {
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
|
||||
list_add(&pc->lru, &mz->active_list);
|
||||
if (PageCgroupUnevictable(pc))
|
||||
lru = LRU_UNEVICTABLE;
|
||||
else {
|
||||
if (PageCgroupActive(pc))
|
||||
lru += LRU_ACTIVE;
|
||||
if (PageCgroupFile(pc))
|
||||
lru += LRU_FILE;
|
||||
}
|
||||
mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
|
||||
|
||||
MEM_CGROUP_ZSTAT(mz, lru) += 1;
|
||||
list_add(&pc->lru, &mz->lists[lru]);
|
||||
|
||||
mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
|
||||
}
|
||||
|
||||
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
|
||||
static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
|
||||
{
|
||||
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
|
||||
struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
|
||||
int active = PageCgroupActive(pc);
|
||||
int file = PageCgroupFile(pc);
|
||||
int unevictable = PageCgroupUnevictable(pc);
|
||||
enum lru_list from = unevictable ? LRU_UNEVICTABLE :
|
||||
(LRU_FILE * !!file + !!active);
|
||||
|
||||
if (from)
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
|
||||
else
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
|
||||
if (lru == from)
|
||||
return;
|
||||
|
||||
if (active) {
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
|
||||
pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
|
||||
list_move(&pc->lru, &mz->active_list);
|
||||
MEM_CGROUP_ZSTAT(mz, from) -= 1;
|
||||
/*
|
||||
* However this is done under mz->lru_lock, another flags, which
|
||||
* are not related to LRU, will be modified from out-of-lock.
|
||||
* We have to use atomic set/clear flags.
|
||||
*/
|
||||
if (is_unevictable_lru(lru)) {
|
||||
ClearPageCgroupActive(pc);
|
||||
SetPageCgroupUnevictable(pc);
|
||||
} else {
|
||||
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
|
||||
pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
|
||||
list_move(&pc->lru, &mz->inactive_list);
|
||||
if (is_active_lru(lru))
|
||||
SetPageCgroupActive(pc);
|
||||
else
|
||||
ClearPageCgroupActive(pc);
|
||||
ClearPageCgroupUnevictable(pc);
|
||||
}
|
||||
|
||||
MEM_CGROUP_ZSTAT(mz, lru) += 1;
|
||||
list_move(&pc->lru, &mz->lists[lru]);
|
||||
}
|
||||
|
||||
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
|
||||
@@ -348,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
|
||||
/*
|
||||
* This routine assumes that the appropriate zone's lru lock is already held
|
||||
*/
|
||||
void mem_cgroup_move_lists(struct page *page, bool active)
|
||||
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
|
||||
{
|
||||
struct page_cgroup *pc;
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
@@ -364,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
|
||||
* safely get to page_cgroup without it, so just try_lock it:
|
||||
* mem_cgroup_isolate_pages allows for page left on wrong list.
|
||||
*/
|
||||
if (!try_lock_page_cgroup(page))
|
||||
pc = lookup_page_cgroup(page);
|
||||
if (!trylock_page_cgroup(pc))
|
||||
return;
|
||||
|
||||
pc = page_get_page_cgroup(page);
|
||||
if (pc) {
|
||||
if (pc && PageCgroupUsed(pc)) {
|
||||
mz = page_cgroup_zoneinfo(pc);
|
||||
spin_lock_irqsave(&mz->lru_lock, flags);
|
||||
__mem_cgroup_move_lists(pc, active);
|
||||
__mem_cgroup_move_lists(pc, lru);
|
||||
spin_unlock_irqrestore(&mz->lru_lock, flags);
|
||||
}
|
||||
unlock_page_cgroup(page);
|
||||
unlock_page_cgroup(pc);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -394,21 +367,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
|
||||
return (int)((rss * 100L) / total);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called from vmscan.c. In page reclaiming loop. balance
|
||||
* between active and inactive list is calculated. For memory controller
|
||||
* page reclaiming, we should use using mem_cgroup's imbalance rather than
|
||||
* zone's global lru imbalance.
|
||||
*/
|
||||
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
|
||||
{
|
||||
unsigned long active, inactive;
|
||||
/* active and inactive are the number of pages. 'long' is ok.*/
|
||||
active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
|
||||
inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
|
||||
return (long) (active / (inactive + 1));
|
||||
}
|
||||
|
||||
/*
|
||||
* prev_priority control...this will be used in memory reclaim path.
|
||||
*/
|
||||
@@ -436,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
|
||||
* (see include/linux/mmzone.h)
|
||||
*/
|
||||
|
||||
long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
|
||||
struct zone *zone, int priority)
|
||||
long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
|
||||
int priority, enum lru_list lru)
|
||||
{
|
||||
long nr_active;
|
||||
long nr_pages;
|
||||
int nid = zone->zone_pgdat->node_id;
|
||||
int zid = zone_idx(zone);
|
||||
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
|
||||
|
||||
nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
|
||||
return (nr_active >> priority);
|
||||
}
|
||||
nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
|
||||
|
||||
long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
|
||||
struct zone *zone, int priority)
|
||||
{
|
||||
long nr_inactive;
|
||||
int nid = zone->zone_pgdat->node_id;
|
||||
int zid = zone_idx(zone);
|
||||
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
|
||||
|
||||
nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
|
||||
return (nr_inactive >> priority);
|
||||
return (nr_pages >> priority);
|
||||
}
|
||||
|
||||
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
|
||||
@@ -465,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
|
||||
unsigned long *scanned, int order,
|
||||
int mode, struct zone *z,
|
||||
struct mem_cgroup *mem_cont,
|
||||
int active)
|
||||
int active, int file)
|
||||
{
|
||||
unsigned long nr_taken = 0;
|
||||
struct page *page;
|
||||
@@ -476,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
|
||||
int nid = z->zone_pgdat->node_id;
|
||||
int zid = zone_idx(z);
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
int lru = LRU_FILE * !!file + !!active;
|
||||
|
||||
BUG_ON(!mem_cont);
|
||||
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
|
||||
if (active)
|
||||
src = &mz->active_list;
|
||||
else
|
||||
src = &mz->inactive_list;
|
||||
|
||||
src = &mz->lists[lru];
|
||||
|
||||
spin_lock(&mz->lru_lock);
|
||||
scan = 0;
|
||||
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
|
||||
if (scan >= nr_to_scan)
|
||||
break;
|
||||
if (unlikely(!PageCgroupUsed(pc)))
|
||||
continue;
|
||||
page = pc->page;
|
||||
|
||||
if (unlikely(!PageLRU(page)))
|
||||
continue;
|
||||
|
||||
if (PageActive(page) && !active) {
|
||||
__mem_cgroup_move_lists(pc, true);
|
||||
continue;
|
||||
}
|
||||
if (!PageActive(page) && active) {
|
||||
__mem_cgroup_move_lists(pc, false);
|
||||
/*
|
||||
* TODO: play better with lumpy reclaim, grabbing anything.
|
||||
*/
|
||||
if (PageUnevictable(page) ||
|
||||
(PageActive(page) && !active) ||
|
||||
(!PageActive(page) && active)) {
|
||||
__mem_cgroup_move_lists(pc, page_lru(page));
|
||||
continue;
|
||||
}
|
||||
|
||||
scan++;
|
||||
list_move(&pc->lru, &pc_list);
|
||||
|
||||
if (__isolate_lru_page(page, mode) == 0) {
|
||||
if (__isolate_lru_page(page, mode, file) == 0) {
|
||||
list_move(&page->lru, dst);
|
||||
nr_taken++;
|
||||
}
|
||||
@@ -532,23 +479,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
|
||||
{
|
||||
struct mem_cgroup *mem;
|
||||
struct page_cgroup *pc;
|
||||
unsigned long flags;
|
||||
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
unsigned long flags;
|
||||
|
||||
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
|
||||
if (unlikely(pc == NULL))
|
||||
goto err;
|
||||
|
||||
pc = lookup_page_cgroup(page);
|
||||
/* can happen at boot */
|
||||
if (unlikely(!pc))
|
||||
return 0;
|
||||
prefetchw(pc);
|
||||
/*
|
||||
* We always charge the cgroup the mm_struct belongs to.
|
||||
* The mm_struct's mem_cgroup changes on task migration if the
|
||||
* thread group leader migrates. It's possible that mm is not
|
||||
* set, if so charge the init_mm (happens for pagecache usage).
|
||||
*/
|
||||
|
||||
if (likely(!memcg)) {
|
||||
rcu_read_lock();
|
||||
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
||||
if (unlikely(!mem)) {
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
* For every charge from the cgroup, increment reference count
|
||||
*/
|
||||
@@ -559,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
|
||||
css_get(&memcg->css);
|
||||
}
|
||||
|
||||
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
|
||||
while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
|
||||
if (!(gfp_mask & __GFP_WAIT))
|
||||
goto out;
|
||||
|
||||
@@ -582,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
lock_page_cgroup(pc);
|
||||
if (unlikely(PageCgroupUsed(pc))) {
|
||||
unlock_page_cgroup(pc);
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
css_put(&mem->css);
|
||||
|
||||
goto done;
|
||||
}
|
||||
pc->mem_cgroup = mem;
|
||||
pc->page = page;
|
||||
/*
|
||||
* If a page is accounted as a page cache, insert to inactive list.
|
||||
* If anon, insert to active list.
|
||||
*/
|
||||
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
|
||||
pc->flags = PAGE_CGROUP_FLAG_CACHE;
|
||||
else
|
||||
pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
|
||||
|
||||
lock_page_cgroup(page);
|
||||
if (unlikely(page_get_page_cgroup(page))) {
|
||||
unlock_page_cgroup(page);
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
css_put(&mem->css);
|
||||
kmem_cache_free(page_cgroup_cache, pc);
|
||||
goto done;
|
||||
}
|
||||
page_assign_page_cgroup(page, pc);
|
||||
pc->flags = pcg_default_flags[ctype];
|
||||
|
||||
mz = page_cgroup_zoneinfo(pc);
|
||||
|
||||
spin_lock_irqsave(&mz->lru_lock, flags);
|
||||
__mem_cgroup_add_list(mz, pc);
|
||||
spin_unlock_irqrestore(&mz->lru_lock, flags);
|
||||
unlock_page_cgroup(pc);
|
||||
|
||||
unlock_page_cgroup(page);
|
||||
done:
|
||||
return 0;
|
||||
out:
|
||||
css_put(&mem->css);
|
||||
kmem_cache_free(page_cgroup_cache, pc);
|
||||
err:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@@ -622,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
|
||||
{
|
||||
if (mem_cgroup_subsys.disabled)
|
||||
return 0;
|
||||
|
||||
if (PageCompound(page))
|
||||
return 0;
|
||||
/*
|
||||
* If already mapped, we don't have to account.
|
||||
* If page cache, page->mapping has address_space.
|
||||
@@ -643,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
||||
{
|
||||
if (mem_cgroup_subsys.disabled)
|
||||
return 0;
|
||||
|
||||
if (PageCompound(page))
|
||||
return 0;
|
||||
/*
|
||||
* Corner case handling. This is called from add_to_page_cache()
|
||||
* in usual. But some FS (shmem) precharges this page before calling it
|
||||
@@ -656,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
||||
if (!(gfp_mask & __GFP_WAIT)) {
|
||||
struct page_cgroup *pc;
|
||||
|
||||
lock_page_cgroup(page);
|
||||
pc = page_get_page_cgroup(page);
|
||||
if (pc) {
|
||||
VM_BUG_ON(pc->page != page);
|
||||
VM_BUG_ON(!pc->mem_cgroup);
|
||||
unlock_page_cgroup(page);
|
||||
|
||||
pc = lookup_page_cgroup(page);
|
||||
if (!pc)
|
||||
return 0;
|
||||
lock_page_cgroup(pc);
|
||||
if (PageCgroupUsed(pc)) {
|
||||
unlock_page_cgroup(pc);
|
||||
return 0;
|
||||
}
|
||||
unlock_page_cgroup(page);
|
||||
unlock_page_cgroup(pc);
|
||||
}
|
||||
|
||||
if (unlikely(!mm))
|
||||
mm = &init_mm;
|
||||
|
||||
return mem_cgroup_charge_common(page, mm, gfp_mask,
|
||||
if (page_is_file_cache(page))
|
||||
return mem_cgroup_charge_common(page, mm, gfp_mask,
|
||||
MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
|
||||
else
|
||||
return mem_cgroup_charge_common(page, mm, gfp_mask,
|
||||
MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -691,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
/*
|
||||
* Check if our page_cgroup is valid
|
||||
*/
|
||||
lock_page_cgroup(page);
|
||||
pc = page_get_page_cgroup(page);
|
||||
if (unlikely(!pc))
|
||||
goto unlock;
|
||||
pc = lookup_page_cgroup(page);
|
||||
if (unlikely(!pc || !PageCgroupUsed(pc)))
|
||||
return;
|
||||
|
||||
VM_BUG_ON(pc->page != page);
|
||||
|
||||
if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
|
||||
&& ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
|
||||
|| page_mapped(page)))
|
||||
goto unlock;
|
||||
lock_page_cgroup(pc);
|
||||
if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
|
||||
|| !PageCgroupUsed(pc)) {
|
||||
/* This happens at race in zap_pte_range() and do_swap_page()*/
|
||||
unlock_page_cgroup(pc);
|
||||
return;
|
||||
}
|
||||
ClearPageCgroupUsed(pc);
|
||||
mem = pc->mem_cgroup;
|
||||
|
||||
mz = page_cgroup_zoneinfo(pc);
|
||||
spin_lock_irqsave(&mz->lru_lock, flags);
|
||||
__mem_cgroup_remove_list(mz, pc);
|
||||
spin_unlock_irqrestore(&mz->lru_lock, flags);
|
||||
unlock_page_cgroup(pc);
|
||||
|
||||
page_assign_page_cgroup(page, NULL);
|
||||
unlock_page_cgroup(page);
|
||||
|
||||
mem = pc->mem_cgroup;
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
css_put(&mem->css);
|
||||
|
||||
kmem_cache_free(page_cgroup_cache, pc);
|
||||
return;
|
||||
unlock:
|
||||
unlock_page_cgroup(page);
|
||||
}
|
||||
|
||||
void mem_cgroup_uncharge_page(struct page *page)
|
||||
{
|
||||
/* early check. */
|
||||
if (page_mapped(page))
|
||||
return;
|
||||
if (page->mapping && !PageAnon(page))
|
||||
return;
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
|
||||
}
|
||||
|
||||
void mem_cgroup_uncharge_cache_page(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(page_mapped(page));
|
||||
VM_BUG_ON(page->mapping);
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
|
||||
}
|
||||
|
||||
@@ -745,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
|
||||
if (mem_cgroup_subsys.disabled)
|
||||
return 0;
|
||||
|
||||
lock_page_cgroup(page);
|
||||
pc = page_get_page_cgroup(page);
|
||||
if (pc) {
|
||||
pc = lookup_page_cgroup(page);
|
||||
lock_page_cgroup(pc);
|
||||
if (PageCgroupUsed(pc)) {
|
||||
mem = pc->mem_cgroup;
|
||||
css_get(&mem->css);
|
||||
if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
|
||||
if (PageCgroupCache(pc)) {
|
||||
if (page_is_file_cache(page))
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
|
||||
else
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
|
||||
}
|
||||
}
|
||||
unlock_page_cgroup(page);
|
||||
unlock_page_cgroup(pc);
|
||||
if (mem) {
|
||||
ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
|
||||
ctype, mem);
|
||||
@@ -778,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
|
||||
*/
|
||||
if (!newpage->mapping)
|
||||
__mem_cgroup_uncharge_common(newpage,
|
||||
MEM_CGROUP_CHARGE_TYPE_FORCE);
|
||||
MEM_CGROUP_CHARGE_TYPE_FORCE);
|
||||
else if (PageAnon(newpage))
|
||||
mem_cgroup_uncharge_page(newpage);
|
||||
}
|
||||
@@ -801,11 +761,16 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
|
||||
|
||||
rcu_read_lock();
|
||||
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
||||
if (unlikely(!mem)) {
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
css_get(&mem->css);
|
||||
rcu_read_unlock();
|
||||
|
||||
do {
|
||||
progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
|
||||
progress += res_counter_check_under_limit(&mem->res);
|
||||
} while (!progress && --retry);
|
||||
|
||||
css_put(&mem->css);
|
||||
@@ -845,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
|
||||
#define FORCE_UNCHARGE_BATCH (128)
|
||||
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
|
||||
struct mem_cgroup_per_zone *mz,
|
||||
int active)
|
||||
enum lru_list lru)
|
||||
{
|
||||
struct page_cgroup *pc;
|
||||
struct page *page;
|
||||
@@ -853,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
|
||||
unsigned long flags;
|
||||
struct list_head *list;
|
||||
|
||||
if (active)
|
||||
list = &mz->active_list;
|
||||
else
|
||||
list = &mz->inactive_list;
|
||||
list = &mz->lists[lru];
|
||||
|
||||
spin_lock_irqsave(&mz->lru_lock, flags);
|
||||
while (!list_empty(list)) {
|
||||
pc = list_entry(list->prev, struct page_cgroup, lru);
|
||||
page = pc->page;
|
||||
if (!PageCgroupUsed(pc))
|
||||
break;
|
||||
get_page(page);
|
||||
spin_unlock_irqrestore(&mz->lru_lock, flags);
|
||||
/*
|
||||
@@ -876,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
|
||||
count = FORCE_UNCHARGE_BATCH;
|
||||
cond_resched();
|
||||
}
|
||||
} else
|
||||
cond_resched();
|
||||
} else {
|
||||
spin_lock_irqsave(&mz->lru_lock, flags);
|
||||
break;
|
||||
}
|
||||
spin_lock_irqsave(&mz->lru_lock, flags);
|
||||
}
|
||||
spin_unlock_irqrestore(&mz->lru_lock, flags);
|
||||
@@ -901,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
|
||||
while (mem->res.usage > 0) {
|
||||
if (atomic_read(&mem->css.cgroup->count) > 0)
|
||||
goto out;
|
||||
/* This is for making all *used* pages to be on LRU. */
|
||||
lru_add_drain_all();
|
||||
for_each_node_state(node, N_POSSIBLE)
|
||||
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
enum lru_list l;
|
||||
mz = mem_cgroup_zoneinfo(mem, node, zid);
|
||||
/* drop all page_cgroup in active_list */
|
||||
mem_cgroup_force_empty_list(mem, mz, 1);
|
||||
/* drop all page_cgroup in inactive_list */
|
||||
mem_cgroup_force_empty_list(mem, mz, 0);
|
||||
for_each_lru(l)
|
||||
mem_cgroup_force_empty_list(mem, mz, l);
|
||||
}
|
||||
cond_resched();
|
||||
}
|
||||
ret = 0;
|
||||
out:
|
||||
@@ -994,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
}
|
||||
/* showing # of active pages */
|
||||
{
|
||||
unsigned long active, inactive;
|
||||
unsigned long active_anon, inactive_anon;
|
||||
unsigned long active_file, inactive_file;
|
||||
unsigned long unevictable;
|
||||
|
||||
inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
LRU_INACTIVE_ANON);
|
||||
active_anon = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
LRU_ACTIVE_ANON);
|
||||
inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
LRU_INACTIVE_FILE);
|
||||
active_file = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
LRU_ACTIVE_FILE);
|
||||
unevictable = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
LRU_UNEVICTABLE);
|
||||
|
||||
cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
|
||||
cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
|
||||
cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
|
||||
cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
|
||||
cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
|
||||
|
||||
inactive = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
MEM_CGROUP_ZSTAT_INACTIVE);
|
||||
active = mem_cgroup_get_all_zonestat(mem_cont,
|
||||
MEM_CGROUP_ZSTAT_ACTIVE);
|
||||
cb->fill(cb, "active", (active) * PAGE_SIZE);
|
||||
cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -1044,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
|
||||
{
|
||||
struct mem_cgroup_per_node *pn;
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
enum lru_list l;
|
||||
int zone, tmp = node;
|
||||
/*
|
||||
* This routine is called against possible nodes.
|
||||
@@ -1064,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
|
||||
|
||||
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
|
||||
mz = &pn->zoneinfo[zone];
|
||||
INIT_LIST_HEAD(&mz->active_list);
|
||||
INIT_LIST_HEAD(&mz->inactive_list);
|
||||
spin_lock_init(&mz->lru_lock);
|
||||
for_each_lru(l)
|
||||
INIT_LIST_HEAD(&mz->lists[l]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -1107,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
||||
|
||||
if (unlikely((cont->parent) == NULL)) {
|
||||
mem = &init_mem_cgroup;
|
||||
page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
|
||||
} else {
|
||||
mem = mem_cgroup_alloc();
|
||||
if (!mem)
|
||||
|
mm/memory.c (127 changes)
@@ -1129,12 +1129,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
|
||||
return !vma->vm_ops || !vma->vm_ops->fault;
|
||||
}
|
||||
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int write, int force,
|
||||
|
||||
|
||||
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int flags,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
{
|
||||
int i;
|
||||
unsigned int vm_flags;
|
||||
unsigned int vm_flags = 0;
|
||||
int write = !!(flags & GUP_FLAGS_WRITE);
|
||||
int force = !!(flags & GUP_FLAGS_FORCE);
|
||||
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
|
||||
|
||||
if (len <= 0)
|
||||
return 0;
|
||||
@@ -1158,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
if (write) /* user gate pages are read-only */
|
||||
|
||||
/* user gate pages are read-only */
|
||||
if (!ignore && write)
|
||||
return i ? : -EFAULT;
|
||||
if (pg > TASK_SIZE)
|
||||
pgd = pgd_offset_k(pg);
|
||||
@@ -1190,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
|
||||
|| !(vm_flags & vma->vm_flags))
|
||||
if (!vma ||
|
||||
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
|
||||
(!ignore && !(vm_flags & vma->vm_flags)))
|
||||
return i ? : -EFAULT;
|
||||
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
@@ -1266,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
} while (len);
|
||||
return i;
|
||||
}
|
||||
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int write, int force,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
{
|
||||
int flags = 0;
|
||||
|
||||
if (write)
|
||||
flags |= GUP_FLAGS_WRITE;
|
||||
if (force)
|
||||
flags |= GUP_FLAGS_FORCE;
|
||||
|
||||
return __get_user_pages(tsk, mm,
|
||||
start, len, flags,
|
||||
pages, vmas);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(get_user_pages);
|
||||
|
||||
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
|
||||
@@ -1296,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
|
||||
retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
|
||||
if (retval)
|
||||
goto out;
|
||||
|
||||
retval = -EINVAL;
|
||||
if (PageAnon(page))
|
||||
goto out_uncharge;
|
||||
goto out;
|
||||
retval = -ENOMEM;
|
||||
flush_dcache_page(page);
|
||||
pte = get_locked_pte(mm, addr, &ptl);
|
||||
if (!pte)
|
||||
goto out_uncharge;
|
||||
goto out;
|
||||
retval = -EBUSY;
|
||||
if (!pte_none(*pte))
|
||||
goto out_unlock;
|
||||
@@ -1323,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
return retval;
|
||||
out_unlock:
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
out_uncharge:
|
||||
mem_cgroup_uncharge_page(page);
|
||||
out:
|
||||
return retval;
|
||||
}
|
||||
@@ -1858,6 +1877,15 @@ gotten:
|
||||
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
|
||||
if (!new_page)
|
||||
goto oom;
|
||||
/*
|
||||
* Don't let another task, with possibly unlocked vma,
|
||||
* keep the mlocked page.
|
||||
*/
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
lock_page(old_page); /* for LRU manipulation */
|
||||
clear_page_mlock(old_page);
|
||||
unlock_page(old_page);
|
||||
}
|
||||
cow_user_page(new_page, old_page, address, vma);
|
||||
__SetPageUptodate(new_page);
|
||||
|
||||
@@ -1886,11 +1914,13 @@ gotten:
|
||||
* thread doing COW.
|
||||
*/
|
||||
ptep_clear_flush_notify(vma, address, page_table);
|
||||
set_pte_at(mm, address, page_table, entry);
|
||||
update_mmu_cache(vma, address, entry);
|
||||
lru_cache_add_active(new_page);
|
||||
SetPageSwapBacked(new_page);
|
||||
lru_cache_add_active_or_unevictable(new_page, vma);
|
||||
page_add_new_anon_rmap(new_page, vma, address);
|
||||
|
||||
//TODO: is this safe? do_anonymous_page() does it this way.
|
||||
set_pte_at(mm, address, page_table, entry);
|
||||
update_mmu_cache(vma, address, entry);
|
||||
if (old_page) {
|
||||
/*
|
||||
* Only after switching the pte to the new page may
|
||||
@@ -2288,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
count_vm_event(PGMAJFAULT);
|
||||
}
|
||||
|
||||
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
|
||||
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mark_page_accessed(page);
|
||||
|
||||
lock_page(page);
|
||||
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
|
||||
|
||||
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
unlock_page(page);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Back out if somebody else already faulted in this pte.
|
||||
*/
|
||||
@@ -2324,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
page_add_anon_rmap(page, vma, address);
|
||||
|
||||
swap_free(entry);
|
||||
if (vm_swap_full())
|
||||
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
||||
remove_exclusive_swap_page(page);
|
||||
unlock_page(page);
|
||||
|
||||
@@ -2382,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
if (!pte_none(*page_table))
|
||||
goto release;
|
||||
inc_mm_counter(mm, anon_rss);
|
||||
lru_cache_add_active(page);
|
||||
SetPageSwapBacked(page);
|
||||
lru_cache_add_active_or_unevictable(page, vma);
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
set_pte_at(mm, address, page_table, entry);
|
||||
|
||||
@@ -2423,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page *page;
|
||||
pte_t entry;
|
||||
int anon = 0;
|
||||
int charged = 0;
|
||||
struct page *dirty_page = NULL;
|
||||
struct vm_fault vmf;
|
||||
int ret;
|
||||
@@ -2463,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out;
|
||||
}
|
||||
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
page_cache_release(page);
|
||||
goto out;
|
||||
}
|
||||
charged = 1;
|
||||
/*
|
||||
* Don't let another task, with possibly unlocked vma,
|
||||
* keep the mlocked page.
|
||||
*/
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
clear_page_mlock(vmf.page);
|
||||
copy_user_highpage(page, vmf.page, address, vma);
|
||||
__SetPageUptodate(page);
|
||||
} else {
|
||||
@@ -2497,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
|
||||
}
|
||||
|
||||
if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
|
||||
|
||||
/*
|
||||
@@ -2520,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
entry = mk_pte(page, vma->vm_page_prot);
|
||||
if (flags & FAULT_FLAG_WRITE)
|
||||
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
|
||||
set_pte_at(mm, address, page_table, entry);
|
||||
if (anon) {
|
||||
inc_mm_counter(mm, anon_rss);
|
||||
lru_cache_add_active(page);
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
inc_mm_counter(mm, anon_rss);
|
||||
SetPageSwapBacked(page);
|
||||
lru_cache_add_active_or_unevictable(page, vma);
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
} else {
|
||||
inc_mm_counter(mm, file_rss);
|
||||
page_add_file_rmap(page);
|
||||
@@ -2533,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
get_page(dirty_page);
|
||||
}
|
||||
}
|
||||
//TODO: is this safe? do_anonymous_page() does it this way.
|
||||
set_pte_at(mm, address, page_table, entry);
|
||||
|
||||
/* no need to invalidate: a not-present page won't be cached */
|
||||
update_mmu_cache(vma, address, entry);
|
||||
} else {
|
||||
mem_cgroup_uncharge_page(page);
|
||||
if (charged)
|
||||
mem_cgroup_uncharge_page(page);
|
||||
if (anon)
|
||||
page_cache_release(page);
|
||||
else
|
||||
@@ -2772,19 +2815,9 @@ int make_pages_present(unsigned long addr, unsigned long end)
|
||||
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
|
||||
ret = get_user_pages(current, current->mm, addr,
|
||||
len, write, 0, NULL, NULL);
|
||||
if (ret < 0) {
|
||||
/*
|
||||
SUS require strange return value to mlock
|
||||
- invalid addr generate to ENOMEM.
|
||||
- out of memory should generate EAGAIN.
|
||||
*/
|
||||
if (ret == -EFAULT)
|
||||
ret = -ENOMEM;
|
||||
else if (ret == -ENOMEM)
|
||||
ret = -EAGAIN;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
return ret == len ? 0 : -ENOMEM;
|
||||
return ret == len ? 0 : -EFAULT;
|
||||
}
|
||||
|
||||
#if !defined(__HAVE_ARCH_GATE_AREA)
|
||||
|
@@ -26,6 +26,7 @@
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>

#include <asm/tlbflush.h>

@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);

release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);

sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
release_mem_region(pfn << PAGE_SHIFT,
PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
@@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* We can skip free pages. And we can only deal with pages on
* LRU.
*/
ret = isolate_lru_page(page, &source);
ret = isolate_lru_page(page);
if (!ret) { /* Success */
list_add_tail(&page->lru, &source);
move_pages--;
} else {
/* Because we don't have big zone->lock, we should
@@ -849,10 +851,19 @@ failed_removal:

return ret;
}

int remove_memory(u64 start, u64 size)
{
unsigned long start_pfn, end_pfn;

start_pfn = PFN_DOWN(start);
end_pfn = start_pfn + PFN_DOWN(size);
return offline_pages(start_pfn, end_pfn, 120 * HZ);
}
#else
int remove_memory(u64 start, u64 size)
{
return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);

@@ -93,6 +93,8 @@
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
/*
* Avoid migrating a page that is shared with others.
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
isolate_lru_page(page, pagelist);
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (!isolate_lru_page(page)) {
list_add_tail(&page->lru, pagelist);
}
}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
@@ -2197,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
if (PageSwapCache(page))
md->swapcache++;

if (PageActive(page))
if (PageActive(page) || PageUnevictable(page))
md->active++;

if (PageWriteback(page))
|
||||
|
280
mm/migrate.c
@@ -36,36 +36,6 @@
|
||||
|
||||
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
|
||||
|
||||
/*
|
||||
* Isolate one page from the LRU lists. If successful put it onto
|
||||
* the indicated list with elevated page count.
|
||||
*
|
||||
* Result:
|
||||
* -EBUSY: page not on LRU list
|
||||
* 0: page removed from LRU list and added to the specified list.
|
||||
*/
|
||||
int isolate_lru_page(struct page *page, struct list_head *pagelist)
|
||||
{
|
||||
int ret = -EBUSY;
|
||||
|
||||
if (PageLRU(page)) {
|
||||
struct zone *zone = page_zone(page);
|
||||
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
if (PageLRU(page) && get_page_unless_zero(page)) {
|
||||
ret = 0;
|
||||
ClearPageLRU(page);
|
||||
if (PageActive(page))
|
||||
del_page_from_active_list(zone, page);
|
||||
else
|
||||
del_page_from_inactive_list(zone, page);
|
||||
list_add_tail(&page->lru, pagelist);
|
||||
}
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
}
|
||||
return ret;
|
||||
}
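The hunks above and below remove mm/migrate.c's private, list-taking isolate_lru_page() in favour of a list-less variant that callers feed themselves. A minimal sketch of the new call-site pattern, assuming the relocated isolate_lru_page() keeps the 0-on-success / -EBUSY convention documented above (the helper name here is hypothetical, for illustration only):

/* hypothetical caller-side helper, illustration only */
static void queue_for_migration(struct page *page, struct list_head *pagelist)
{
	if (!isolate_lru_page(page))		/* 0: page taken off its LRU, ref held */
		list_add_tail(&page->lru, pagelist);
	/* non-zero (-EBUSY): page was not on an LRU list, the caller just skips it */
}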
|
||||
|
||||
/*
|
||||
* migrate_prep() needs to be called before we start compiling a list of pages
|
||||
* to be migrated using isolate_lru_page().
|
||||
@@ -83,23 +53,9 @@ int migrate_prep(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void move_to_lru(struct page *page)
|
||||
{
|
||||
if (PageActive(page)) {
|
||||
/*
|
||||
* lru_cache_add_active checks that
|
||||
* the PG_active bit is off.
|
||||
*/
|
||||
ClearPageActive(page);
|
||||
lru_cache_add_active(page);
|
||||
} else {
|
||||
lru_cache_add(page);
|
||||
}
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add isolated pages on the list back to the LRU.
|
||||
* Add isolated pages on the list back to the LRU under page lock
|
||||
* to avoid leaking evictable pages back onto unevictable list.
|
||||
*
|
||||
* returns the number of pages put back.
|
||||
*/
|
||||
@@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
|
||||
|
||||
list_for_each_entry_safe(page, page2, l, lru) {
|
||||
list_del(&page->lru);
|
||||
move_to_lru(page);
|
||||
putback_lru_page(page);
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
@@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
|
||||
__inc_zone_page_state(newpage, NR_FILE_PAGES);
|
||||
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
if (!PageSwapCache(newpage))
|
||||
mem_cgroup_uncharge_cache_page(page);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
|
||||
*/
|
||||
static void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
{
|
||||
int anon;
|
||||
|
||||
copy_highpage(newpage, page);
|
||||
|
||||
if (PageError(page))
|
||||
@@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
SetPageReferenced(newpage);
|
||||
if (PageUptodate(page))
|
||||
SetPageUptodate(newpage);
|
||||
if (PageActive(page))
|
||||
if (TestClearPageActive(page)) {
|
||||
VM_BUG_ON(PageUnevictable(page));
|
||||
SetPageActive(newpage);
|
||||
} else
|
||||
unevictable_migrate_page(newpage, page);
|
||||
if (PageChecked(page))
|
||||
SetPageChecked(newpage);
|
||||
if (PageMappedToDisk(page))
|
||||
@@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
|
||||
__set_page_dirty_nobuffers(newpage);
|
||||
}
|
||||
|
||||
mlock_migrate_page(newpage, page);
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
ClearPageSwapCache(page);
|
||||
#endif
|
||||
ClearPageActive(page);
|
||||
ClearPagePrivate(page);
|
||||
set_page_private(page, 0);
|
||||
/* page->mapping contains a flag for PageAnon() */
|
||||
anon = PageAnon(page);
|
||||
page->mapping = NULL;
|
||||
|
||||
if (!anon) /* This page was removed from radix-tree. */
|
||||
mem_cgroup_uncharge_cache_page(page);
|
||||
|
||||
/*
|
||||
* If any waiters have accumulated on the new page then
|
||||
* wake them up.
|
||||
@@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping,
|
||||
*
|
||||
* The new page will have replaced the old page if this function
|
||||
* is successful.
|
||||
*
|
||||
* Return value:
|
||||
* < 0 - error code
|
||||
* == 0 - success
|
||||
*/
|
||||
static int move_to_new_page(struct page *newpage, struct page *page)
|
||||
{
|
||||
@@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page)
|
||||
/* Prepare mapping for the new page.*/
|
||||
newpage->index = page->index;
|
||||
newpage->mapping = page->mapping;
|
||||
if (PageSwapBacked(page))
|
||||
SetPageSwapBacked(newpage);
|
||||
|
||||
mapping = page_mapping(page);
|
||||
if (!mapping)
|
||||
@@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
|
||||
if (!newpage)
|
||||
return -ENOMEM;
|
||||
|
||||
if (page_count(page) == 1)
|
||||
if (page_count(page) == 1) {
|
||||
/* page was freed from under us. So we are done. */
|
||||
goto move_newpage;
|
||||
}
|
||||
|
||||
charge = mem_cgroup_prepare_migration(page, newpage);
|
||||
if (charge == -ENOMEM) {
|
||||
@@ -730,7 +702,6 @@ rcu_unlock:
|
||||
rcu_read_unlock();
|
||||
|
||||
unlock:
|
||||
|
||||
unlock_page(page);
|
||||
|
||||
if (rc != -EAGAIN) {
|
||||
@@ -741,17 +712,19 @@ unlock:
|
||||
* restored.
|
||||
*/
|
||||
list_del(&page->lru);
|
||||
move_to_lru(page);
|
||||
putback_lru_page(page);
|
||||
}
|
||||
|
||||
move_newpage:
|
||||
if (!charge)
|
||||
mem_cgroup_end_migration(newpage);
|
||||
|
||||
/*
|
||||
* Move the new page to the LRU. If migration was not successful
|
||||
* then this will free the page.
|
||||
*/
|
||||
move_to_lru(newpage);
|
||||
putback_lru_page(newpage);
|
||||
|
||||
if (result) {
|
||||
if (rc)
|
||||
*result = rc;
|
||||
@@ -858,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
|
||||
* Move a set of pages as indicated in the pm array. The addr
|
||||
* field must be set to the virtual address of the page to be moved
|
||||
* and the node number must contain a valid target node.
|
||||
* The pm array ends with node = MAX_NUMNODES.
|
||||
*/
|
||||
static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
|
||||
int migrate_all)
|
||||
static int do_move_page_to_node_array(struct mm_struct *mm,
|
||||
struct page_to_node *pm,
|
||||
int migrate_all)
|
||||
{
|
||||
int err;
|
||||
struct page_to_node *pp;
|
||||
@@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
|
||||
!migrate_all)
|
||||
goto put_and_set;
|
||||
|
||||
err = isolate_lru_page(page, &pagelist);
|
||||
err = isolate_lru_page(page);
|
||||
if (!err)
|
||||
list_add_tail(&page->lru, &pagelist);
|
||||
put_and_set:
|
||||
/*
|
||||
* Either remove the duplicate refcount from
|
||||
@@ -926,36 +903,118 @@ set_status:
|
||||
pp->status = err;
|
||||
}
|
||||
|
||||
err = 0;
|
||||
if (!list_empty(&pagelist))
|
||||
err = migrate_pages(&pagelist, new_page_node,
|
||||
(unsigned long)pm);
|
||||
else
|
||||
err = -ENOENT;
|
||||
|
||||
up_read(&mm->mmap_sem);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the nodes of a list of pages. The addr in the pm array
|
||||
* must have been set to the virtual address of which we want to determine
|
||||
* the node number.
|
||||
* Migrate an array of page addresses onto an array of nodes and fill
|
||||
* the corresponding array of status.
|
||||
*/
|
||||
static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
|
||||
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
|
||||
unsigned long nr_pages,
|
||||
const void __user * __user *pages,
|
||||
const int __user *nodes,
|
||||
int __user *status, int flags)
|
||||
{
|
||||
down_read(&mm->mmap_sem);
|
||||
struct page_to_node *pm = NULL;
|
||||
nodemask_t task_nodes;
|
||||
int err = 0;
|
||||
int i;
|
||||
|
||||
for ( ; pm->node != MAX_NUMNODES; pm++) {
|
||||
struct vm_area_struct *vma;
|
||||
struct page *page;
|
||||
int err;
|
||||
task_nodes = cpuset_mems_allowed(task);
|
||||
|
||||
/* Limit nr_pages so that the multiplication may not overflow */
|
||||
if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
|
||||
err = -E2BIG;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
|
||||
if (!pm) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get parameters from user space and initialize the pm
|
||||
* array. Return various errors if the user did something wrong.
|
||||
*/
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
const void __user *p;
|
||||
|
||||
err = -EFAULT;
|
||||
vma = find_vma(mm, pm->addr);
|
||||
if (get_user(p, pages + i))
|
||||
goto out_pm;
|
||||
|
||||
pm[i].addr = (unsigned long)p;
|
||||
if (nodes) {
|
||||
int node;
|
||||
|
||||
if (get_user(node, nodes + i))
|
||||
goto out_pm;
|
||||
|
||||
err = -ENODEV;
|
||||
if (!node_state(node, N_HIGH_MEMORY))
|
||||
goto out_pm;
|
||||
|
||||
err = -EACCES;
|
||||
if (!node_isset(node, task_nodes))
|
||||
goto out_pm;
|
||||
|
||||
pm[i].node = node;
|
||||
} else
|
||||
pm[i].node = 0; /* anything to not match MAX_NUMNODES */
|
||||
}
|
||||
/* End marker */
|
||||
pm[nr_pages].node = MAX_NUMNODES;
|
||||
|
||||
err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
|
||||
if (err >= 0)
|
||||
/* Return status information */
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
if (put_user(pm[i].status, status + i))
|
||||
err = -EFAULT;
|
||||
|
||||
out_pm:
|
||||
vfree(pm);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the nodes of an array of pages and store it in an array of status.
|
||||
*/
|
||||
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
|
||||
const void __user * __user *pages,
|
||||
int __user *status)
|
||||
{
|
||||
unsigned long i;
|
||||
int err;
|
||||
|
||||
down_read(&mm->mmap_sem);
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
const void __user *p;
|
||||
unsigned long addr;
|
||||
struct vm_area_struct *vma;
|
||||
struct page *page;
|
||||
|
||||
err = -EFAULT;
|
||||
if (get_user(p, pages+i))
|
||||
goto out;
|
||||
addr = (unsigned long) p;
|
||||
|
||||
vma = find_vma(mm, addr);
|
||||
if (!vma)
|
||||
goto set_status;
|
||||
|
||||
page = follow_page(vma, pm->addr, 0);
|
||||
page = follow_page(vma, addr, 0);
|
||||
|
||||
err = PTR_ERR(page);
|
||||
if (IS_ERR(page))
|
||||
@@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
|
||||
|
||||
err = page_to_nid(page);
|
||||
set_status:
|
||||
pm->status = err;
|
||||
put_user(err, status+i);
|
||||
}
|
||||
err = 0;
|
||||
|
||||
out:
|
||||
up_read(&mm->mmap_sem);
|
||||
return 0;
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
|
||||
const int __user *nodes,
|
||||
int __user *status, int flags)
|
||||
{
|
||||
int err = 0;
|
||||
int i;
|
||||
struct task_struct *task;
|
||||
nodemask_t task_nodes;
|
||||
struct mm_struct *mm;
|
||||
struct page_to_node *pm = NULL;
|
||||
int err;
|
||||
|
||||
/* Check flags */
|
||||
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
|
||||
@@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
|
||||
(current->uid != task->suid) && (current->uid != task->uid) &&
|
||||
!capable(CAP_SYS_NICE)) {
|
||||
err = -EPERM;
|
||||
goto out2;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = security_task_movememory(task);
|
||||
if (err)
|
||||
goto out2;
|
||||
goto out;
|
||||
|
||||
|
||||
task_nodes = cpuset_mems_allowed(task);
|
||||
|
||||
/* Limit nr_pages so that the multiplication may not overflow */
|
||||
if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
|
||||
err = -E2BIG;
|
||||
goto out2;
|
||||
if (nodes) {
|
||||
err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
|
||||
flags);
|
||||
} else {
|
||||
err = do_pages_stat(mm, nr_pages, pages, status);
|
||||
}
|
||||
|
||||
pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
|
||||
if (!pm) {
|
||||
err = -ENOMEM;
|
||||
goto out2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get parameters from user space and initialize the pm
|
||||
* array. Return various errors if the user did something wrong.
|
||||
*/
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
const void __user *p;
|
||||
|
||||
err = -EFAULT;
|
||||
if (get_user(p, pages + i))
|
||||
goto out;
|
||||
|
||||
pm[i].addr = (unsigned long)p;
|
||||
if (nodes) {
|
||||
int node;
|
||||
|
||||
if (get_user(node, nodes + i))
|
||||
goto out;
|
||||
|
||||
err = -ENODEV;
|
||||
if (!node_state(node, N_HIGH_MEMORY))
|
||||
goto out;
|
||||
|
||||
err = -EACCES;
|
||||
if (!node_isset(node, task_nodes))
|
||||
goto out;
|
||||
|
||||
pm[i].node = node;
|
||||
} else
|
||||
pm[i].node = 0; /* anything to not match MAX_NUMNODES */
|
||||
}
|
||||
/* End marker */
|
||||
pm[nr_pages].node = MAX_NUMNODES;
|
||||
|
||||
if (nodes)
|
||||
err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
|
||||
else
|
||||
err = do_pages_stat(mm, pm);
|
||||
|
||||
if (err >= 0)
|
||||
/* Return status information */
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
if (put_user(pm[i].status, status + i))
|
||||
err = -EFAULT;
|
||||
|
||||
out:
|
||||
vfree(pm);
|
||||
out2:
|
||||
mmput(mm);
|
||||
return err;
|
||||
}
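For context, a hedged userspace sketch of the interface this rework serves, using the libnuma move_pages(3) wrapper (assumed to be available; link with -lnuma). Passing a nodes array exercises the new do_pages_move() path; passing NULL for nodes exercises do_pages_stat():

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	void *pages[1];
	int nodes[1] = { 0 };		/* request node 0 */
	int status[1];

	pages[0] = malloc(4096);
	memset(pages[0], 0, 4096);	/* touch it so the page actually exists */

	/* migrate one page of the calling process; status[0] gets the node or a -errno */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
		printf("page is now on node %d\n", status[0]);

	/* nodes == NULL only queries placement (the do_pages_stat() path) */
	move_pages(0, 1, pages, NULL, status, 0);
	return 0;
}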
|
||||
|
441
mm/mlock.c
@@ -8,10 +8,18 @@
|
||||
#include <linux/capability.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/hugetlb.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
int can_do_mlock(void)
|
||||
{
|
||||
@@ -23,17 +31,381 @@ int can_do_mlock(void)
|
||||
}
|
||||
EXPORT_SYMBOL(can_do_mlock);
|
||||
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
/*
|
||||
* Mlocked pages are marked with PageMlocked() flag for efficient testing
|
||||
* in vmscan and, possibly, the fault path; and to support semi-accurate
|
||||
* statistics.
|
||||
*
|
||||
* An mlocked page [PageMlocked(page)] is unevictable. As such, it will
|
||||
* be placed on the LRU "unevictable" list, rather than the [in]active lists.
|
||||
* The unevictable list is an LRU sibling list to the [in]active lists.
|
||||
* PageUnevictable is set to indicate the unevictable state.
|
||||
*
|
||||
* When lazy mlocking via vmscan, it is important to ensure that the
|
||||
* vma's VM_LOCKED status is not concurrently being modified, otherwise we
|
||||
* may have mlocked a page that is being munlocked. So lazy mlock must take
|
||||
* the mmap_sem for read, and verify that the vma really is locked
|
||||
* (see mm/rmap.c).
|
||||
*/
|
||||
|
||||
/*
|
||||
* LRU accounting for clear_page_mlock()
|
||||
*/
|
||||
void __clear_page_mlock(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageLocked(page));
|
||||
|
||||
if (!page->mapping) { /* truncated ? */
|
||||
return;
|
||||
}
|
||||
|
||||
dec_zone_page_state(page, NR_MLOCK);
|
||||
count_vm_event(UNEVICTABLE_PGCLEARED);
|
||||
if (!isolate_lru_page(page)) {
|
||||
putback_lru_page(page);
|
||||
} else {
|
||||
/*
|
||||
* Page not on the LRU yet. Flush all pagevecs and retry.
|
||||
*/
|
||||
lru_add_drain_all();
|
||||
if (!isolate_lru_page(page))
|
||||
putback_lru_page(page);
|
||||
else if (PageUnevictable(page))
|
||||
count_vm_event(UNEVICTABLE_PGSTRANDED);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark page as mlocked if not already.
|
||||
* If page on LRU, isolate and putback to move to unevictable list.
|
||||
*/
|
||||
void mlock_vma_page(struct page *page)
|
||||
{
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
if (!TestSetPageMlocked(page)) {
|
||||
inc_zone_page_state(page, NR_MLOCK);
|
||||
count_vm_event(UNEVICTABLE_PGMLOCKED);
|
||||
if (!isolate_lru_page(page))
|
||||
putback_lru_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* called from munlock()/munmap() path with page supposedly on the LRU.
|
||||
*
|
||||
* Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
|
||||
* [in try_to_munlock()] and then attempt to isolate the page. We must
|
||||
* isolate the page to keep others from messing with its unevictable
|
||||
* and mlocked state while trying to munlock. However, we pre-clear the
|
||||
* mlocked state anyway as we might lose the isolation race and we might
|
||||
* not get another chance to clear PageMlocked. If we successfully
|
||||
* isolate the page and try_to_munlock() detects other VM_LOCKED vmas
|
||||
* mapping the page, it will restore the PageMlocked state, unless the page
|
||||
* is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
|
||||
* perhaps redundantly.
|
||||
* If we lose the isolation race, and the page is mapped by other VM_LOCKED
|
||||
* vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
|
||||
* either of which will restore the PageMlocked state by calling
|
||||
* mlock_vma_page() above, if it can grab the vma's mmap sem.
|
||||
*/
|
||||
static void munlock_vma_page(struct page *page)
|
||||
{
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
if (TestClearPageMlocked(page)) {
|
||||
dec_zone_page_state(page, NR_MLOCK);
|
||||
if (!isolate_lru_page(page)) {
|
||||
int ret = try_to_munlock(page);
|
||||
/*
|
||||
* did try_to_munlock() succeed or punt?
|
||||
*/
|
||||
if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
|
||||
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
|
||||
|
||||
putback_lru_page(page);
|
||||
} else {
|
||||
/*
|
||||
* We lost the race. let try_to_unmap() deal
|
||||
* with it. At least we get the page state and
|
||||
* mlock stats right. However, page is still on
|
||||
* the noreclaim list. We'll fix that up when
|
||||
* the page is eventually freed or we scan the
|
||||
* noreclaim list.
|
||||
*/
|
||||
if (PageUnevictable(page))
|
||||
count_vm_event(UNEVICTABLE_PGSTRANDED);
|
||||
else
|
||||
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
|
||||
* @vma: target vma
|
||||
* @start: start address
|
||||
* @end: end address
|
||||
* @mlock: 0 indicates munlock, otherwise mlock.
*
* If @mlock == 0, unlock an mlocked range;
* else mlock the range of pages. This takes care of making the pages present,
* too.
|
||||
*
|
||||
* return 0 on success, negative error code on error.
|
||||
*
|
||||
* vma->vm_mm->mmap_sem must be held for at least read.
|
||||
*/
|
||||
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
int mlock)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long addr = start;
|
||||
struct page *pages[16]; /* 16 gives a reasonable batch */
|
||||
int nr_pages = (end - start) / PAGE_SIZE;
|
||||
int ret;
|
||||
int gup_flags = 0;
|
||||
|
||||
VM_BUG_ON(start & ~PAGE_MASK);
|
||||
VM_BUG_ON(end & ~PAGE_MASK);
|
||||
VM_BUG_ON(start < vma->vm_start);
|
||||
VM_BUG_ON(end > vma->vm_end);
|
||||
VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
|
||||
(atomic_read(&mm->mm_users) != 0));
|
||||
|
||||
/*
|
||||
* mlock: don't populate pages if the vma has PROT_NONE permission.
* munlock: always munlock the pages, even if they have PROT_NONE
* permission.
|
||||
*/
|
||||
if (!mlock)
|
||||
gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
|
||||
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
gup_flags |= GUP_FLAGS_WRITE;
|
||||
|
||||
lru_add_drain_all(); /* push cached pages to LRU */
|
||||
|
||||
while (nr_pages > 0) {
|
||||
int i;
|
||||
|
||||
cond_resched();
|
||||
|
||||
/*
|
||||
* get_user_pages makes pages present if we are
|
||||
* setting mlock. and this extra reference count will
|
||||
* disable migration of this page. However, page may
|
||||
* still be truncated out from under us.
|
||||
*/
|
||||
ret = __get_user_pages(current, mm, addr,
|
||||
min_t(int, nr_pages, ARRAY_SIZE(pages)),
|
||||
gup_flags, pages, NULL);
|
||||
/*
|
||||
* This can happen for, e.g., VM_NONLINEAR regions before
|
||||
* a page has been allocated and mapped at a given offset,
|
||||
* or for addresses that map beyond end of a file.
|
||||
* We'll mlock the pages if/when they get faulted in.
|
||||
*/
|
||||
if (ret < 0)
|
||||
break;
|
||||
if (ret == 0) {
|
||||
/*
|
||||
* We know the vma is there, so the only time
|
||||
* we cannot get a single page should be an
|
||||
* error (ret < 0) case.
|
||||
*/
|
||||
WARN_ON(1);
|
||||
break;
|
||||
}
|
||||
|
||||
lru_add_drain(); /* push cached pages to LRU */
|
||||
|
||||
for (i = 0; i < ret; i++) {
|
||||
struct page *page = pages[i];
|
||||
|
||||
lock_page(page);
|
||||
/*
|
||||
* Because we lock page here and migration is blocked
|
||||
* by the elevated reference, we need only check for
|
||||
* page truncation (file-cache only).
|
||||
*/
|
||||
if (page->mapping) {
|
||||
if (mlock)
|
||||
mlock_vma_page(page);
|
||||
else
|
||||
munlock_vma_page(page);
|
||||
}
|
||||
unlock_page(page);
|
||||
put_page(page); /* ref from get_user_pages() */
|
||||
|
||||
/*
|
||||
* here we assume that get_user_pages() has given us
|
||||
* a list of virtually contiguous pages.
|
||||
*/
|
||||
addr += PAGE_SIZE; /* for next get_user_pages() */
|
||||
nr_pages--;
|
||||
}
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
lru_add_drain_all(); /* to update stats */
|
||||
|
||||
return ret; /* count entire vma as locked_vm */
|
||||
}
|
||||
|
||||
/*
|
||||
* convert get_user_pages() return value to posix mlock() error
|
||||
*/
|
||||
static int __mlock_posix_error_return(long retval)
|
||||
{
|
||||
if (retval == -EFAULT)
|
||||
retval = -ENOMEM;
|
||||
else if (retval == -ENOMEM)
|
||||
retval = -EAGAIN;
|
||||
return retval;
|
||||
}
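The helper above exists because POSIX wants mlock() to fail with ENOMEM for bad addresses and EAGAIN when memory is short, while get_user_pages() reports -EFAULT/-ENOMEM. A sketch of the intended use, mirroring what mlock_fixup() does with the return value further down in this patch (the wrapper name is hypothetical):

/* illustration only: translate __mlock_vma_pages_range() results for mlock() */
static long mlock_range_posix(struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	long ret = __mlock_vma_pages_range(vma, start, end, 1);

	if (ret > 0) {				/* pages that should not stay counted in locked_vm */
		vma->vm_mm->locked_vm -= ret;
		return 0;
	}
	return __mlock_posix_error_return(ret);	/* -EFAULT -> -ENOMEM, -ENOMEM -> -EAGAIN */
}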
|
||||
|
||||
#else /* CONFIG_UNEVICTABLE_LRU */
|
||||
|
||||
/*
|
||||
* Just make pages present if VM_LOCKED. No-op if unlocking.
|
||||
*/
|
||||
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
int mlock)
|
||||
{
|
||||
if (mlock && (vma->vm_flags & VM_LOCKED))
|
||||
return make_pages_present(start, end);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int __mlock_posix_error_return(long retval)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_UNEVICTABLE_LRU */
|
||||
|
||||
/**
|
||||
* mlock_vma_pages_range() - mlock pages in specified vma range.
|
||||
* @vma - the vma containing the specified address range
|
||||
* @start - starting address in @vma to mlock
|
||||
* @end - end address [+1] in @vma to mlock
|
||||
*
|
||||
* For mmap()/mremap()/expansion of mlocked vma.
|
||||
*
|
||||
* return 0 on success for "normal" vmas.
|
||||
*
|
||||
* return number of pages [> 0] to be removed from locked_vm on success
|
||||
* of "special" vmas.
|
||||
*
|
||||
* return negative error if vma spanning @start-@range disappears while
|
||||
* mmap semaphore is dropped. Unlikely?
|
||||
*/
|
||||
long mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
int nr_pages = (end - start) / PAGE_SIZE;
|
||||
BUG_ON(!(vma->vm_flags & VM_LOCKED));
|
||||
|
||||
/*
|
||||
* filter unlockable vmas
|
||||
*/
|
||||
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
|
||||
goto no_mlock;
|
||||
|
||||
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
|
||||
is_vm_hugetlb_page(vma) ||
|
||||
vma == get_gate_vma(current))) {
|
||||
long error;
|
||||
downgrade_write(&mm->mmap_sem);
|
||||
|
||||
error = __mlock_vma_pages_range(vma, start, end, 1);
|
||||
|
||||
up_read(&mm->mmap_sem);
|
||||
/* vma can change or disappear */
|
||||
down_write(&mm->mmap_sem);
|
||||
vma = find_vma(mm, start);
|
||||
/* non-NULL vma must contain @start, but need to check @end */
|
||||
if (!vma || end > vma->vm_end)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0; /* hide other errors from mmap(), et al */
|
||||
}
|
||||
|
||||
/*
|
||||
* User mapped kernel pages or huge pages:
|
||||
* make these pages present to populate the ptes, but
|
||||
* fall thru' to reset VM_LOCKED--no need to unlock, and
|
||||
* return nr_pages so these don't get counted against task's
|
||||
* locked limit. huge pages are already counted against
|
||||
* locked vm limit.
|
||||
*/
|
||||
make_pages_present(start, end);
|
||||
|
||||
no_mlock:
|
||||
vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
|
||||
return nr_pages; /* error or pages NOT mlocked */
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* munlock_vma_pages_range() - munlock all pages in the vma range.
|
||||
* @vma - vma containing range to be munlock()ed.
|
||||
* @start - start address in @vma of the range
|
||||
* @end - end of range in @vma.
|
||||
*
|
||||
* For mremap(), munmap() and exit().
|
||||
*
|
||||
* Called with @vma VM_LOCKED.
|
||||
*
|
||||
* Returns with VM_LOCKED cleared. Callers must be prepared to
|
||||
* deal with this.
|
||||
*
|
||||
* We don't save and restore VM_LOCKED here because pages are
|
||||
* still on lru. In unmap path, pages might be scanned by reclaim
|
||||
* and re-mlocked by try_to_{munlock|unmap} before we unmap and
|
||||
* free them. This will result in freeing mlocked pages.
|
||||
*/
|
||||
void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
vma->vm_flags &= ~VM_LOCKED;
|
||||
__mlock_vma_pages_range(vma, start, end, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* mlock_fixup - handle mlock[all]/munlock[all] requests.
|
||||
*
|
||||
* Filters out "special" vmas -- VM_LOCKED never gets set for these, and
|
||||
* munlock is a no-op. However, for some special vmas, we go ahead and
|
||||
* populate the ptes via make_pages_present().
|
||||
*
|
||||
* For vmas that pass the filters, merge/split as appropriate.
|
||||
*/
|
||||
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end, unsigned int newflags)
|
||||
{
|
||||
struct mm_struct * mm = vma->vm_mm;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgoff_t pgoff;
|
||||
int pages;
|
||||
int nr_pages;
|
||||
int ret = 0;
|
||||
int lock = newflags & VM_LOCKED;
|
||||
|
||||
if (newflags == vma->vm_flags) {
|
||||
*prev = vma;
|
||||
goto out;
|
||||
if (newflags == vma->vm_flags ||
|
||||
(vma->vm_flags & (VM_IO | VM_PFNMAP)))
|
||||
goto out; /* don't set VM_LOCKED, don't count */
|
||||
|
||||
if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
|
||||
is_vm_hugetlb_page(vma) ||
|
||||
vma == get_gate_vma(current)) {
|
||||
if (lock)
|
||||
make_pages_present(start, end);
|
||||
goto out; /* don't set VM_LOCKED, don't count */
|
||||
}
|
||||
|
||||
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
||||
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
goto success;
|
||||
}
|
||||
|
||||
*prev = vma;
|
||||
|
||||
if (start != vma->vm_start) {
|
||||
ret = split_vma(mm, vma, start, 1);
|
||||
if (ret)
|
||||
@@ -59,25 +429,62 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
}
|
||||
|
||||
success:
|
||||
/*
|
||||
* Keep track of amount of locked VM.
|
||||
*/
|
||||
nr_pages = (end - start) >> PAGE_SHIFT;
|
||||
if (!lock)
|
||||
nr_pages = -nr_pages;
|
||||
mm->locked_vm += nr_pages;
|
||||
|
||||
/*
|
||||
* vm_flags is protected by the mmap_sem held in write mode.
|
||||
* It's okay if try_to_unmap_one unmaps a page just after we
|
||||
* set VM_LOCKED, make_pages_present below will bring it back.
|
||||
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
|
||||
*/
|
||||
vma->vm_flags = newflags;
|
||||
|
||||
/*
|
||||
* Keep track of amount of locked VM.
|
||||
*/
|
||||
pages = (end - start) >> PAGE_SHIFT;
|
||||
if (newflags & VM_LOCKED) {
|
||||
pages = -pages;
|
||||
if (!(newflags & VM_IO))
|
||||
ret = make_pages_present(start, end);
|
||||
if (lock) {
|
||||
/*
|
||||
* mmap_sem is currently held for write. Downgrade the write
|
||||
* lock to a read lock so that other faults, mmap scans, etc. can
* proceed while we fault in all pages.
|
||||
*/
|
||||
downgrade_write(&mm->mmap_sem);
|
||||
|
||||
ret = __mlock_vma_pages_range(vma, start, end, 1);
|
||||
|
||||
/*
|
||||
* Need to reacquire mmap sem in write mode, as our callers
|
||||
* expect this. We have no support for atomically upgrading
|
||||
* a sem to write, so we need to check for ranges while sem
|
||||
* is unlocked.
|
||||
*/
|
||||
up_read(&mm->mmap_sem);
|
||||
/* vma can change or disappear */
|
||||
down_write(&mm->mmap_sem);
|
||||
*prev = find_vma(mm, start);
|
||||
/* non-NULL *prev must contain @start, but need to check @end */
|
||||
if (!(*prev) || end > (*prev)->vm_end)
|
||||
ret = -ENOMEM;
|
||||
else if (ret > 0) {
|
||||
mm->locked_vm -= ret;
|
||||
ret = 0;
|
||||
} else
|
||||
ret = __mlock_posix_error_return(ret); /* translate if needed */
|
||||
} else {
|
||||
/*
|
||||
* TODO: for unlocking, pages will already be resident, so
|
||||
* we don't need to wait for allocations/reclaim/pagein, ...
|
||||
* However, unlocking a very large region can still take a
|
||||
* while. Should we downgrade the semaphore for both lock
|
||||
* AND unlock ?
|
||||
*/
|
||||
__mlock_vma_pages_range(vma, start, end, 0);
|
||||
}
|
||||
|
||||
mm->locked_vm -= pages;
|
||||
out:
|
||||
*prev = vma;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
81
mm/mmap.c
@@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
|
||||
}
|
||||
|
||||
static inline void __vma_link_file(struct vm_area_struct *vma)
|
||||
static void __vma_link_file(struct vm_area_struct *vma)
|
||||
{
|
||||
struct file * file;
|
||||
|
||||
@@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
|
||||
* If the vma has a ->close operation then the driver probably needs to release
|
||||
* per-vma resources, so we don't attempt to merge those.
|
||||
*/
|
||||
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
|
||||
|
||||
static inline int is_mergeable_vma(struct vm_area_struct *vma,
|
||||
struct file *file, unsigned long vm_flags)
|
||||
{
|
||||
@@ -972,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
|
||||
return -EPERM;
|
||||
vm_flags |= VM_LOCKED;
|
||||
}
|
||||
|
||||
/* mlock MCL_FUTURE? */
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
unsigned long locked, lock_limit;
|
||||
@@ -1139,10 +1138,12 @@ munmap_back:
|
||||
* The VM_SHARED test is necessary because shmem_zero_setup
|
||||
* will create the file object for a shared anonymous map below.
|
||||
*/
|
||||
if (!file && !(vm_flags & VM_SHARED) &&
|
||||
vma_merge(mm, prev, addr, addr + len, vm_flags,
|
||||
NULL, NULL, pgoff, NULL))
|
||||
goto out;
|
||||
if (!file && !(vm_flags & VM_SHARED)) {
|
||||
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
|
||||
NULL, NULL, pgoff, NULL);
|
||||
if (vma)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the object being mapped and call the appropriate
|
||||
@@ -1224,10 +1225,14 @@ out:
|
||||
mm->total_vm += len >> PAGE_SHIFT;
|
||||
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += len >> PAGE_SHIFT;
|
||||
make_pages_present(addr, addr + len);
|
||||
}
|
||||
if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
|
||||
/*
|
||||
* makes pages present; downgrades, drops, reacquires mmap_sem
|
||||
*/
|
||||
long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
|
||||
if (nr_pages < 0)
|
||||
return nr_pages; /* vma gone! */
|
||||
mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
|
||||
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
|
||||
make_pages_present(addr, addr + len);
|
||||
return addr;
|
||||
|
||||
@@ -1586,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
|
||||
* vma is the last one with address > vma->vm_end. Have to extend vma.
|
||||
*/
|
||||
#ifndef CONFIG_IA64
|
||||
static inline
|
||||
static
|
||||
#endif
|
||||
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
|
||||
{
|
||||
@@ -1636,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
|
||||
/*
|
||||
* vma is the first one with address < vma->vm_start. Have to extend vma.
|
||||
*/
|
||||
static inline int expand_downwards(struct vm_area_struct *vma,
|
||||
static int expand_downwards(struct vm_area_struct *vma,
|
||||
unsigned long address)
|
||||
{
|
||||
int error;
|
||||
@@ -1698,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
|
||||
vma = find_vma_prev(mm, addr, &prev);
|
||||
if (vma && (vma->vm_start <= addr))
|
||||
return vma;
|
||||
if (!prev || expand_stack(prev, addr))
|
||||
if (expand_stack(prev, addr))
|
||||
return NULL;
|
||||
if (prev->vm_flags & VM_LOCKED)
|
||||
make_pages_present(addr, prev->vm_end);
|
||||
if (prev->vm_flags & VM_LOCKED) {
|
||||
if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
|
||||
return NULL; /* vma gone! */
|
||||
}
|
||||
return prev;
|
||||
}
|
||||
#else
|
||||
@@ -1727,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
|
||||
start = vma->vm_start;
|
||||
if (expand_stack(vma, addr))
|
||||
return NULL;
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
make_pages_present(addr, start);
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
if (mlock_vma_pages_range(vma, addr, start) < 0)
|
||||
return NULL; /* vma gone! */
|
||||
}
|
||||
return vma;
|
||||
}
|
||||
#endif
|
||||
@@ -1747,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
long nrpages = vma_pages(vma);
|
||||
|
||||
mm->total_vm -= nrpages;
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mm->locked_vm -= nrpages;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
|
||||
vma = remove_vma(vma);
|
||||
} while (vma);
|
||||
@@ -1913,6 +1920,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
|
||||
}
|
||||
vma = prev? prev->vm_next: mm->mmap;
|
||||
|
||||
/*
|
||||
* unlock any mlock()ed ranges before detaching vmas
|
||||
*/
|
||||
if (mm->locked_vm) {
|
||||
struct vm_area_struct *tmp = vma;
|
||||
while (tmp && tmp->vm_start < end) {
|
||||
if (tmp->vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm -= vma_pages(tmp);
|
||||
munlock_vma_pages_all(tmp);
|
||||
}
|
||||
tmp = tmp->vm_next;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the vma's, and unmap the actual pages
|
||||
*/
|
||||
@@ -2025,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Can we just expand an old private anonymous mapping? */
|
||||
if (vma_merge(mm, prev, addr, addr + len, flags,
|
||||
NULL, NULL, pgoff, NULL))
|
||||
vma = vma_merge(mm, prev, addr, addr + len, flags,
|
||||
NULL, NULL, pgoff, NULL);
|
||||
if (vma)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
@@ -2048,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
|
||||
out:
|
||||
mm->total_vm += len >> PAGE_SHIFT;
|
||||
if (flags & VM_LOCKED) {
|
||||
mm->locked_vm += len >> PAGE_SHIFT;
|
||||
make_pages_present(addr, addr + len);
|
||||
if (!mlock_vma_pages_range(vma, addr, addr + len))
|
||||
mm->locked_vm += (len >> PAGE_SHIFT);
|
||||
}
|
||||
return addr;
|
||||
}
|
||||
@@ -2060,7 +2082,7 @@ EXPORT_SYMBOL(do_brk);
|
||||
void exit_mmap(struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_gather *tlb;
|
||||
struct vm_area_struct *vma = mm->mmap;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long nr_accounted = 0;
|
||||
unsigned long end;
|
||||
|
||||
@@ -2068,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm)
|
||||
arch_exit_mmap(mm);
|
||||
mmu_notifier_release(mm);
|
||||
|
||||
if (mm->locked_vm) {
|
||||
vma = mm->mmap;
|
||||
while (vma) {
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
munlock_vma_pages_all(vma);
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
}
|
||||
vma = mm->mmap;
|
||||
lru_add_drain();
|
||||
flush_cache_mm(mm);
|
||||
tlb = tlb_gather_mmu(mm, 1);
|
||||
|
@@ -24,6 +24,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
if (new_len > old_len)
make_pages_present(new_addr + old_len,
new_addr + new_len);
mlock_vma_pages_range(new_vma, new_addr + old_len,
new_addr + new_len);
}

return new_addr;
@@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
make_pages_present(addr + old_len,
mlock_vma_pages_range(vma, addr + old_len,
addr + new_len);
}
ret = addr;

44
mm/nommu.c
@@ -34,6 +34,8 @@
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
void *high_memory;
|
||||
struct page *mem_map;
|
||||
unsigned long max_mapnr;
|
||||
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
|
||||
return PAGE_SIZE << compound_order(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* get a list of pages in an address range belonging to the specified process
|
||||
* and indicate the VMA that covers each page
|
||||
* - this is potentially dodgy as we may end incrementing the page count of a
|
||||
* slab page or a secondary page from a compound page
|
||||
* - don't permit access to VMAs that don't support it, such as I/O mappings
|
||||
*/
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int write, int force,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int flags,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long vm_flags;
|
||||
int i;
|
||||
int write = !!(flags & GUP_FLAGS_WRITE);
|
||||
int force = !!(flags & GUP_FLAGS_FORCE);
|
||||
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
|
||||
|
||||
/* calculate required read or write permissions.
|
||||
* - if 'force' is set, we only require the "MAY" flags.
|
||||
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
|
||||
/* protect what we can, including chardevs */
|
||||
if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
|
||||
!(vm_flags & vma->vm_flags))
|
||||
(!ignore && !(vm_flags & vma->vm_flags)))
|
||||
goto finish_or_fault;
|
||||
|
||||
if (pages) {
|
||||
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
finish_or_fault:
|
||||
return i ? : -EFAULT;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* get a list of pages in an address range belonging to the specified process
|
||||
* and indicate the VMA that covers each page
|
||||
* - this is potentially dodgy as we may end incrementing the page count of a
|
||||
* slab page or a secondary page from a compound page
|
||||
* - don't permit access to VMAs that don't support it, such as I/O mappings
|
||||
*/
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, int write, int force,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
{
|
||||
int flags = 0;
|
||||
|
||||
if (write)
|
||||
flags |= GUP_FLAGS_WRITE;
|
||||
if (force)
|
||||
flags |= GUP_FLAGS_FORCE;
|
||||
|
||||
return __get_user_pages(tsk, mm,
|
||||
start, len, flags,
|
||||
pages, vmas);
|
||||
}
|
||||
EXPORT_SYMBOL(get_user_pages);
|
||||
|
||||
DEFINE_RWLOCK(vmlist_lock);
|
||||
|
@@ -7,7 +7,7 @@
* Contains functions related to writing back dirty pages at the
* address_space level.
*
* 10Apr2002 akpm@zip.com.au
* 10Apr2002 Andrew Morton
* Initial version
*/

@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
struct zone *z =
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];

x += zone_page_state(z, NR_FREE_PAGES)
+ zone_page_state(z, NR_INACTIVE)
+ zone_page_state(z, NR_ACTIVE);
x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
}
/*
* Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
{
unsigned long x;

x = global_page_state(NR_FREE_PAGES)
+ global_page_state(NR_INACTIVE)
+ global_page_state(NR_ACTIVE);
x = global_page_state(NR_FREE_PAGES) + global_lru_pages();

if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
long nr_to_write = wbc->nr_to_write;

if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -939,7 +936,7 @@ retry:
unlock_page(page);
ret = 0;
}
if (ret || (--(wbc->nr_to_write) <= 0))
if (ret || (--nr_to_write <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -958,11 +955,12 @@ retry:
index = 0;
goto retry;
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
if (!wbc->no_nrwrite_index_update) {
if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
mapping->writeback_index = index;
wbc->nr_to_write = nr_to_write;
}

if (wbc->range_cont)
wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);

136
mm/page_alloc.c
@@ -44,7 +44,7 @@
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/fault-inject.h>
|
||||
#include <linux/page-isolation.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/page_cgroup.h>
|
||||
#include <linux/debugobjects.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
|
||||
|
||||
static void bad_page(struct page *page)
|
||||
{
|
||||
void *pc = page_get_page_cgroup(page);
|
||||
|
||||
printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
|
||||
"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
|
||||
current->comm, page, (int)(2*sizeof(unsigned long)),
|
||||
(unsigned long)page->flags, page->mapping,
|
||||
page_mapcount(page), page_count(page));
|
||||
if (pc) {
|
||||
printk(KERN_EMERG "cgroup:%p\n", pc);
|
||||
page_reset_bad_cgroup(page);
|
||||
}
|
||||
|
||||
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
|
||||
KERN_EMERG "Backtrace:\n");
|
||||
dump_stack();
|
||||
@@ -268,13 +263,14 @@ void prep_compound_page(struct page *page, unsigned long order)
|
||||
{
|
||||
int i;
|
||||
int nr_pages = 1 << order;
|
||||
struct page *p = page + 1;
|
||||
|
||||
set_compound_page_dtor(page, free_compound_page);
|
||||
set_compound_order(page, order);
|
||||
__SetPageHead(page);
|
||||
for (i = 1; i < nr_pages; i++) {
|
||||
struct page *p = page + i;
|
||||
|
||||
for (i = 1; i < nr_pages; i++, p++) {
|
||||
if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
|
||||
p = pfn_to_page(page_to_pfn(page) + i);
|
||||
__SetPageTail(p);
|
||||
p->first_page = page;
|
||||
}
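A note on the loop change above: the tail-page pointer is now stepped with p++ and only re-derived via pfn_to_page() when i crosses a MAX_ORDER_NR_PAGES boundary, presumably because with classic SPARSEMEM the struct page array is only guaranteed to be virtually contiguous within such a block, so the incremental walk avoids a full pfn_to_page() translation for every tail page.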
|
||||
@@ -284,6 +280,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
|
||||
{
|
||||
int i;
|
||||
int nr_pages = 1 << order;
|
||||
struct page *p = page + 1;
|
||||
|
||||
if (unlikely(compound_order(page) != order))
|
||||
bad_page(page);
|
||||
@@ -291,8 +288,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
|
||||
if (unlikely(!PageHead(page)))
|
||||
bad_page(page);
|
||||
__ClearPageHead(page);
|
||||
for (i = 1; i < nr_pages; i++) {
|
||||
struct page *p = page + i;
|
||||
for (i = 1; i < nr_pages; i++, p++) {
|
||||
if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
|
||||
p = pfn_to_page(page_to_pfn(page) + i);
|
||||
|
||||
if (unlikely(!PageTail(p) |
|
||||
(p->first_page != page)))
|
||||
@@ -451,14 +449,16 @@ static inline void __free_one_page(struct page *page,
|
||||
|
||||
static inline int free_pages_check(struct page *page)
|
||||
{
|
||||
free_page_mlock(page);
|
||||
if (unlikely(page_mapcount(page) |
|
||||
(page->mapping != NULL) |
|
||||
(page_get_page_cgroup(page) != NULL) |
|
||||
(page_count(page) != 0) |
|
||||
(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
|
||||
bad_page(page);
|
||||
if (PageDirty(page))
|
||||
__ClearPageDirty(page);
|
||||
if (PageSwapBacked(page))
|
||||
__ClearPageSwapBacked(page);
|
||||
/*
|
||||
* For now, we report if PG_reserved was found set, but do not
|
||||
* clear it, and do not free the page. But we shall soon need
|
||||
@@ -597,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
|
||||
{
|
||||
if (unlikely(page_mapcount(page) |
|
||||
(page->mapping != NULL) |
|
||||
(page_get_page_cgroup(page) != NULL) |
|
||||
(page_count(page) != 0) |
|
||||
(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
|
||||
bad_page(page);
|
||||
@@ -611,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
|
||||
|
||||
page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
|
||||
1 << PG_referenced | 1 << PG_arch_1 |
|
||||
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
|
||||
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
| 1 << PG_mlocked
|
||||
#endif
|
||||
);
|
||||
set_page_private(page, 0);
|
||||
set_page_refcounted(page);
|
||||
|
||||
@@ -1859,10 +1862,21 @@ void show_free_areas(void)
|
||||
}
|
||||
}
|
||||
|
||||
printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
|
||||
printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
|
||||
" inactive_file:%lu"
|
||||
//TODO: check/adjust line lengths
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
" unevictable:%lu"
|
||||
#endif
|
||||
" dirty:%lu writeback:%lu unstable:%lu\n"
|
||||
" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
|
||||
global_page_state(NR_ACTIVE),
|
||||
global_page_state(NR_INACTIVE),
|
||||
global_page_state(NR_ACTIVE_ANON),
|
||||
global_page_state(NR_ACTIVE_FILE),
|
||||
global_page_state(NR_INACTIVE_ANON),
|
||||
global_page_state(NR_INACTIVE_FILE),
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
global_page_state(NR_UNEVICTABLE),
|
||||
#endif
|
||||
global_page_state(NR_FILE_DIRTY),
|
||||
global_page_state(NR_WRITEBACK),
|
||||
global_page_state(NR_UNSTABLE_NFS),
|
||||
@@ -1885,8 +1899,13 @@ void show_free_areas(void)
|
||||
" min:%lukB"
|
||||
" low:%lukB"
|
||||
" high:%lukB"
|
||||
" active:%lukB"
|
||||
" inactive:%lukB"
|
||||
" active_anon:%lukB"
|
||||
" inactive_anon:%lukB"
|
||||
" active_file:%lukB"
|
||||
" inactive_file:%lukB"
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
" unevictable:%lukB"
|
||||
#endif
|
||||
" present:%lukB"
|
||||
" pages_scanned:%lu"
|
||||
" all_unreclaimable? %s"
|
||||
@@ -1896,8 +1915,13 @@ void show_free_areas(void)
|
||||
K(zone->pages_min),
|
||||
K(zone->pages_low),
|
||||
K(zone->pages_high),
|
||||
K(zone_page_state(zone, NR_ACTIVE)),
|
||||
K(zone_page_state(zone, NR_INACTIVE)),
|
||||
K(zone_page_state(zone, NR_ACTIVE_ANON)),
|
||||
K(zone_page_state(zone, NR_INACTIVE_ANON)),
|
||||
K(zone_page_state(zone, NR_ACTIVE_FILE)),
|
||||
K(zone_page_state(zone, NR_INACTIVE_FILE)),
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
K(zone_page_state(zone, NR_UNEVICTABLE)),
|
||||
#endif
|
||||
K(zone->present_pages),
|
||||
zone->pages_scanned,
|
||||
(zone_is_all_unreclaimable(zone) ? "yes" : "no")
|
||||
@@ -3407,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
pgdat->nr_zones = 0;
|
||||
init_waitqueue_head(&pgdat->kswapd_wait);
|
||||
pgdat->kswapd_max_order = 0;
|
||||
pgdat_page_cgroup_init(pgdat);
|
||||
|
||||
for (j = 0; j < MAX_NR_ZONES; j++) {
|
||||
struct zone *zone = pgdat->node_zones + j;
|
||||
unsigned long size, realsize, memmap_pages;
|
||||
enum lru_list l;
|
||||
|
||||
size = zone_spanned_pages_in_node(nid, j, zones_size);
|
||||
realsize = size - zone_absent_pages_in_node(nid, j,
|
||||
@@ -3425,8 +3451,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
|
||||
if (realsize >= memmap_pages) {
|
||||
realsize -= memmap_pages;
|
||||
mminit_dprintk(MMINIT_TRACE, "memmap_init",
|
||||
"%s zone: %lu pages used for memmap\n",
|
||||
printk(KERN_DEBUG
|
||||
" %s zone: %lu pages used for memmap\n",
|
||||
zone_names[j], memmap_pages);
|
||||
} else
|
||||
printk(KERN_WARNING
|
||||
@@ -3436,8 +3462,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
/* Account for reserved pages */
|
||||
if (j == 0 && realsize > dma_reserve) {
|
||||
realsize -= dma_reserve;
|
||||
mminit_dprintk(MMINIT_TRACE, "memmap_init",
|
||||
"%s zone: %lu pages reserved\n",
|
||||
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
|
||||
zone_names[0], dma_reserve);
|
||||
}
|
||||
|
||||
@@ -3462,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
zone->prev_priority = DEF_PRIORITY;
|
||||
|
||||
zone_pcp_init(zone);
|
||||
INIT_LIST_HEAD(&zone->active_list);
|
||||
INIT_LIST_HEAD(&zone->inactive_list);
|
||||
zone->nr_scan_active = 0;
|
||||
zone->nr_scan_inactive = 0;
|
||||
for_each_lru(l) {
|
||||
INIT_LIST_HEAD(&zone->lru[l].list);
|
||||
zone->lru[l].nr_scan = 0;
|
||||
}
|
||||
zone->recent_rotated[0] = 0;
|
||||
zone->recent_rotated[1] = 0;
|
||||
zone->recent_scanned[0] = 0;
|
||||
zone->recent_scanned[1] = 0;
|
||||
zap_zone_vm_stats(zone);
|
||||
zone->flags = 0;
|
||||
if (!size)
|
||||
@@ -3949,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat)
|
||||
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
|
||||
{
|
||||
unsigned long nid;
|
||||
enum zone_type i;
|
||||
int i;
|
||||
|
||||
/* Sort early_node_map as initialisation assumes it is sorted */
|
||||
sort_node_map();
|
||||
@@ -4207,7 +4236,7 @@ void setup_per_zone_pages_min(void)
|
||||
for_each_zone(zone) {
|
||||
u64 tmp;
|
||||
|
||||
spin_lock_irqsave(&zone->lru_lock, flags);
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
tmp = (u64)pages_min * zone->present_pages;
|
||||
do_div(tmp, lowmem_pages);
|
||||
if (is_highmem(zone)) {
|
||||
@@ -4239,13 +4268,53 @@ void setup_per_zone_pages_min(void)
|
||||
zone->pages_low = zone->pages_min + (tmp >> 2);
|
||||
zone->pages_high = zone->pages_min + (tmp >> 1);
|
||||
setup_zone_migrate_reserve(zone);
|
||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
}
|
||||
|
||||
/* update totalreserve_pages */
|
||||
calculate_totalreserve_pages();
|
||||
}
|
||||
|
||||
/**
* setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
* to be referenced again before it is swapped out.
*
* The inactive_anon ratio is the target ratio of ACTIVE_ANON to
* INACTIVE_ANON pages on this zone's LRU, maintained by the
* pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
* the anonymous pages are kept on the inactive list.
*
*    total     target    max
*    memory    ratio     inactive anon
*    -------------------------------------
*    10MB      1         5MB
*    100MB     1         50MB
*    1GB       3         250MB
*    10GB      10        0.9GB
*    100GB     31        3GB
*    1TB       101       10GB
*    10TB      320       32GB
*/
void setup_per_zone_inactive_ratio(void)
{
struct zone *zone;

for_each_zone(zone) {
unsigned int gb, ratio;

/* Zone size in gigabytes */
gb = zone->present_pages >> (30 - PAGE_SHIFT);
ratio = int_sqrt(10 * gb);
if (!ratio)
ratio = 1;

zone->inactive_ratio = ratio;
}
}

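The table in the comment above follows directly from ratio = int_sqrt(10 * gb), clamped to a minimum of 1. A minimal userspace sketch of that arithmetic; the isqrt() helper is a stand-in for the kernel's int_sqrt(), and the zone sizes are made up to match the table:

#include <stdio.h>

/* Stand-in for the kernel's int_sqrt(): integer square root by bisection. */
static unsigned long isqrt(unsigned long x)
{
    unsigned long lo = 0, hi = x;

    while (lo < hi) {
        unsigned long mid = (lo + hi + 1) / 2;

        if (mid <= x / mid)
            lo = mid;
        else
            hi = mid - 1;
    }
    return lo;
}

int main(void)
{
    /* Hypothetical zone sizes in gigabytes; 0 covers the tiny-zone clamp. */
    unsigned long zones_gb[] = { 0, 1, 10, 100, 1024, 10240 };

    for (unsigned int i = 0; i < sizeof(zones_gb) / sizeof(zones_gb[0]); i++) {
        unsigned long gb = zones_gb[i];
        unsigned long ratio = isqrt(10 * gb);

        if (!ratio)
            ratio = 1;
        /* 1GB -> 3, 100GB -> 31, 1TB -> 101, 10TB -> 320, as in the table */
        printf("%6luGB -> inactive_ratio %lu\n", gb, ratio);
    }
    return 0;
}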
/*
* Initialise min_free_kbytes.
*
@@ -4283,6 +4352,7 @@ static int __init init_per_zone_pages_min(void)
min_free_kbytes = 65536;
setup_per_zone_pages_min();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
}
module_init(init_per_zone_pages_min)
mm/page_cgroup.c (new file, 256 lines)
@@ -0,0 +1,256 @@
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/bit_spinlock.h>
|
||||
#include <linux/page_cgroup.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/cgroup.h>
|
||||
|
||||
static void __meminit
|
||||
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
|
||||
{
|
||||
pc->flags = 0;
|
||||
pc->mem_cgroup = NULL;
|
||||
pc->page = pfn_to_page(pfn);
|
||||
}
|
||||
static unsigned long total_usage;
|
||||
|
||||
#if !defined(CONFIG_SPARSEMEM)
|
||||
|
||||
|
||||
void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
|
||||
{
|
||||
pgdat->node_page_cgroup = NULL;
|
||||
}
|
||||
|
||||
struct page_cgroup *lookup_page_cgroup(struct page *page)
|
||||
{
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
unsigned long offset;
|
||||
struct page_cgroup *base;
|
||||
|
||||
base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
|
||||
if (unlikely(!base))
|
||||
return NULL;
|
||||
|
||||
offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
|
||||
return base + offset;
|
||||
}
|
||||
|
||||
static int __init alloc_node_page_cgroup(int nid)
|
||||
{
|
||||
struct page_cgroup *base, *pc;
|
||||
unsigned long table_size;
|
||||
unsigned long start_pfn, nr_pages, index;
|
||||
|
||||
start_pfn = NODE_DATA(nid)->node_start_pfn;
|
||||
nr_pages = NODE_DATA(nid)->node_spanned_pages;
|
||||
|
||||
table_size = sizeof(struct page_cgroup) * nr_pages;
|
||||
|
||||
base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
|
||||
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
|
||||
if (!base)
|
||||
return -ENOMEM;
|
||||
for (index = 0; index < nr_pages; index++) {
|
||||
pc = base + index;
|
||||
__init_page_cgroup(pc, start_pfn + index);
|
||||
}
|
||||
NODE_DATA(nid)->node_page_cgroup = base;
|
||||
total_usage += table_size;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __init page_cgroup_init(void)
{

int nid, fail;

if (mem_cgroup_subsys.disabled)
return;

for_each_online_node(nid) {
fail = alloc_node_page_cgroup(nid);
if (fail)
goto fail;
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
printk(KERN_INFO "please try cgroup_disable=memory option if you"
" don't want\n");
return;
fail:
printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
panic("Out of memory");
}

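In the flat (non-sparsemem) case above, each node gets one contiguous array and a page's descriptor is found by plain offset arithmetic: base + (pfn - node_start_pfn). A userspace sketch of that indexing and of the per-node table cost it reports; the struct mirrors the three fields set by __init_page_cgroup(), and the node geometry is made up:

#include <stdio.h>
#include <stdlib.h>

/* Mirrors the fields initialised by __init_page_cgroup(); sizes are illustrative. */
struct page_cgroup {
    unsigned long flags;
    void *mem_cgroup;
    void *page;
};

/* Hypothetical node: 1 GiB of 4 KiB pages starting at pfn 0x40000. */
#define NODE_START_PFN  0x40000UL
#define NODE_NR_PAGES   (1UL << 18)     /* 262144 pages */

static struct page_cgroup *node_page_cgroup;

static struct page_cgroup *lookup(unsigned long pfn)
{
    /* Same arithmetic as the flat lookup_page_cgroup(). */
    return node_page_cgroup + (pfn - NODE_START_PFN);
}

int main(void)
{
    unsigned long table_size = sizeof(struct page_cgroup) * NODE_NR_PAGES;
    struct page_cgroup *pc;

    node_page_cgroup = calloc(NODE_NR_PAGES, sizeof(struct page_cgroup));
    if (!node_page_cgroup)
        return 1;

    printf("page_cgroup table: %lu bytes for %lu pages (%lu bytes/page)\n",
           table_size, NODE_NR_PAGES, table_size / NODE_NR_PAGES);

    /* Index an arbitrary pfn inside the node. */
    pc = lookup(NODE_START_PFN + 12345);
    printf("pfn 0x%lx lands in slot %ld\n",
           NODE_START_PFN + 12345, (long)(pc - node_page_cgroup));

    free(node_page_cgroup);
    return 0;
}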
#else /* CONFIG_FLAT_NODE_MEM_MAP */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);

return section->page_cgroup + pfn;
}

int __meminit init_section_page_cgroup(unsigned long pfn)
{
struct mem_section *section;
struct page_cgroup *base, *pc;
unsigned long table_size;
int nid, index;

section = __pfn_to_section(pfn);

if (section->page_cgroup)
return 0;

nid = page_to_nid(pfn_to_page(pfn));

table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
if (slab_is_available()) {
base = kmalloc_node(table_size, GFP_KERNEL, nid);
if (!base)
base = vmalloc_node(table_size, nid);
} else {
base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
}

if (!base) {
printk(KERN_ERR "page cgroup allocation failure\n");
return -ENOMEM;
}

for (index = 0; index < PAGES_PER_SECTION; index++) {
pc = base + index;
__init_page_cgroup(pc, pfn + index);
}

section = __pfn_to_section(pfn);
section->page_cgroup = base - pfn;
total_usage += table_size;
return 0;
}
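Note the biased store section->page_cgroup = base - pfn at the end of init_section_page_cgroup(): every pfn in a section shares the same bias, so the sparsemem lookup_page_cgroup() above can index with the raw pfn instead of recomputing an offset. A userspace sketch of the same pointer trick; the section size and pfns are hypothetical, and forming the out-of-bounds biased pointer is strictly undefined behaviour in ISO C (the kernel relies on its flat address space), so treat this purely as an illustration:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGES_PER_SECTION   (1UL << 15)     /* hypothetical: 32768 pages per section */

struct page_cgroup {
    unsigned long flags;
};

/* One "mem_section": stores a biased pointer, as the kernel code does. */
static struct page_cgroup *section_page_cgroup;

static void init_section(unsigned long section_start_pfn)
{
    struct page_cgroup *base = calloc(PAGES_PER_SECTION, sizeof(*base));

    assert(base);
    /* Bias the pointer so it can later be indexed by an absolute pfn. */
    section_page_cgroup = base - section_start_pfn;
}

static struct page_cgroup *lookup(unsigned long pfn)
{
    /* Same shape as the sparsemem lookup_page_cgroup(): no subtraction here. */
    return section_page_cgroup + pfn;
}

int main(void)
{
    unsigned long start = 7 * PAGES_PER_SECTION;    /* pretend this is section #7 */
    struct page_cgroup *first, *last;

    init_section(start);
    first = lookup(start);
    last  = lookup(start + PAGES_PER_SECTION - 1);

    printf("section covers %ld descriptors\n", (long)(last - first) + 1);
    free(first);    /* the unbiased base is the first slot's address */
    return 0;
}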
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
void __free_page_cgroup(unsigned long pfn)
|
||||
{
|
||||
struct mem_section *ms;
|
||||
struct page_cgroup *base;
|
||||
|
||||
ms = __pfn_to_section(pfn);
|
||||
if (!ms || !ms->page_cgroup)
|
||||
return;
|
||||
base = ms->page_cgroup + pfn;
|
||||
if (is_vmalloc_addr(base)) {
|
||||
vfree(base);
|
||||
ms->page_cgroup = NULL;
|
||||
} else {
|
||||
struct page *page = virt_to_page(base);
|
||||
if (!PageReserved(page)) { /* Is bootmem ? */
|
||||
kfree(base);
|
||||
ms->page_cgroup = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int online_page_cgroup(unsigned long start_pfn,
|
||||
unsigned long nr_pages,
|
||||
int nid)
|
||||
{
|
||||
unsigned long start, end, pfn;
|
||||
int fail = 0;
|
||||
|
||||
start = start_pfn & (PAGES_PER_SECTION - 1);
|
||||
end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
|
||||
|
||||
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
|
||||
if (!pfn_present(pfn))
|
||||
continue;
|
||||
fail = init_section_page_cgroup(pfn);
|
||||
}
|
||||
if (!fail)
|
||||
return 0;
|
||||
|
||||
/* rollback */
|
||||
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
|
||||
__free_page_cgroup(pfn);
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
int offline_page_cgroup(unsigned long start_pfn,
|
||||
unsigned long nr_pages, int nid)
|
||||
{
|
||||
unsigned long start, end, pfn;
|
||||
|
||||
start = start_pfn & (PAGES_PER_SECTION - 1);
|
||||
end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
|
||||
|
||||
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
|
||||
__free_page_cgroup(pfn);
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
static int page_cgroup_callback(struct notifier_block *self,
|
||||
unsigned long action, void *arg)
|
||||
{
|
||||
struct memory_notify *mn = arg;
|
||||
int ret = 0;
|
||||
switch (action) {
|
||||
case MEM_GOING_ONLINE:
|
||||
ret = online_page_cgroup(mn->start_pfn,
|
||||
mn->nr_pages, mn->status_change_nid);
|
||||
break;
|
||||
case MEM_CANCEL_ONLINE:
|
||||
case MEM_OFFLINE:
|
||||
offline_page_cgroup(mn->start_pfn,
|
||||
mn->nr_pages, mn->status_change_nid);
|
||||
break;
|
||||
case MEM_GOING_OFFLINE:
|
||||
break;
|
||||
case MEM_ONLINE:
|
||||
case MEM_CANCEL_OFFLINE:
|
||||
break;
|
||||
}
|
||||
ret = notifier_from_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void __init page_cgroup_init(void)
|
||||
{
|
||||
unsigned long pfn;
|
||||
int fail = 0;
|
||||
|
||||
if (mem_cgroup_subsys.disabled)
|
||||
return;
|
||||
|
||||
for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
|
||||
if (!pfn_present(pfn))
|
||||
continue;
|
||||
fail = init_section_page_cgroup(pfn);
|
||||
}
|
||||
if (fail) {
|
||||
printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
|
||||
panic("Out of memory");
|
||||
} else {
|
||||
hotplug_memory_notifier(page_cgroup_callback, 0);
|
||||
}
|
||||
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
|
||||
printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
|
||||
" want\n");
|
||||
}
|
||||
|
||||
void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
@@ -114,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)

int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
int ret;

pfn = start_pfn;
/*
@@ -131,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
if (pfn < end_pfn)
return -EBUSY;
/* Check all pages are free or Marked as ISOLATED */
if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
return 0;
return -EBUSY;
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
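The hunk above moves the "are all pages still isolated or free?" check under zone->lock so the answer cannot change while it is being computed. A userspace pthread sketch of the same check-under-lock pattern; everything here is illustrative (a plain flag stands in for the pageblock state, a mutex for the zone spinlock):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
static int isolated = 1;    /* pretend the range is currently isolated */

static int test_range_isolated(void)
{
    int ret;

    pthread_mutex_lock(&zone_lock);     /* spin_lock_irqsave(&zone->lock, flags) */
    ret = isolated;                     /* __test_page_isolated_in_pageblock()   */
    pthread_mutex_unlock(&zone_lock);

    return ret ? 0 : -1;                /* 0 on success, -EBUSY otherwise */
}

static void *allocator(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&zone_lock);
    isolated = 0;                       /* a racing allocation would clear it */
    pthread_mutex_unlock(&zone_lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, allocator, NULL);
    /* Either answer is valid, but it is always consistent with the lock holder. */
    printf("test_range_isolated() = %d\n", test_range_isolated());
    pthread_join(t, NULL);
    return 0;
}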
@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds.
*
* 09Apr2002 akpm@zip.com.au
* 09Apr2002 Andrew Morton
* Initial version
* 29Feb2004 kaos@sgi.com
* Move worker thread creation to kthread to avoid chewing

@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds
*
* 09Apr2002 akpm@zip.com.au
* 09Apr2002 Andrew Morton
* Initial version.
*/

@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
*/
unsigned long max_sane_readahead(unsigned long nr)
{
return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}
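The max_sane_readahead() change above only swaps NR_INACTIVE for NR_INACTIVE_FILE now that the LRU is split: the cap stays at half of the local node's (inactive + free) pages, it just counts inactive file pages only. A quick sketch of that cap with made-up counters:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* Same shape as max_sane_readahead(); the node counters are hypothetical. */
static unsigned long readahead_cap(unsigned long nr,
                                   unsigned long nr_inactive_file,
                                   unsigned long nr_free_pages)
{
    return min_ul(nr, (nr_inactive_file + nr_free_pages) / 2);
}

int main(void)
{
    /* e.g. 300000 easily reclaimable file pages and 50000 free pages */
    unsigned long cap = readahead_cap(1UL << 20, 300000, 50000);

    printf("readahead capped at %lu pages\n", cap);  /* prints 175000 */
    return 0;
}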
mm/rmap.c (319 lines changed)
@@ -53,9 +53,47 @@
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
struct kmem_cache *anon_vma_cachep;
|
||||
#include "internal.h"
|
||||
|
||||
/* This must be called under the mmap_sem. */
|
||||
static struct kmem_cache *anon_vma_cachep;
|
||||
|
||||
static inline struct anon_vma *anon_vma_alloc(void)
|
||||
{
|
||||
return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
|
||||
}
|
||||
|
||||
static inline void anon_vma_free(struct anon_vma *anon_vma)
|
||||
{
|
||||
kmem_cache_free(anon_vma_cachep, anon_vma);
|
||||
}
|
||||
|
||||
/**
|
||||
* anon_vma_prepare - attach an anon_vma to a memory region
|
||||
* @vma: the memory region in question
|
||||
*
|
||||
* This makes sure the memory mapping described by 'vma' has
|
||||
* an 'anon_vma' attached to it, so that we can associate the
|
||||
* anonymous pages mapped into it with that anon_vma.
|
||||
*
|
||||
* The common case will be that we already have one, but if
|
||||
* if not we either need to find an adjacent mapping that we
|
||||
* can re-use the anon_vma from (very common when the only
|
||||
* reason for splitting a vma has been mprotect()), or we
|
||||
* allocate a new one.
|
||||
*
|
||||
* Anon-vma allocations are very subtle, because we may have
|
||||
* optimistically looked up an anon_vma in page_lock_anon_vma()
|
||||
* and that may actually touch the spinlock even in the newly
|
||||
* allocated vma (it depends on RCU to make sure that the
|
||||
* anon_vma isn't actually destroyed).
|
||||
*
|
||||
* As a result, we need to do proper anon_vma locking even
|
||||
* for the new allocation. At the same time, we do not want
|
||||
* to do any locking for the common case of already having
|
||||
* an anon_vma.
|
||||
*
|
||||
* This must be called with the mmap_sem held for reading.
|
||||
*/
|
||||
int anon_vma_prepare(struct vm_area_struct *vma)
|
||||
{
|
||||
struct anon_vma *anon_vma = vma->anon_vma;
|
||||
@@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
|
||||
might_sleep();
|
||||
if (unlikely(!anon_vma)) {
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct anon_vma *allocated, *locked;
|
||||
struct anon_vma *allocated;
|
||||
|
||||
anon_vma = find_mergeable_anon_vma(vma);
|
||||
if (anon_vma) {
|
||||
allocated = NULL;
|
||||
locked = anon_vma;
|
||||
spin_lock(&locked->lock);
|
||||
} else {
|
||||
allocated = NULL;
|
||||
if (!anon_vma) {
|
||||
anon_vma = anon_vma_alloc();
|
||||
if (unlikely(!anon_vma))
|
||||
return -ENOMEM;
|
||||
allocated = anon_vma;
|
||||
locked = NULL;
|
||||
}
|
||||
spin_lock(&anon_vma->lock);
|
||||
|
||||
/* page_table_lock to protect against threads */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
@@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
|
||||
if (locked)
|
||||
spin_unlock(&locked->lock);
|
||||
spin_unlock(&anon_vma->lock);
|
||||
if (unlikely(allocated))
|
||||
anon_vma_free(allocated);
|
||||
}
|
||||
@@ -157,7 +191,7 @@ void __init anon_vma_init(void)
|
||||
* Getting a lock on a stable anon_vma from a page off the LRU is
|
||||
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
|
||||
*/
|
||||
static struct anon_vma *page_lock_anon_vma(struct page *page)
|
||||
struct anon_vma *page_lock_anon_vma(struct page *page)
|
||||
{
|
||||
struct anon_vma *anon_vma;
|
||||
unsigned long anon_mapping;
|
||||
@@ -177,7 +211,7 @@ out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void page_unlock_anon_vma(struct anon_vma *anon_vma)
|
||||
void page_unlock_anon_vma(struct anon_vma *anon_vma)
|
||||
{
|
||||
spin_unlock(&anon_vma->lock);
|
||||
rcu_read_unlock();
|
||||
@@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* page_mapped_in_vma - check whether a page is really mapped in a VMA
|
||||
* @page: the page to test
|
||||
* @vma: the VMA to test
|
||||
*
|
||||
* Returns 1 if the page is mapped into the page tables of the VMA, 0
|
||||
* if the page is not mapped into the page tables of this VMA. Only
|
||||
* valid for normal file or anonymous VMAs.
|
||||
*/
|
||||
static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long address;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
|
||||
address = vma_address(page, vma);
|
||||
if (address == -EFAULT) /* out of vma range */
|
||||
return 0;
|
||||
pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
|
||||
if (!pte) /* the page is not in this mm */
|
||||
return 0;
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Subfunctions of page_referenced: page_referenced_one called
|
||||
* repeatedly from either page_referenced_anon or page_referenced_file.
|
||||
@@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page,
|
||||
if (!pte)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Don't want to elevate referenced for mlocked page that gets this far,
|
||||
* in order that it progresses to try_to_unmap and is moved to the
|
||||
* unevictable list.
|
||||
*/
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
referenced++;
|
||||
*mapcount = 1; /* break early from loop */
|
||||
} else if (ptep_clear_flush_young_notify(vma, address, pte))
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
if (ptep_clear_flush_young_notify(vma, address, pte))
|
||||
referenced++;
|
||||
|
||||
/* Pretend the page is referenced if the task has the
|
||||
@@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page,
|
||||
rwsem_is_locked(&mm->mmap_sem))
|
||||
referenced++;
|
||||
|
||||
out_unmap:
|
||||
(*mapcount)--;
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
out:
|
||||
@@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page,
|
||||
*/
|
||||
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
|
||||
continue;
|
||||
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
|
||||
== (VM_LOCKED|VM_MAYSHARE)) {
|
||||
referenced++;
|
||||
break;
|
||||
}
|
||||
referenced += page_referenced_one(page, vma, &mapcount);
|
||||
if (!mapcount)
|
||||
break;
|
||||
@@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
|
||||
page_clear_dirty(page);
|
||||
set_page_dirty(page);
|
||||
}
|
||||
|
||||
mem_cgroup_uncharge_page(page);
|
||||
if (PageAnon(page))
|
||||
mem_cgroup_uncharge_page(page);
|
||||
__dec_zone_page_state(page,
|
||||
PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
|
||||
/*
|
||||
@@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
* If it's recently referenced (perhaps page_referenced
|
||||
* skipped over this mm) then we should reactivate it.
|
||||
*/
|
||||
if (!migration && ((vma->vm_flags & VM_LOCKED) ||
|
||||
(ptep_clear_flush_young_notify(vma, address, pte)))) {
|
||||
ret = SWAP_FAIL;
|
||||
goto out_unmap;
|
||||
}
|
||||
if (!migration) {
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
ret = SWAP_MLOCK;
|
||||
goto out_unmap;
|
||||
}
|
||||
if (ptep_clear_flush_young_notify(vma, address, pte)) {
|
||||
ret = SWAP_FAIL;
|
||||
goto out_unmap;
|
||||
}
|
||||
}
|
||||
|
||||
/* Nuke the page table entry. */
|
||||
flush_cache_page(vma, address, page_to_pfn(page));
|
||||
@@ -802,12 +870,17 @@ out:
|
||||
* For very sparsely populated VMAs this is a little inefficient - chances are
|
||||
* there there won't be many ptes located within the scan cluster. In this case
|
||||
* maybe we could scan further - to the end of the pte page, perhaps.
|
||||
*
|
||||
* Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
|
||||
* acquire it without blocking. If vma locked, mlock the pages in the cluster,
|
||||
* rather than unmapping them. If we encounter the "check_page" that vmscan is
|
||||
* trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
|
||||
*/
|
||||
#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
|
||||
#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
|
||||
|
||||
static void try_to_unmap_cluster(unsigned long cursor,
|
||||
unsigned int *mapcount, struct vm_area_struct *vma)
|
||||
static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
|
||||
struct vm_area_struct *vma, struct page *check_page)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
@@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
|
||||
struct page *page;
|
||||
unsigned long address;
|
||||
unsigned long end;
|
||||
int ret = SWAP_AGAIN;
|
||||
int locked_vma = 0;
|
||||
|
||||
address = (vma->vm_start + cursor) & CLUSTER_MASK;
|
||||
end = address + CLUSTER_SIZE;
|
||||
@@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
return;
|
||||
return ret;
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
if (!pud_present(*pud))
|
||||
return;
|
||||
return ret;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (!pmd_present(*pmd))
|
||||
return;
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* MLOCK_PAGES => feature is configured.
|
||||
* if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
|
||||
* keep the sem while scanning the cluster for mlocking pages.
|
||||
*/
|
||||
if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
|
||||
locked_vma = (vma->vm_flags & VM_LOCKED);
|
||||
if (!locked_vma)
|
||||
up_read(&vma->vm_mm->mmap_sem); /* don't need it */
|
||||
}
|
||||
|
||||
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
|
||||
|
||||
@@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
|
||||
page = vm_normal_page(vma, address, *pte);
|
||||
BUG_ON(!page || PageAnon(page));
|
||||
|
||||
if (locked_vma) {
|
||||
mlock_vma_page(page); /* no-op if already mlocked */
|
||||
if (page == check_page)
|
||||
ret = SWAP_MLOCK;
|
||||
continue; /* don't unmap */
|
||||
}
|
||||
|
||||
if (ptep_clear_flush_young_notify(vma, address, pte))
|
||||
continue;
|
||||
|
||||
@@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
|
||||
(*mapcount)--;
|
||||
}
|
||||
pte_unmap_unlock(pte - 1, ptl);
|
||||
if (locked_vma)
|
||||
up_read(&vma->vm_mm->mmap_sem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int try_to_unmap_anon(struct page *page, int migration)
|
||||
/*
|
||||
* common handling for pages mapped in VM_LOCKED vmas
|
||||
*/
|
||||
static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
int mlocked = 0;
|
||||
|
||||
if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
mlock_vma_page(page);
|
||||
mlocked++; /* really mlocked the page */
|
||||
}
|
||||
up_read(&vma->vm_mm->mmap_sem);
|
||||
}
|
||||
return mlocked;
|
||||
}
|
||||
|
||||
/**
|
||||
* try_to_unmap_anon - unmap or unlock anonymous page using the object-based
|
||||
* rmap method
|
||||
* @page: the page to unmap/unlock
|
||||
* @unlock: request for unlock rather than unmap [unlikely]
|
||||
* @migration: unmapping for migration - ignored if @unlock
|
||||
*
|
||||
* Find all the mappings of a page using the mapping pointer and the vma chains
|
||||
* contained in the anon_vma struct it points to.
|
||||
*
|
||||
* This function is only called from try_to_unmap/try_to_munlock for
|
||||
* anonymous pages.
|
||||
* When called from try_to_munlock(), the mmap_sem of the mm containing the vma
|
||||
* where the page was found will be held for write. So, we won't recheck
|
||||
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
|
||||
* 'LOCKED.
|
||||
*/
|
||||
static int try_to_unmap_anon(struct page *page, int unlock, int migration)
|
||||
{
|
||||
struct anon_vma *anon_vma;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned int mlocked = 0;
|
||||
int ret = SWAP_AGAIN;
|
||||
|
||||
if (MLOCK_PAGES && unlikely(unlock))
|
||||
ret = SWAP_SUCCESS; /* default for try_to_munlock() */
|
||||
|
||||
anon_vma = page_lock_anon_vma(page);
|
||||
if (!anon_vma)
|
||||
return ret;
|
||||
|
||||
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
|
||||
ret = try_to_unmap_one(page, vma, migration);
|
||||
if (ret == SWAP_FAIL || !page_mapped(page))
|
||||
break;
|
||||
if (MLOCK_PAGES && unlikely(unlock)) {
|
||||
if (!((vma->vm_flags & VM_LOCKED) &&
|
||||
page_mapped_in_vma(page, vma)))
|
||||
continue; /* must visit all unlocked vmas */
|
||||
ret = SWAP_MLOCK; /* saw at least one mlocked vma */
|
||||
} else {
|
||||
ret = try_to_unmap_one(page, vma, migration);
|
||||
if (ret == SWAP_FAIL || !page_mapped(page))
|
||||
break;
|
||||
}
|
||||
if (ret == SWAP_MLOCK) {
|
||||
mlocked = try_to_mlock_page(page, vma);
|
||||
if (mlocked)
|
||||
break; /* stop if actually mlocked page */
|
||||
}
|
||||
}
|
||||
|
||||
page_unlock_anon_vma(anon_vma);
|
||||
|
||||
if (mlocked)
|
||||
ret = SWAP_MLOCK; /* actually mlocked the page */
|
||||
else if (ret == SWAP_MLOCK)
|
||||
ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* try_to_unmap_file - unmap file page using the object-based rmap method
|
||||
* @page: the page to unmap
|
||||
* @migration: migration flag
|
||||
* try_to_unmap_file - unmap/unlock file page using the object-based rmap method
|
||||
* @page: the page to unmap/unlock
|
||||
* @unlock: request for unlock rather than unmap [unlikely]
|
||||
* @migration: unmapping for migration - ignored if @unlock
|
||||
*
|
||||
* Find all the mappings of a page using the mapping pointer and the vma chains
|
||||
* contained in the address_space struct it points to.
|
||||
*
|
||||
* This function is only called from try_to_unmap for object-based pages.
|
||||
* This function is only called from try_to_unmap/try_to_munlock for
|
||||
* object-based pages.
|
||||
* When called from try_to_munlock(), the mmap_sem of the mm containing the vma
|
||||
* where the page was found will be held for write. So, we won't recheck
|
||||
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
|
||||
* 'LOCKED.
|
||||
*/
|
||||
static int try_to_unmap_file(struct page *page, int migration)
|
||||
static int try_to_unmap_file(struct page *page, int unlock, int migration)
|
||||
{
|
||||
struct address_space *mapping = page->mapping;
|
||||
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
|
||||
@@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
|
||||
unsigned long max_nl_cursor = 0;
|
||||
unsigned long max_nl_size = 0;
|
||||
unsigned int mapcount;
|
||||
unsigned int mlocked = 0;
|
||||
|
||||
if (MLOCK_PAGES && unlikely(unlock))
|
||||
ret = SWAP_SUCCESS; /* default for try_to_munlock() */
|
||||
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
|
||||
ret = try_to_unmap_one(page, vma, migration);
|
||||
if (ret == SWAP_FAIL || !page_mapped(page))
|
||||
goto out;
|
||||
if (MLOCK_PAGES && unlikely(unlock)) {
|
||||
if (!(vma->vm_flags & VM_LOCKED))
|
||||
continue; /* must visit all vmas */
|
||||
ret = SWAP_MLOCK;
|
||||
} else {
|
||||
ret = try_to_unmap_one(page, vma, migration);
|
||||
if (ret == SWAP_FAIL || !page_mapped(page))
|
||||
goto out;
|
||||
}
|
||||
if (ret == SWAP_MLOCK) {
|
||||
mlocked = try_to_mlock_page(page, vma);
|
||||
if (mlocked)
|
||||
break; /* stop if actually mlocked page */
|
||||
}
|
||||
}
|
||||
|
||||
if (mlocked)
|
||||
goto out;
|
||||
|
||||
if (list_empty(&mapping->i_mmap_nonlinear))
|
||||
goto out;
|
||||
|
||||
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
|
||||
shared.vm_set.list) {
|
||||
if ((vma->vm_flags & VM_LOCKED) && !migration)
|
||||
if (MLOCK_PAGES && unlikely(unlock)) {
|
||||
if (!(vma->vm_flags & VM_LOCKED))
|
||||
continue; /* must visit all vmas */
|
||||
ret = SWAP_MLOCK; /* leave mlocked == 0 */
|
||||
goto out; /* no need to look further */
|
||||
}
|
||||
if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
|
||||
continue;
|
||||
cursor = (unsigned long) vma->vm_private_data;
|
||||
if (cursor > max_nl_cursor)
|
||||
@@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
|
||||
max_nl_size = cursor;
|
||||
}
|
||||
|
||||
if (max_nl_size == 0) { /* any nonlinears locked or reserved */
|
||||
if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
|
||||
ret = SWAP_FAIL;
|
||||
goto out;
|
||||
}
|
||||
@@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
|
||||
do {
|
||||
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
|
||||
shared.vm_set.list) {
|
||||
if ((vma->vm_flags & VM_LOCKED) && !migration)
|
||||
if (!MLOCK_PAGES && !migration &&
|
||||
(vma->vm_flags & VM_LOCKED))
|
||||
continue;
|
||||
cursor = (unsigned long) vma->vm_private_data;
|
||||
while ( cursor < max_nl_cursor &&
|
||||
cursor < vma->vm_end - vma->vm_start) {
|
||||
try_to_unmap_cluster(cursor, &mapcount, vma);
|
||||
ret = try_to_unmap_cluster(cursor, &mapcount,
|
||||
vma, page);
|
||||
if (ret == SWAP_MLOCK)
|
||||
mlocked = 2; /* to return below */
|
||||
cursor += CLUSTER_SIZE;
|
||||
vma->vm_private_data = (void *) cursor;
|
||||
if ((int)mapcount <= 0)
|
||||
@@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
|
||||
vma->vm_private_data = NULL;
|
||||
out:
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
if (mlocked)
|
||||
ret = SWAP_MLOCK; /* actually mlocked the page */
|
||||
else if (ret == SWAP_MLOCK)
|
||||
ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1002,6 +1192,7 @@ out:
|
||||
* SWAP_SUCCESS - we succeeded in removing all mappings
|
||||
* SWAP_AGAIN - we missed a mapping, try again later
|
||||
* SWAP_FAIL - the page is unswappable
|
||||
* SWAP_MLOCK - page is mlocked.
|
||||
*/
|
||||
int try_to_unmap(struct page *page, int migration)
|
||||
{
|
||||
@@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
if (PageAnon(page))
|
||||
ret = try_to_unmap_anon(page, migration);
|
||||
ret = try_to_unmap_anon(page, 0, migration);
|
||||
else
|
||||
ret = try_to_unmap_file(page, migration);
|
||||
|
||||
if (!page_mapped(page))
|
||||
ret = try_to_unmap_file(page, 0, migration);
|
||||
if (ret != SWAP_MLOCK && !page_mapped(page))
|
||||
ret = SWAP_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UNEVICTABLE_LRU
|
||||
/**
|
||||
* try_to_munlock - try to munlock a page
|
||||
* @page: the page to be munlocked
|
||||
*
|
||||
* Called from munlock code. Checks all of the VMAs mapping the page
|
||||
* to make sure nobody else has this page mlocked. The page will be
|
||||
* returned with PG_mlocked cleared if no other vmas have it mlocked.
|
||||
*
|
||||
* Return values are:
|
||||
*
|
||||
* SWAP_SUCCESS - no vma's holding page mlocked.
|
||||
* SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
|
||||
* SWAP_MLOCK - page is now mlocked.
|
||||
*/
|
||||
int try_to_munlock(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageLocked(page) || PageLRU(page));
|
||||
|
||||
if (PageAnon(page))
|
||||
return try_to_unmap_anon(page, 1, 0);
|
||||
else
|
||||
return try_to_unmap_file(page, 1, 0);
|
||||
}
|
||||
#endif
|
||||
mm/shmem.c (12 lines changed)
@@ -50,14 +50,12 @@
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/magic.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/div64.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
/* This magic number is used in glibc for posix shared memory */
|
||||
#define TMPFS_MAGIC 0x01021994
|
||||
|
||||
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
|
||||
#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
|
||||
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
|
||||
@@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
|
||||
|
||||
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
|
||||
.ra_pages = 0, /* No readahead */
|
||||
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
|
||||
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
|
||||
.unplug_io_fn = default_unplug_io_fn,
|
||||
};
|
||||
|
||||
@@ -1369,6 +1367,7 @@ repeat:
|
||||
error = -ENOMEM;
|
||||
goto failed;
|
||||
}
|
||||
SetPageSwapBacked(filepage);
|
||||
|
||||
/* Precharge page while we can wait, compensate after */
|
||||
error = mem_cgroup_cache_charge(filepage, current->mm,
|
||||
@@ -1478,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
|
||||
if (!user_shm_lock(inode->i_size, user))
|
||||
goto out_nomem;
|
||||
info->flags |= VM_LOCKED;
|
||||
mapping_set_unevictable(file->f_mapping);
|
||||
}
|
||||
if (!lock && (info->flags & VM_LOCKED) && user) {
|
||||
user_shm_unlock(inode->i_size, user);
|
||||
info->flags &= ~VM_LOCKED;
|
||||
mapping_clear_unevictable(file->f_mapping);
|
||||
scan_mapping_unevictable_pages(file->f_mapping);
|
||||
}
|
||||
retval = 0;
|
||||
|
||||
out_nomem:
|
||||
spin_unlock(&info->lock);
|
||||
return retval;
|
||||
@@ -2582,6 +2585,7 @@ put_memory:
|
||||
shmem_unacct_size(flags, size);
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(shmem_file_setup);
|
||||
|
||||
/**
|
||||
* shmem_zero_setup - setup a shared anonymous mapping
|
||||
mm/slab.c (52 lines changed)
@@ -95,6 +95,7 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/kallsyms.h>
|
||||
@@ -4258,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p)
|
||||
* + further values on SMP and with statistics enabled
|
||||
*/
|
||||
|
||||
const struct seq_operations slabinfo_op = {
|
||||
static const struct seq_operations slabinfo_op = {
|
||||
.start = s_start,
|
||||
.next = s_next,
|
||||
.stop = s_stop,
|
||||
@@ -4315,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
|
||||
return res;
|
||||
}
|
||||
|
||||
static int slabinfo_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &slabinfo_op);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_slabinfo_operations = {
|
||||
.open = slabinfo_open,
|
||||
.read = seq_read,
|
||||
.write = slabinfo_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEBUG_SLAB_LEAK
|
||||
|
||||
static void *leaks_start(struct seq_file *m, loff_t *pos)
|
||||
@@ -4443,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p)
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct seq_operations slabstats_op = {
|
||||
static const struct seq_operations slabstats_op = {
|
||||
.start = leaks_start,
|
||||
.next = s_next,
|
||||
.stop = s_stop,
|
||||
.show = leaks_show,
|
||||
};
|
||||
|
||||
static int slabstats_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
int ret = -ENOMEM;
|
||||
if (n) {
|
||||
ret = seq_open(file, &slabstats_op);
|
||||
if (!ret) {
|
||||
struct seq_file *m = file->private_data;
|
||||
*n = PAGE_SIZE / (2 * sizeof(unsigned long));
|
||||
m->private = n;
|
||||
n = NULL;
|
||||
}
|
||||
kfree(n);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct file_operations proc_slabstats_operations = {
|
||||
.open = slabstats_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_private,
|
||||
};
|
||||
#endif
|
||||
|
||||
static int __init slab_proc_init(void)
|
||||
{
|
||||
proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
|
||||
#ifdef CONFIG_DEBUG_SLAB_LEAK
|
||||
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
module_init(slab_proc_init);
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@@ -514,9 +514,11 @@ size_t ksize(const void *block)
|
||||
return 0;
|
||||
|
||||
sp = (struct slob_page *)virt_to_page(block);
|
||||
if (slob_page(sp))
|
||||
return ((slob_t *)block - 1)->units + SLOB_UNIT;
|
||||
else
|
||||
if (slob_page(sp)) {
|
||||
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
|
||||
unsigned int *m = (unsigned int *)(block - align);
|
||||
return SLOB_UNITS(*m) * SLOB_UNIT;
|
||||
} else
|
||||
return sp->page.private;
|
||||
}
|
||||
|
||||
mm/slub.c (29 lines changed)
@@ -14,6 +14,7 @@
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuset.h>
|
||||
@@ -4417,14 +4418,6 @@ __initcall(slab_sysfs_init);
|
||||
* The /proc/slabinfo ABI
|
||||
*/
|
||||
#ifdef CONFIG_SLABINFO
|
||||
|
||||
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
||||
static void print_slabinfo_header(struct seq_file *m)
|
||||
{
|
||||
seq_puts(m, "slabinfo - version: 2.1\n");
|
||||
@@ -4492,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p)
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct seq_operations slabinfo_op = {
|
||||
static const struct seq_operations slabinfo_op = {
|
||||
.start = s_start,
|
||||
.next = s_next,
|
||||
.stop = s_stop,
|
||||
.show = s_show,
|
||||
};
|
||||
|
||||
static int slabinfo_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &slabinfo_op);
|
||||
}
|
||||
|
||||
static const struct file_operations proc_slabinfo_operations = {
|
||||
.open = slabinfo_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static int __init slab_proc_init(void)
|
||||
{
|
||||
proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
|
||||
return 0;
|
||||
}
|
||||
module_init(slab_proc_init);
|
||||
#endif /* CONFIG_SLABINFO */
|
||||
mm/swap.c (184 lines changed)
@@ -31,11 +31,12 @@
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/memcontrol.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
/* How many pages do we try to swap or page in/out together? */
|
||||
int page_cluster;
|
||||
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
|
||||
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
|
||||
|
||||
/*
|
||||
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
|
||||
zone = pagezone;
|
||||
spin_lock(&zone->lru_lock);
|
||||
}
|
||||
if (PageLRU(page) && !PageActive(page)) {
|
||||
list_move_tail(&page->lru, &zone->inactive_list);
|
||||
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
|
||||
int lru = page_is_file_cache(page);
|
||||
list_move_tail(&page->lru, &zone->lru[lru].list);
|
||||
pgmoved++;
|
||||
}
|
||||
}
|
||||
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
|
||||
void rotate_reclaimable_page(struct page *page)
|
||||
{
|
||||
if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
|
||||
PageLRU(page)) {
|
||||
!PageUnevictable(page) && PageLRU(page)) {
|
||||
struct pagevec *pvec;
|
||||
unsigned long flags;
|
||||
|
||||
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
|
||||
struct zone *zone = page_zone(page);
|
||||
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
if (PageLRU(page) && !PageActive(page)) {
|
||||
del_page_from_inactive_list(zone, page);
|
||||
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
|
||||
int file = page_is_file_cache(page);
|
||||
int lru = LRU_BASE + file;
|
||||
del_page_from_lru_list(zone, page, lru);
|
||||
|
||||
SetPageActive(page);
|
||||
add_page_to_active_list(zone, page);
|
||||
lru += LRU_ACTIVE;
|
||||
add_page_to_lru_list(zone, page, lru);
|
||||
__count_vm_event(PGACTIVATE);
|
||||
mem_cgroup_move_lists(page, true);
|
||||
mem_cgroup_move_lists(page, lru);
|
||||
|
||||
zone->recent_rotated[!!file]++;
|
||||
zone->recent_scanned[!!file]++;
|
||||
}
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
}
|
||||
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
|
||||
*/
|
||||
void mark_page_accessed(struct page *page)
|
||||
{
|
||||
if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
|
||||
if (!PageActive(page) && !PageUnevictable(page) &&
|
||||
PageReferenced(page) && PageLRU(page)) {
|
||||
activate_page(page);
|
||||
ClearPageReferenced(page);
|
||||
} else if (!PageReferenced(page)) {
|
||||
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
|
||||
|
||||
EXPORT_SYMBOL(mark_page_accessed);
|
||||
|
||||
/**
|
||||
* lru_cache_add: add a page to the page lists
|
||||
* @page: the page to add
|
||||
*/
|
||||
void lru_cache_add(struct page *page)
|
||||
void __lru_cache_add(struct page *page, enum lru_list lru)
|
||||
{
|
||||
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
|
||||
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
|
||||
|
||||
page_cache_get(page);
|
||||
if (!pagevec_add(pvec, page))
|
||||
__pagevec_lru_add(pvec);
|
||||
____pagevec_lru_add(pvec, lru);
|
||||
put_cpu_var(lru_add_pvecs);
|
||||
}
|
||||
|
||||
void lru_cache_add_active(struct page *page)
|
||||
/**
|
||||
* lru_cache_add_lru - add a page to a page list
|
||||
* @page: the page to be added to the LRU.
|
||||
* @lru: the LRU list to which the page is added.
|
||||
*/
|
||||
void lru_cache_add_lru(struct page *page, enum lru_list lru)
|
||||
{
|
||||
struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
|
||||
if (PageActive(page)) {
|
||||
VM_BUG_ON(PageUnevictable(page));
|
||||
ClearPageActive(page);
|
||||
} else if (PageUnevictable(page)) {
|
||||
VM_BUG_ON(PageActive(page));
|
||||
ClearPageUnevictable(page);
|
||||
}
|
||||
|
||||
page_cache_get(page);
|
||||
if (!pagevec_add(pvec, page))
|
||||
__pagevec_lru_add_active(pvec);
|
||||
put_cpu_var(lru_add_active_pvecs);
|
||||
VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
|
||||
__lru_cache_add(page, lru);
|
||||
}
|
||||
|
||||
/**
|
||||
* add_page_to_unevictable_list - add a page to the unevictable list
|
||||
* @page: the page to be added to the unevictable list
|
||||
*
|
||||
* Add page directly to its zone's unevictable list. To avoid races with
|
||||
* tasks that might be making the page evictable, through eg. munlock,
|
||||
* munmap or exit, while it's not on the lru, we want to add the page
|
||||
* while it's locked or otherwise "invisible" to other tasks. This is
|
||||
* difficult to do when using the pagevec cache, so bypass that.
|
||||
*/
|
||||
void add_page_to_unevictable_list(struct page *page)
|
||||
{
|
||||
struct zone *zone = page_zone(page);
|
||||
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
SetPageUnevictable(page);
|
||||
SetPageLRU(page);
|
||||
add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* lru_cache_add_active_or_unevictable
|
||||
* @page: the page to be added to LRU
|
||||
* @vma: vma in which page is mapped for determining reclaimability
|
||||
*
|
||||
* place @page on active or unevictable LRU list, depending on
|
||||
* page_evictable(). Note that if the page is not evictable,
|
||||
* it goes directly back onto it's zone's unevictable list. It does
|
||||
* NOT use a per cpu pagevec.
|
||||
*/
|
||||
void lru_cache_add_active_or_unevictable(struct page *page,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
if (page_evictable(page, vma))
|
||||
lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
|
||||
else
|
||||
add_page_to_unevictable_list(page);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
|
||||
*/
|
||||
static void drain_cpu_pagevecs(int cpu)
|
||||
{
|
||||
struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
|
||||
struct pagevec *pvec;
|
||||
int lru;
|
||||
|
||||
pvec = &per_cpu(lru_add_pvecs, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
__pagevec_lru_add(pvec);
|
||||
|
||||
pvec = &per_cpu(lru_add_active_pvecs, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
__pagevec_lru_add_active(pvec);
|
||||
for_each_lru(lru) {
|
||||
pvec = &pvecs[lru - LRU_BASE];
|
||||
if (pagevec_count(pvec))
|
||||
____pagevec_lru_add(pvec, lru);
|
||||
}
|
||||
|
||||
pvec = &per_cpu(lru_rotate_pvecs, cpu);
|
||||
if (pagevec_count(pvec)) {
|
||||
@@ -244,7 +299,7 @@ void lru_add_drain(void)
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
|
||||
static void lru_add_drain_per_cpu(struct work_struct *dummy)
|
||||
{
|
||||
lru_add_drain();
|
||||
@@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
|
||||
|
||||
if (PageLRU(page)) {
|
||||
struct zone *pagezone = page_zone(page);
|
||||
|
||||
if (pagezone != zone) {
|
||||
if (zone)
|
||||
spin_unlock_irqrestore(&zone->lru_lock,
|
||||
@@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
|
||||
* Add the passed pages to the LRU, then drop the caller's refcount
|
||||
* on them. Reinitialises the caller's pagevec.
|
||||
*/
|
||||
void __pagevec_lru_add(struct pagevec *pvec)
|
||||
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
|
||||
{
|
||||
int i;
|
||||
struct zone *zone = NULL;
|
||||
VM_BUG_ON(is_unevictable_lru(lru));
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
@@ -395,38 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
|
||||
zone = pagezone;
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
}
|
||||
VM_BUG_ON(PageLRU(page));
|
||||
SetPageLRU(page);
|
||||
add_page_to_inactive_list(zone, page);
|
||||
}
|
||||
if (zone)
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
release_pages(pvec->pages, pvec->nr, pvec->cold);
|
||||
pagevec_reinit(pvec);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(__pagevec_lru_add);
|
||||
|
||||
void __pagevec_lru_add_active(struct pagevec *pvec)
|
||||
{
|
||||
int i;
|
||||
struct zone *zone = NULL;
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct zone *pagezone = page_zone(page);
|
||||
|
||||
if (pagezone != zone) {
|
||||
if (zone)
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
zone = pagezone;
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
}
|
||||
VM_BUG_ON(PageLRU(page));
|
||||
SetPageLRU(page);
|
||||
VM_BUG_ON(PageActive(page));
|
||||
SetPageActive(page);
|
||||
add_page_to_active_list(zone, page);
|
||||
VM_BUG_ON(PageUnevictable(page));
|
||||
VM_BUG_ON(PageLRU(page));
|
||||
SetPageLRU(page);
|
||||
if (is_active_lru(lru))
|
||||
SetPageActive(page);
|
||||
add_page_to_lru_list(zone, page, lru);
|
||||
}
|
||||
if (zone)
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
@@ -434,6 +466,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
|
||||
pagevec_reinit(pvec);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(____pagevec_lru_add);
|
||||
|
||||
/*
|
||||
* Try to drop buffers from the pages in a pagevec
|
||||
*/
|
||||
@@ -452,6 +486,30 @@ void pagevec_strip(struct pagevec *pvec)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* pagevec_swap_free - try to free swap space from the pages in a pagevec
|
||||
* @pvec: pagevec with swapcache pages to free the swap space of
|
||||
*
|
||||
* The caller needs to hold an extra reference to each page and
|
||||
* not hold the page lock on the pages. This function uses a
|
||||
* trylock on the page lock so it may not always free the swap
|
||||
* space associated with a page.
|
||||
*/
|
||||
void pagevec_swap_free(struct pagevec *pvec)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
|
||||
if (PageSwapCache(page) && trylock_page(page)) {
|
||||
if (PageSwapCache(page))
|
||||
remove_exclusive_swap_page_ref(page);
|
||||
unlock_page(page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* pagevec_lookup - gang pagecache lookup
|
||||
* @pvec: Where the resulting pages are placed
|
||||
|
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
|
||||
};
|
||||
|
||||
static struct backing_dev_info swap_backing_dev_info = {
|
||||
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
|
||||
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
|
||||
.unplug_io_fn = swap_unplug_io_fn,
|
||||
};
|
||||
|
||||
@@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
|
||||
BUG_ON(!PageLocked(page));
|
||||
BUG_ON(PageSwapCache(page));
|
||||
BUG_ON(PagePrivate(page));
|
||||
BUG_ON(!PageSwapBacked(page));
|
||||
error = radix_tree_preload(gfp_mask);
|
||||
if (!error) {
|
||||
page_cache_get(page);
|
||||
@@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
* re-using the just freed swap entry for an existing page.
|
||||
* May fail (-ENOMEM) if radix-tree node allocation failed.
|
||||
*/
|
||||
set_page_locked(new_page);
|
||||
__set_page_locked(new_page);
|
||||
SetPageSwapBacked(new_page);
|
||||
err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
|
||||
if (likely(!err)) {
|
||||
/*
|
||||
* Initiate read into locked page and return.
|
||||
*/
|
||||
lru_cache_add_active(new_page);
|
||||
lru_cache_add_anon(new_page);
|
||||
swap_readpage(NULL, new_page);
|
||||
return new_page;
|
||||
}
|
||||
clear_page_locked(new_page);
|
||||
ClearPageSwapBacked(new_page);
|
||||
__clear_page_locked(new_page);
|
||||
swap_free(entry);
|
||||
} while (err != -ENOMEM);
|
||||
|
||||
|
@@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page)
|
||||
* Work out if there are any other processes sharing this
|
||||
* swap cache page. Free it if you can. Return success.
|
||||
*/
|
||||
int remove_exclusive_swap_page(struct page *page)
|
||||
static int remove_exclusive_swap_page_count(struct page *page, int count)
|
||||
{
|
||||
int retval;
|
||||
struct swap_info_struct * p;
|
||||
@@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
|
||||
return 0;
|
||||
if (PageWriteback(page))
|
||||
return 0;
|
||||
if (page_count(page) != 2) /* 2: us + cache */
|
||||
if (page_count(page) != count) /* us + cache + ptes */
|
||||
return 0;
|
||||
|
||||
entry.val = page_private(page);
|
||||
@@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page)
|
||||
if (p->swap_map[swp_offset(entry)] == 1) {
|
||||
/* Recheck the page count with the swapcache lock held.. */
|
||||
spin_lock_irq(&swapper_space.tree_lock);
|
||||
if ((page_count(page) == 2) && !PageWriteback(page)) {
|
||||
if ((page_count(page) == count) && !PageWriteback(page)) {
|
||||
__delete_from_swap_cache(page);
|
||||
SetPageDirty(page);
|
||||
retval = 1;
|
||||
@@ -387,6 +387,25 @@ int remove_exclusive_swap_page(struct page *page)
|
||||
return retval;
|
||||
}
|
||||
|

/*
* Most of the time the page should have two references: one for the
* process and one for the swap cache.
*/
int remove_exclusive_swap_page(struct page *page)
{
return remove_exclusive_swap_page_count(page, 2);
}

/*
* The pageout code holds an extra reference to the page. That raises
* the reference count to test for to 2 for a page that is only in the
* swap cache plus 1 for each process that maps the page.
*/
int remove_exclusive_swap_page_ref(struct page *page)
{
return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
}

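The split above makes the expected count explicit: a page held only by the caller and the swap cache has page_count() == 2, while the pageout path, which sees the page still mapped, must expect one more reference per mapping, i.e. 2 + page_mapcount(). A small sketch of those thresholds with invented reference counts:

#include <stdio.h>

/* Hypothetical snapshot of a page's reference state. */
struct page_refs {
    int count;      /* page_count(): all references       */
    int mapcount;   /* page_mapcount(): mapping ptes only */
};

/* Mirrors the gate in remove_exclusive_swap_page_count(): proceed only if
 * the observed page_count() matches the expected number of holders. */
static int exclusive_to_swapcache(struct page_refs p, int expected)
{
    return p.count == expected;
}

int main(void)
{
    /* Caller + swap cache only: the remove_exclusive_swap_page() case. */
    struct page_refs idle   = { .count = 2, .mapcount = 0 };
    /* Pageout path: its own ref + swap cache + one pte per mapping process. */
    struct page_refs mapped = { .count = 2 + 3, .mapcount = 3 };

    printf("idle page exclusive?   %d\n", exclusive_to_swapcache(idle, 2));
    printf("mapped page exclusive? %d\n",
           exclusive_to_swapcache(mapped, 2 + mapped.mapcount));
    return 0;
}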
/*
|
||||
* Free the swap entry like above, but also try to
|
||||
* free the page cache entry if it is the last user.
|
||||
@@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
|
||||
if (p) {
|
||||
if (swap_entry_free(p, swp_offset(entry)) == 1) {
|
||||
page = find_get_page(&swapper_space, entry.val);
|
||||
if (page && unlikely(!trylock_page(page))) {
|
||||
if (page && !trylock_page(page)) {
|
||||
page_cache_release(page);
|
||||
page = NULL;
|
||||
}
|
||||
|
@@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
|
||||
if (!dentry)
|
||||
goto put_memory;
|
||||
|
||||
error = -ENOSPC;
|
||||
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
|
||||
if (!inode)
|
||||
goto put_dentry;
|
||||
|
||||
d_instantiate(dentry, inode);
|
||||
error = -ENFILE;
|
||||
file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
|
||||
&ramfs_file_operations);
|
||||
file = get_empty_filp();
|
||||
if (!file)
|
||||
goto put_dentry;
|
||||
|
||||
inode->i_nlink = 0; /* It is unlinked */
|
||||
|
||||
/* notify everyone as to the change of file size */
|
||||
error = do_truncate(dentry, size, 0, file);
|
||||
if (error < 0)
|
||||
error = -ENOSPC;
|
||||
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
|
||||
if (!inode)
|
||||
goto close_file;
|
||||
|
||||
d_instantiate(dentry, inode);
|
||||
inode->i_size = size;
|
||||
inode->i_nlink = 0; /* It is unlinked */
|
||||
init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
|
||||
&ramfs_file_operations);
|
||||
|
||||
#ifndef CONFIG_MMU
|
||||
error = ramfs_nommu_expand_for_mapping(inode, size);
|
||||
if (error)
|
||||
goto close_file;
|
||||
#endif
|
||||
return file;
|
||||
|
||||
close_file:
|
||||
put_filp(file);
|
||||
return ERR_PTR(error);
|
||||
|
||||
put_dentry:
|
||||
dput(dentry);
|
||||
put_memory:
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(shmem_file_setup);
|
||||
|
||||
/**
|
||||
* shmem_zero_setup - setup a shared anonymous mapping
|
||||
|
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds
|
||||
*
|
||||
* 10Sep2002 akpm@zip.com.au
|
||||
* 10Sep2002 Andrew Morton
|
||||
* Initial version.
|
||||
*/
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/buffer_head.h> /* grr. try_to_release_page,
|
||||
do_invalidatepage */
|
||||
#include "internal.h"
|
||||
|
||||
|
||||
/**
|
||||
@@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
|
||||
|
||||
cancel_dirty_page(page, PAGE_CACHE_SIZE);
|
||||
|
||||
clear_page_mlock(page);
|
||||
remove_from_page_cache(page);
|
||||
ClearPageMappedToDisk(page);
|
||||
page_cache_release(page); /* pagecache ref */
|
||||
@@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
|
||||
if (PagePrivate(page) && !try_to_release_page(page, 0))
|
||||
return 0;
|
||||
|
||||
clear_page_mlock(page);
|
||||
ret = remove_mapping(mapping, page);
|
||||
|
||||
return ret;
|
||||
@@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
|
||||
if (PageDirty(page))
|
||||
goto failed;
|
||||
|
||||
clear_page_mlock(page);
|
||||
BUG_ON(PagePrivate(page));
|
||||
__remove_from_page_cache(page);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
mm/vmalloc.c (1030 lines changed; diff not shown, too large)
mm/vmscan.c (1034 lines changed; diff not shown, too large)
mm/vmstat.c (102 lines changed)
@@ -8,7 +8,7 @@
 * Copyright (C) 2006 Silicon Graphics, Inc.,
 * Christoph Lameter <christoph@lameter.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -384,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
#endif

#ifdef CONFIG_PROC_FS

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
@@ -581,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
        return 0;
}

const struct seq_operations fragmentation_op = {
static const struct seq_operations fragmentation_op = {
        .start = frag_start,
        .next = frag_next,
        .stop = frag_stop,
        .show = frag_show,
};

const struct seq_operations pagetypeinfo_op = {
static int fragmentation_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
        .open = fragmentation_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release,
};

static const struct seq_operations pagetypeinfo_op = {
        .start = frag_start,
        .next = frag_next,
        .stop = frag_stop,
        .show = pagetypeinfo_show,
};

static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_file_ops = {
        .open = pagetypeinfo_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
@@ -619,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = {
static const char * const vmstat_text[] = {
        /* Zoned VM counters */
        "nr_free_pages",
        "nr_inactive",
        "nr_active",
        "nr_inactive_anon",
        "nr_active_anon",
        "nr_inactive_file",
        "nr_active_file",
#ifdef CONFIG_UNEVICTABLE_LRU
        "nr_unevictable",
        "nr_mlock",
#endif
        "nr_anon_pages",
        "nr_mapped",
        "nr_file_pages",
@@ -675,6 +705,16 @@ static const char * const vmstat_text[] = {
        "htlb_buddy_alloc_success",
        "htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
        "unevictable_pgs_culled",
        "unevictable_pgs_scanned",
        "unevictable_pgs_rescued",
        "unevictable_pgs_mlocked",
        "unevictable_pgs_munlocked",
        "unevictable_pgs_cleared",
        "unevictable_pgs_stranded",
        "unevictable_pgs_mlockfreed",
#endif
#endif
};

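Together, the two vmstat_text hunks above replace the single nr_inactive/nr_active pair with per-type counters (anon and file) and, when CONFIG_UNEVICTABLE_LRU is set, export nr_unevictable, nr_mlock and the unevictable_pgs_* event counters through /proc/vmstat. As a small illustration of how the new fields surface to user space (this program is not part of the patch; only the counter names come from the table above, everything else is assumption), a minimal reader could look like this:

/* Illustrative user-space reader for the counters added above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/vmstat", "r");
        char name[64];
        unsigned long long value;

        if (!f) {
                perror("/proc/vmstat");
                return 1;
        }

        /* Each /proc/vmstat line is "<counter_name> <value>". */
        while (fscanf(f, "%63s %llu", name, &value) == 2) {
                if (!strcmp(name, "nr_unevictable") ||
                    !strcmp(name, "nr_mlock") ||
                    !strncmp(name, "unevictable_pgs_", 16))
                        printf("%s = %llu\n", name, value);
        }

        fclose(f);
        return 0;
}

Built with any C compiler, it prints only the counters introduced by this series, and nothing on a kernel built without CONFIG_UNEVICTABLE_LRU.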
@@ -688,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   "\n min %lu"
                   "\n low %lu"
                   "\n high %lu"
                   "\n scanned %lu (a: %lu i: %lu)"
                   "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
                   "\n spanned %lu"
                   "\n present %lu",
                   zone_page_state(zone, NR_FREE_PAGES),
@@ -696,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   zone->pages_low,
                   zone->pages_high,
                   zone->pages_scanned,
                   zone->nr_scan_active, zone->nr_scan_inactive,
                   zone->lru[LRU_ACTIVE_ANON].nr_scan,
                   zone->lru[LRU_INACTIVE_ANON].nr_scan,
                   zone->lru[LRU_ACTIVE_FILE].nr_scan,
                   zone->lru[LRU_INACTIVE_FILE].nr_scan,
                   zone->spanned_pages,
                   zone->present_pages);

@@ -733,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
        seq_printf(m,
                   "\n all_unreclaimable: %u"
                   "\n prev_priority: %i"
                   "\n start_pfn: %lu",
                   "\n start_pfn: %lu"
                   "\n inactive_ratio: %u",
                   zone_is_all_unreclaimable(zone),
                   zone->prev_priority,
                   zone->zone_start_pfn);
                   zone->zone_start_pfn,
                   zone->inactive_ratio);
        seq_putc(m, '\n');
}

@@ -750,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
        return 0;
}

const struct seq_operations zoneinfo_op = {
static const struct seq_operations zoneinfo_op = {
        .start = frag_start, /* iterate over all zones. The same as in
                              * fragmentation. */
        .next = frag_next,
@@ -758,6 +803,18 @@ const struct seq_operations zoneinfo_op = {
        .show = zoneinfo_show,
};

static int zoneinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &zoneinfo_op);
}

static const struct file_operations proc_zoneinfo_file_operations = {
        .open = zoneinfo_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
        unsigned long *v;
@@ -813,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg)
        m->private = NULL;
}

const struct seq_operations vmstat_op = {
static const struct seq_operations vmstat_op = {
        .start = vmstat_start,
        .next = vmstat_next,
        .stop = vmstat_stop,
        .show = vmstat_show,
};

static int vmstat_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &vmstat_op);
}

static const struct file_operations proc_vmstat_file_operations = {
        .open = vmstat_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
@@ -877,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,

static struct notifier_block __cpuinitdata vmstat_notifier =
        { &vmstat_cpuup_callback, NULL, 0 };
#endif

static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
        int cpu;

        refresh_zone_stat_thresholds();
@@ -887,7 +957,13 @@ static int __init setup_vmstat(void)

        for_each_online_cpu(cpu)
                start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
        proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
        proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
        proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
        proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
        return 0;
}
module_init(setup_vmstat)
#endif
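Throughout the vmstat.c diff the pattern is the same: each seq_operations table loses its external linkage, gains a small open() wrapper plus a file_operations struct, and setup_vmstat() now registers the four /proc files itself via proc_create(). For readers unfamiliar with that idiom, below is a minimal, self-contained sketch of it as an out-of-tree module written against the 2.6-era interfaces used here (newer kernels register /proc files with struct proc_ops instead); the names (seqfile_demo, demo_*) and the item list are invented for illustration and are not taken from the patch.

/* Minimal seq_file + proc_create() example; illustrative only. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *demo_items[] = { "alpha", "beta", "gamma" };

/* start/next hand back a pointer into demo_items[]; stop is a no-op. */
static void *demo_start(struct seq_file *m, loff_t *pos)
{
        return *pos < ARRAY_SIZE(demo_items) ? (void *)&demo_items[*pos] : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return *pos < ARRAY_SIZE(demo_items) ? (void *)&demo_items[*pos] : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", *(const char **)v);
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start = demo_start,
        .next = demo_next,
        .stop = demo_stop,
        .show = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_proc_fops = {
        .owner = THIS_MODULE,
        .open = demo_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release,
};

static int __init demo_init(void)
{
        proc_create("seqfile_demo", S_IRUGO, NULL, &demo_proc_fops);
        return 0;
}

static void __exit demo_exit(void)
{
        remove_proc_entry("seqfile_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

After loading the module, `cat /proc/seqfile_demo` would print the three items one per line, which is exactly the flow buddyinfo, pagetypeinfo, vmstat and zoneinfo now use after this change.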