Merge branch 'akpm'

* akpm: (182 commits)
  fbdev: bf54x-lq043fb: use kzalloc over kmalloc/memset
  fbdev: *bfin*: fix __dev{init,exit} markings
  fbdev: *bfin*: drop unnecessary calls to memset
  fbdev: bfin-t350mcqb-fb: drop unused local variables
  fbdev: blackfin has __raw I/O accessors, so use them in fb.h
  fbdev: s1d13xxxfb: add accelerated bitblt functions
  tcx: use standard fields for framebuffer physical address and length
  fbdev: add support for handoff from firmware to hw framebuffers
  intelfb: fix a bug when changing video timing
  fbdev: use framebuffer_release() for freeing fb_info structures
  radeon: P2G2CLK_ALWAYS_ONb tested twice, should 2nd be P2G2CLK_DAC_ALWAYS_ONb?
  s3c-fb: CPUFREQ frequency scaling support
  s3c-fb: fix resource releasing on error during probing
  carminefb: fix possible access beyond end of carmine_modedb[]
  acornfb: remove fb_mmap function
  mb862xxfb: use CONFIG_OF instead of CONFIG_PPC_OF
  mb862xxfb: restrict compliation of platform driver to PPC
  Samsung SoC Framebuffer driver: add Alpha Channel support
  atmel-lcdc: fix pixclock upper bound detection
  offb: use framebuffer_alloc() to allocate fb_info struct
  ...

Manually fix up conflicts due to kmemcheck in mm/slab.c
这个提交包含在:
Linus Torvalds
2009-06-16 19:50:13 -07:00
当前提交 517d08699b
修改 247 个文件,包含 5676 行新增2693 行删除

查看文件

@@ -203,25 +203,13 @@ config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
config UNEVICTABLE_LRU
bool "Add LRU list to track non-evictable pages"
default y
help
Keeps unevictable pages off of the active and inactive pageout
lists, so kswapd will not waste CPU time or have its balancing
algorithms thrown off by scanning these pages. Selecting this
will use one page flag and increase the code size a little,
say Y unless you know what you are doing.
See Documentation/vm/unevictable-lru.txt for more information.
config HAVE_MLOCK
bool
default y if MMU=y
config HAVE_MLOCKED_PAGE_BIT
bool
default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
default y if HAVE_MLOCK=y
config MMU_NOTIFIER
bool

查看文件

@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
obj-y += init-mm.o
obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o

查看文件

@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
ret = force_page_cache_readahead(mapping, file,
start_index,
max_sane_readahead(nrpages));
nrpages);
if (ret > 0)
ret = 0;
break;

查看文件

@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
{
if (cpuset_do_page_mem_spread()) {
int n = cpuset_mem_spread_node();
return alloc_pages_node(n, gfp, 0);
return alloc_pages_exact_node(n, gfp, 0);
}
return alloc_pages(gfp, 0);
}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
{
if (!ra->ra_pages)
return;
ra->ra_pages /= 4;
}
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
return -EINVAL;
force_page_cache_readahead(mapping, filp, index,
max_sane_readahead(nr));
force_page_cache_readahead(mapping, filp, index, nr);
return 0;
}
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
#define MMAP_LOTSAMISS (100)
/*
 * Synchronous readahead: used when the faulting page was not found in
 * the page cache at all.
 *
 * @vma:    mapping that took the fault; its read hints steer the decision
 * @ra:     readahead state of @file (mmap_miss, start, size, async_size
 *          may be updated)
 * @file:   file backing the mapping
 * @offset: page offset of the fault within the file
 */
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
				   struct file_ra_state *ra,
				   struct file *file,
				   pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;
	unsigned long window;
	int sequential;

	/* A random-access hint means no read-ahead of any kind. */
	if (VM_RandomReadHint(vma))
		return;

	/*
	 * Either the vma carries the sequential hint, or this fault is for
	 * the page right after the one recorded in prev_pos: hand over to
	 * the generic sync readahead.
	 */
	sequential = VM_SequentialReadHint(vma) ||
		     offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT);
	if (sequential) {
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return;
	}

	/* Count the miss, saturating at INT_MAX. */
	if (ra->mmap_miss < INT_MAX)
		ra->mmap_miss++;

	/*
	 * Far more misses than hits in this file?  Then read-ahead would
	 * only hurt, so give up on it.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		return;

	/*
	 * mmap read-around: a window of 'window' pages that starts half a
	 * window before the faulting offset (clamped at 0).
	 */
	window = max_sane_readahead(ra->ra_pages);
	if (!window)
		return;

	ra->start = max_t(long, 0, offset - window / 2);
	ra->size = window;
	ra->async_size = 0;
	ra_submit(ra, mapping, file);
}
/*
 * Asynchronous readahead: used when the page was found in the page cache.
 * If that page carries PG_readahead, the readahead window may be extended.
 *
 * @vma:    mapping that took the fault
 * @ra:     readahead state of @file (mmap_miss is decremented on a hit)
 * @file:   file backing the mapping
 * @page:   the page that was found
 * @offset: page offset of the fault within the file
 */
static void do_async_mmap_readahead(struct vm_area_struct *vma,
				    struct file_ra_state *ra,
				    struct file *file,
				    struct page *page,
				    pgoff_t offset)
{
	struct address_space *mapping;

	/* A random-access hint means no read-ahead of any kind. */
	if (VM_RandomReadHint(vma))
		return;

	/* This fault was a hit: take one back off the miss counter. */
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;

	if (!PageReadahead(page))
		return;

	mapping = file->f_mapping;
	page_cache_async_readahead(mapping, ra, file, page, offset,
				   ra->ra_pages);
}
/**
* filemap_fault - read in file data for page fault handling
* @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
pgoff_t size;
int did_readaround = 0;
int ret = 0;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
if (offset >= size)
return VM_FAULT_SIGBUS;
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
goto no_cached_page;
/*
* Do we have something in the page cache already?
*/
retry_find:
page = find_lock_page(mapping, vmf->pgoff);
/*
* For sequential accesses, we use the generic readahead logic.
*/
if (VM_SequentialReadHint(vma)) {
if (!page) {
page_cache_sync_readahead(mapping, ra, file,
vmf->pgoff, 1);
page = find_lock_page(mapping, vmf->pgoff);
if (!page)
goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping, ra, file, page,
vmf->pgoff, 1);
}
}
if (!page) {
unsigned long ra_pages;
ra->mmap_miss++;
page = find_get_page(mapping, offset);
if (likely(page)) {
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
* We found the page, so try async readahead before
* waiting for the lock.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
do_async_mmap_readahead(vma, ra, file, page, offset);
lock_page(page);
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto no_cached_page;
/*
* To keep the pgmajfault counter straight, we need to
* check did_readaround, as this is an inner loop.
*/
if (!did_readaround) {
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
did_readaround = 1;
ra_pages = max_sane_readahead(file->f_ra.ra_pages);
if (ra_pages) {
pgoff_t start = 0;
if (vmf->pgoff > ra_pages / 2)
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
page = find_lock_page(mapping, vmf->pgoff);
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_lock_page(mapping, offset);
if (!page)
goto no_cached_page;
}
if (!did_readaround)
ra->mmap_miss--;
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
/* Must recheck i_size under page lock */
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(vmf->pgoff >= size)) {
if (unlikely(offset >= size)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
}
/*
* Found the page and have a reference on it.
*/
ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1575,7 +1604,7 @@ no_cached_page:
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
error = page_cache_read(file, vmf->pgoff);
error = page_cache_read(file, offset);
/*
* The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
return VM_FAULT_SIGBUS;
page_not_uptodate:
/* IO error path */
if (!did_readaround) {
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,

查看文件

@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
hugetlb_put_quota(mapping, 1);
}
/*
* Increment or decrement surplus_huge_pages. Keep node-specific counters
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*
* @h: hstate whose global and per-node surplus counters are adjusted
* @delta: must be +1 or -1 (enforced by the VM_BUG_ON below)
*
* Returns 0 when a full pass over the online nodes found no node that
* could take the adjustment.
*/
static int adjust_pool_surplus(struct hstate *h, int delta)
{
/* static: the node where the previous call stopped persists across calls */
static int prev_nid;
int nid = prev_nid;
int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
/*
* Scan starts at the node after prev_nid. Note that "continue" in a
* do-while jumps to the loop condition, so a rejected node still ends
* the scan once nid has come back round to prev_nid.
*/
do {
nid = next_node(nid, node_online_map);
/* Ran off the end of the online map: wrap to the first online node */
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
/* To shrink on this node, there must be a surplus page */
if (delta < 0 && !h->surplus_huge_pages_node[nid])
continue;
/* Surplus cannot exceed the total number of pages */
if (delta > 0 && h->surplus_huge_pages_node[nid] >=
h->nr_huge_pages_node[nid])
continue;
/* Suitable node: apply delta to the global and the per-node counter */
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
} while (nid != prev_nid);
/* Remember where we stopped so the next call starts from the node after */
prev_nid = nid;
return ret;
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
}
/*
 * Set up @page as the head of a compound page of 2^@order pages and mark
 * each of the following pages as a tail that points back to the head.
 */
static void prep_compound_gigantic_page(struct page *page, unsigned long order)
{
	int nr_pages = 1 << order;
	struct page *tail = page + 1;
	int idx = 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__SetPageHead(page);

	while (idx < nr_pages) {
		__SetPageTail(tail);
		tail->first_page = page;

		/* bump the index first: mem_map_next() takes the new index */
		idx++;
		tail = mem_map_next(tail, page, idx);
	}
}
/*
 * Report whether @page belongs to a compound page whose destructor is
 * free_huge_page.  Returns non-zero if so, 0 otherwise (including for
 * any page that is not compound).
 */
int PageHuge(struct page *page)
{
	struct page *head;

	if (!PageCompound(page))
		return 0;

	/* The destructor is looked up on the head page of the compound. */
	head = compound_head(page);
	return get_compound_page_dtor(head) == free_huge_page;
}
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
if (h->order >= MAX_ORDER)
return NULL;
page = alloc_pages_node(nid,
page = alloc_pages_exact_node(nid,
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
* Use a helper variable to find the next node and then
* copy it back to hugetlb_next_nid afterwards:
* otherwise there's a window in which a racer might
* pass invalid nid MAX_NUMNODES to alloc_pages_node.
* pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
* But we don't need to use a spin_lock here: it really
* doesn't matter if occasionally a racer chooses the
* same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* can no longer free unreserved surplus pages. This occurs when
* the nodes with surplus pages have no free pages.
*/
unsigned long remaining_iterations = num_online_nodes();
unsigned long remaining_iterations = nr_online_nodes;
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
nr_pages--;
remaining_iterations = num_online_nodes();
remaining_iterations = nr_online_nodes;
}
}
}
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
}
#endif
/*
* Increment or decrement surplus_huge_pages. Keep node-specific counters
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*
* @h: hstate whose global and per-node surplus counters are adjusted
* @delta: must be +1 or -1 (enforced by the VM_BUG_ON below)
*
* Returns 0 when a full pass over the online nodes found no node that
* could take the adjustment.
*/
static int adjust_pool_surplus(struct hstate *h, int delta)
{
/* static: the node where the previous call stopped persists across calls */
static int prev_nid;
int nid = prev_nid;
int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
/*
* Scan starts at the node after prev_nid. Note that "continue" in a
* do-while jumps to the loop condition, so a rejected node still ends
* the scan once nid has come back round to prev_nid.
*/
do {
nid = next_node(nid, node_online_map);
/* Ran off the end of the online map: wrap to the first online node */
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
/* To shrink on this node, there must be a surplus page */
if (delta < 0 && !h->surplus_huge_pages_node[nid])
continue;
/* Surplus cannot exceed the total number of pages */
if (delta > 0 && h->surplus_huge_pages_node[nid] >=
h->nr_huge_pages_node[nid])
continue;
/* Suitable node: apply delta to the global and the per-node counter */
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
} while (nid != prev_nid);
/* Remember where we stopped so the next call starts from the node after */
prev_nid = nid;
return ret;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
{

20
mm/init-mm.c 普通文件
查看文件

@@ -0,0 +1,20 @@
#include <linux/mm_types.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/cpumask.h>
#include <asm/atomic.h>
#include <asm/pgtable.h>
/*
* init_mm: a statically initialised mm_struct whose page tables are rooted
* at swapper_pg_dir. Members not named here are zero-initialised by the
* designated-initializer rules of C.
*/
struct mm_struct init_mm = {
/* empty rbtree root */
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
/* reference counters start at 2 users / 1 count */
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
/* locks are initialised statically, unlocked */
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
/* list head initialised against itself */
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
/* every CPU is set in the mask */
.cpu_vm_mask = CPU_MASK_ALL,
};

查看文件

@@ -16,9 +16,6 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
extern void prep_compound_page(struct page *page, unsigned long order);
extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
static inline void set_page_count(struct page *page, int v)
{
atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
*/
extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
/*
* function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* unevictable_migrate_page() called only from migrate_page_copy() to
* migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
if (TestClearPageUnevictable(old))
SetPageUnevictable(new);
}
#else
static inline void unevictable_migrate_page(struct page *new, struct page *old)
{
}
#endif
#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
/*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
/*
 * free_page_mlock() -- handle a page that is being freed while it is
 * still marked mlocked.
 * Page should not be on lru, so no need to fix that up.
 * free_pages_check() will verify...
 */
static inline void free_page_mlock(struct page *page)
{
	unsigned long flags;

	/* Expected case: the Mlocked flag was clear, nothing to undo. */
	if (likely(!TestClearPageMlocked(page)))
		return;

	/* The flag was set (and is now cleared): fix up the statistics. */
	local_irq_save(flags);
	__dec_zone_page_state(page, NR_MLOCK);
	__count_vm_event(UNEVICTABLE_MLOCKFREED);
	local_irq_restore(flags);
}
#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
static inline void free_page_mlock(struct page *page) { }
#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas);
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
#define ZONE_RECLAIM_SUCCESS 1
#endif

查看文件

@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
end = vma->vm_end;
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
force_page_cache_readahead(file->f_mapping,
file, start, max_sane_readahead(end - start));
force_page_cache_readahead(file->f_mapping, file, start, end - start);
return 0;
}
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
break;
default:
error = -EINVAL;
BUG();
break;
}
return error;
}
/*
 * Check an madvise() advice value.
 *
 * Returns 1 when @behavior is one of MADV_DOFORK, MADV_DONTFORK,
 * MADV_NORMAL, MADV_SEQUENTIAL, MADV_RANDOM, MADV_REMOVE, MADV_WILLNEED
 * or MADV_DONTNEED, and 0 for any other value.
 */
static int
madvise_behavior_valid(int behavior)
{
	return behavior == MADV_DOFORK ||
	       behavior == MADV_DONTFORK ||
	       behavior == MADV_NORMAL ||
	       behavior == MADV_SEQUENTIAL ||
	       behavior == MADV_RANDOM ||
	       behavior == MADV_REMOVE ||
	       behavior == MADV_WILLNEED ||
	       behavior == MADV_DONTNEED;
}
/*
* The madvise(2) system call.
*
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
int write;
size_t len;
if (!madvise_behavior_valid(behavior))
return error;
write = madvise_need_mmap_write(behavior);
if (write)
down_write(&current->mm->mmap_sem);

查看文件

@@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
return 0;
}
/*
 * Compare the file LRU sizes of @memcg.
 *
 * Returns 1 when the active file list holds more than the inactive file
 * list (as reported by mem_cgroup_get_local_zonestat), 0 otherwise.
 */
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long nr_inactive =
		mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
	unsigned long nr_active =
		mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);

	return nr_inactive < nr_active;
}
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)

查看文件

@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i;
}
/**
* get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @len: number of pages from start to pin
* @write: whether pages will be written to by the caller
* @force: whether to force write access even if user mapping is
* readonly. This will result in the page being COWed even
* in MAP_SHARED mappings. You do not want this.
* @pages: array that receives pointers to the pages pinned.
* Should be at least @len entries long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If len is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If write=0, the page must not be written to. If the page is written to,
* set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
* after the page is finished with, and before put_page is called.
*
* get_user_pages is typically used for fewer-copy IO operations, to get a
* handle on the memory by some means other than accesses via the user virtual
* addresses. The pages may be submitted for DMA to devices or accessed via
* their kernel linear mapping (via the kmap APIs). Care should be taken to
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
static int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
resource_size_t phys_addr = 0;
struct mm_struct *mm = vma->vm_mm;
int ret = -EINVAL;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
if (pmd_huge(*pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!ptep)
goto out;
pte = *ptep;
if (!pte_present(pte))
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
out:
return -EINVAL;
}
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only mappings flagged VM_IO or VM_PFNMAP are accepted; any other vma
 * is rejected with -EINVAL before the page tables are walked.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	spinlock_t *ptl;
	pte_t *ptep;
	int err;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return -EINVAL;

	/* On success follow_pte() hands back the pte mapped and locked. */
	err = follow_pte(vma->vm_mm, address, &ptep, &ptl);
	if (err)
		return err;

	*pfn = pte_pfn(*ptep);
	pte_unmap_unlock(ptep, ptl);

	return 0;
}
EXPORT_SYMBOL(follow_pfn);
#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
{
int ret = -EINVAL;
pte_t *ptep, pte;
spinlock_t *ptl;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
pte = *ptep;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
phys_addr = pte_pfn(pte);
phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
*prot = pgprot_val(pte_pgprot(pte));
*phys = phys_addr;
ret = 0;
*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
ret = 0;
unlock:
pte_unmap_unlock(ptep, ptl);
out:

查看文件

@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
setup_per_zone_pages_min();
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
if (onlined_pages) {
kswapd_run(zone_to_nid(zone));
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
totalram_pages -= offlined_pages;
num_physpages -= offlined_pages;
setup_per_zone_wmarks();
calculate_zone_inactive_ratio(zone);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();

查看文件

@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
return 0;
}
/* Create a new policy */
/*
 * mpol_set_nodemask - second stage of policy creation, run after mpol_new().
 *
 * mpol_new() has already validated @nodes against the policy mode and
 * flags.  What remains is to work out the nodemask to hand to the
 * mode's create() operation, with one special case: MPOL_PREFERRED with
 * an empty nodemask means explicit local allocation and passes NULL.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 *
 * Returns 0 for a NULL @pol, otherwise whatever create() returns.
 */
static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t cpuset_context_nmask;
	const nodemask_t *ctx_nodes;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (!pol)
		return 0;

	VM_BUG_ON(!nodes);

	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) {
		/* explicit local allocation */
		ctx_nodes = NULL;
	} else {
		/* Map or restrict the user's mask to the current cpuset. */
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
					       &cpuset_current_mems_allowed);
		else
			nodes_and(cpuset_context_nmask, *nodes,
				  cpuset_current_mems_allowed);

		/* Record either the user's mask or the cpuset's mask. */
		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
				cpuset_current_mems_allowed;

		ctx_nodes = &cpuset_context_nmask;
	}

	return mpol_ops[pol->mode].create(pol, ctx_nodes);
}
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
*/
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
nodemask_t cpuset_context_nmask;
int ret;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
nodes = NULL; /* flag local alloc */
}
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy->mode = mode;
policy->flags = flags;
if (nodes) {
/*
* cpuset related setup doesn't apply to local allocation
*/
cpuset_update_task_memory_state();
if (flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&cpuset_context_nmask, nodes,
&cpuset_current_mems_allowed);
else
nodes_and(cpuset_context_nmask, *nodes,
cpuset_current_mems_allowed);
if (mpol_store_user_nodemask(policy))
policy->w.user_nodemask = *nodes;
else
policy->w.cpuset_mems_allowed =
cpuset_mems_allowed(current);
}
ret = mpol_ops[mode].create(policy,
nodes ? &cpuset_context_nmask : NULL);
if (ret < 0) {
kmem_cache_free(policy_cache, policy);
return ERR_PTR(ret);
}
return policy;
}
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
/*
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
*
* Called with task's alloc_lock held.
*/
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new;
struct mempolicy *new, *old;
struct mm_struct *mm = current->mm;
int ret;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
*/
if (mm)
down_write(&mm->mmap_sem);
mpol_put(current->mempolicy);
task_lock(current);
ret = mpol_set_nodemask(new, nodes);
if (ret) {
task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(new);
return ret;
}
old = current->mempolicy;
current->mempolicy = new;
mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(old);
return 0;
}
/*
* Return nodemask for policy for get_mempolicy() query
*
* Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
cpuset_update_task_memory_state();
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
err = 0;
if (nmask)
if (nmask) {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
out:
mpol_cond_put(pol);
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
return err;
}
down_write(&mm->mmap_sem);
task_lock(current);
err = mpol_set_nodemask(new, nmask);
task_unlock(current);
if (err) {
up_write(&mm->mmap_sem);
mpol_put(new);
return err;
}
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
cpuset_update_task_memory_state();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
if ((gfp & __GFP_WAIT) && !in_interrupt())
cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
@@ -1854,6 +1894,8 @@ restart:
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
spin_lock_init(&sp->lock);
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
mpol_put(mpol); /* drop our ref on sb mpol */
if (IS_ERR(new))
if (IS_ERR(new)) {
mpol_put(mpol); /* drop our ref on sb mpol */
return; /* no valid nodemask intersection */
}
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
task_unlock(current);
mpol_put(mpol); /* drop our ref on sb mpol */
if (ret) {
mpol_put(new);
return;
}
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
err = 1;
else if (no_context)
new->w.user_nodemask = nodes; /* save for contextualization */
else {
int ret;
task_lock(current);
ret = mpol_set_nodemask(new, &nodes);
task_unlock(current);
if (ret)
err = 1;
else if (no_context) {
/* save for contextualization */
new->w.user_nodemask = nodes;
}
}
out:
/* Restore string for error message */

查看文件

@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
return alloc_pages_node(pm->node,
return alloc_pages_exact_node(pm->node,
GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
struct page_to_node *pp;
LIST_HEAD(pagelist);
migrate_prep();
down_read(&mm->mmap_sem);
/*
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
if (!pm)
goto out;
migrate_prep();
/*
* Store a chunk of page_to_node array in a page,
* but keep the last one as a marker

查看文件

@@ -31,7 +31,6 @@ int can_do_mlock(void)
}
EXPORT_SYMBOL(can_do_mlock);
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* Mlocked pages are marked with PageMlocked() flag for efficient testing
* in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval)
return retval;
}
#else /* CONFIG_UNEVICTABLE_LRU */
/*
* Just make pages present if VM_LOCKED. No-op if unlocking.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int mlock)
{
if (mlock && (vma->vm_flags & VM_LOCKED))
return make_pages_present(start, end);
return 0;
}
static inline int __mlock_posix_error_return(long retval)
{
return 0;
}
#endif /* CONFIG_UNEVICTABLE_LRU */
/**
* mlock_vma_pages_range() - mlock pages in specified vma range.
* @vma - the vma containing the specified address range

查看文件

@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
unsigned long points, cpu_time, run_time;
struct mm_struct *mm;
struct task_struct *child;
int oom_adj;
task_lock(p);
mm = p->mm;
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
task_unlock(p);
return 0;
}
oom_adj = mm->oom_adj;
if (oom_adj == OOM_DISABLE) {
task_unlock(p);
return 0;
}
/*
* The memory size of the process is the basis for the badness.
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
points /= 8;
/*
* Adjust the score by oomkilladj.
* Adjust the score by oom_adj.
*/
if (p->oomkilladj) {
if (p->oomkilladj > 0) {
if (oom_adj) {
if (oom_adj > 0) {
if (!points)
points = 1;
points <<= p->oomkilladj;
points <<= oom_adj;
} else
points >>= -(p->oomkilladj);
points >>= -(oom_adj);
}
#ifdef DEBUG
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*ppoints = ULONG_MAX;
}
if (p->oomkilladj == OOM_DISABLE)
continue;
points = badness(p, uptime.tv_sec);
if (points > *ppoints || !chosen) {
if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
p->comm);
get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
task_unlock(p);
} while_each_thread(g, p);
}
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
return;
}
if (!p->mm) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill an mm-less task!\n");
if (!p->mm)
return;
}
if (verbose)
printk(KERN_ERR "Killed process %d (%s)\n",
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p)
struct mm_struct *mm;
struct task_struct *g, *q;
task_lock(p);
mm = p->mm;
/* WARNING: mm may not be dereferenced since we did not obtain its
* value from get_task_mm(p). This is OK since all we need to do is
* compare mm to q->mm below.
*
* Furthermore, even if mm contains a non-NULL value, p->mm may
* change to NULL at any time since we do not hold task_lock(p).
* However, this is of no concern to us.
*/
if (mm == NULL)
if (!mm || mm->oom_adj == OOM_DISABLE) {
task_unlock(p);
return 1;
/*
* Don't kill the process if any threads are set to OOM_DISABLE
*/
do_each_thread(g, q) {
if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
return 1;
} while_each_thread(g, q);
}
task_unlock(p);
__oom_kill_task(p, 1);
/*
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
struct task_struct *c;
if (printk_ratelimit()) {
printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
current->comm, gfp_mask, order, current->oomkilladj);
task_lock(current);
printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
current->comm, gfp_mask, order,
current->mm ? current->mm->oom_adj : OOM_DISABLE);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
* if its mm is still attached.
*/
if (p->flags & PF_EXITING) {
if (p->mm && (p->flags & PF_EXITING)) {
__oom_kill_task(p, 0);
return 0;
}

查看文件

@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
* This avoids exceeding the total dirty_limit when the floating averages
* fluctuate too quickly.
*/
static void
clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
unsigned long dirty, unsigned long *pbdi_dirty)
{
long avail_dirty;
unsigned long avail_dirty;
avail_dirty = dirty -
(global_page_state(NR_FILE_DIRTY) +
avail_dirty = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_WRITEBACK) +
global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK_TEMP));
global_page_state(NR_WRITEBACK_TEMP);
if (avail_dirty < 0)
if (avail_dirty < dirty)
avail_dirty = dirty - avail_dirty;
else
avail_dirty = 0;
avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
*
* dirty -= (dirty/8) * p_{t}
*/
static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
{
long numerator, denominator;
long dirty = *pdirty;
unsigned long dirty = *pdirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);

文件差异内容过多而无法显示 加载差异

查看文件

@@ -120,7 +120,7 @@ out:
return ret;
}
int swap_readpage(struct file *file, struct page *page)
int swap_readpage(struct page *page)
{
struct bio *bio;
int ret = 0;

查看文件

@@ -133,15 +133,12 @@ out:
}
/*
* do_page_cache_readahead actually reads a chunk of disk. It allocates all
* __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*
* do_page_cache_readahead() returns -1 if it encountered request queue
* congestion.
*/
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
nr_to_read = max_sane_readahead(nr_to_read);
while (nr_to_read) {
int err;
@@ -230,22 +228,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
return ret;
}
/*
* This version skips the IO if the queue is read-congested, and will tell the
* block layer to abandon the readahead if request allocation would block.
*
* force_page_cache_readahead() will ignore queue congestion and will block on
* request queues.
*/
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
if (bdi_read_congested(mapping->backing_dev_info))
return -1;
return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
}
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
/*
* Submit IO for the read-ahead request in file_ra_state.
*/
static unsigned long ra_submit(struct file_ra_state *ra,
unsigned long ra_submit(struct file_ra_state *ra,
struct address_space *mapping, struct file *filp)
{
int actual;
@@ -347,6 +329,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
* it approaches max_readhead.
*/
/*
 * Count the contiguously cached pages lying between @offset-1 and
 * @offset-@max.  The result is a conservative estimate of either
 *   - the length of the sequential read sequence, or
 *   - the thrashing threshold on memory tight systems.
 *
 * @ra is accepted but not consulted here.
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   struct file_ra_state *ra,
				   pgoff_t offset, unsigned long max)
{
	pgoff_t last = offset - 1;	/* page just before the one being read */
	pgoff_t hole;

	/* the backward hole search runs inside an RCU read-side section */
	rcu_read_lock();
	hole = radix_tree_prev_hole(&mapping->page_tree, last, max);
	rcu_read_unlock();

	/* distance from the hole found up to the last history page */
	return last - hole;
}
/*
 * Page cache context based read-ahead.
 *
 * Sizes a readahead window starting at @offset from the number of cached
 * history pages counted just before it.  Returns 1 after filling in @ra,
 * or 0 (leaving @ra untouched) when no history pages were found.
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t offset,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t history = count_history_pages(mapping, ra, offset, max);

	/* no history pages: it could be a random read */
	if (history == 0)
		return 0;

	/*
	 * History that starts from the beginning of the file is a strong
	 * indication of a long-run stream (or whole-file-read), so it is
	 * counted twice.
	 */
	if (history >= offset)
		history += history;

	ra->start = offset;
	ra->size = get_init_ra_size(history + req_size, max);
	/* the whole window is marked as the async part */
	ra->async_size = ra->size;

	return 1;
}
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
int max = ra->ra_pages; /* max readahead pages */
pgoff_t prev_offset;
int sequential;
unsigned long max = max_sane_readahead(ra->ra_pages);
/*
* start of file
*/
if (!offset)
goto initial_readahead;
/*
* It's the expected callback offset, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max);
ra->async_size = ra->size;
goto readit;
}
prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
sequential = offset - prev_offset <= 1UL || req_size > max;
/*
* Standalone, small read.
* Read as is, and do not pollute the readahead state.
*/
if (!hit_readahead_marker && !sequential) {
return __do_page_cache_readahead(mapping, filp,
offset, req_size, 0);
}
/*
* Hit a marked page without valid readahead state.
* E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
rcu_read_unlock();
if (!start || start - offset > max)
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max);
ra->async_size = ra->size;
goto readit;
}
/*
* It may be one of
* - first read on start of file
* - sequential cache miss
* - oversize random read
* Start readahead for it.
* oversize read
*/
if (req_size > max)
goto initial_readahead;
/*
* sequential cache miss
*/
if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(mapping, ra, offset, req_size, max))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
initial_readahead:
ra->start = offset;
ra->size = get_init_ra_size(req_size, max);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
/*
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
*/
if (offset == ra->start && ra->size == ra->async_size) {
ra->async_size = get_next_ra_size(ra, max);
ra->size += ra->async_size;
}
return ra_submit(ra, mapping, filp);
}

查看文件

@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
struct vm_area_struct *vma, unsigned int *mapcount)
struct vm_area_struct *vma,
unsigned int *mapcount,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -381,11 +383,14 @@ out_unmap:
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
out:
if (referenced)
*vm_flags |= vma->vm_flags;
return referenced;
}
static int page_referenced_anon(struct page *page,
struct mem_cgroup *mem_cont)
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma, &mapcount);
referenced += page_referenced_one(page, vma,
&mapcount, vm_flags);
if (!mapcount)
break;
}
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page,
* page_referenced_file - referenced check for object-based rmap
* @page: the page we're checking references on.
* @mem_cont: target memory controller
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* For an object-based mapped page, find all the places it is mapped and
* check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page,
* This function is only called from page_referenced for object-based pages.
*/
static int page_referenced_file(struct page *page,
struct mem_cgroup *mem_cont)
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page,
*/
if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma, &mapcount);
referenced += page_referenced_one(page, vma,
&mapcount, vm_flags);
if (!mapcount)
break;
}
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page,
* @page: the page to test
* @is_locked: caller holds lock on the page
* @mem_cont: target memory controller
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *mem_cont)
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *mem_cont,
unsigned long *vm_flags)
{
int referenced = 0;
if (TestClearPageReferenced(page))
referenced++;
*vm_flags = 0;
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
referenced += page_referenced_anon(page, mem_cont);
referenced += page_referenced_anon(page, mem_cont,
vm_flags);
else if (is_locked)
referenced += page_referenced_file(page, mem_cont);
referenced += page_referenced_file(page, mem_cont,
vm_flags);
else if (!trylock_page(page))
referenced++;
else {
if (page->mapping)
referenced +=
page_referenced_file(page, mem_cont);
referenced += page_referenced_file(page,
mem_cont, vm_flags);
unlock_page(page);
}
}
@@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration)
return ret;
}
#ifdef CONFIG_UNEVICTABLE_LRU
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page)
else
return try_to_unmap_file(page, 1, 0);
}
#endif

查看文件

@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
shmem_swp_unmap(entry);
unlock:
spin_unlock(&info->lock);
swap_free(swap);
swapcache_free(swap, NULL);
redirty:
set_page_dirty(page);
if (wbc->for_reclaim)
@@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
int error;
struct file *file;

查看文件

@@ -818,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
*/
static int use_alien_caches __read_mostly = 1;
static int numa_platform __read_mostly = 1;
static int __init noaliencache_setup(char *s)
{
use_alien_caches = 0;
@@ -1377,10 +1376,8 @@ void __init kmem_cache_init(void)
int order;
int node;
if (num_possible_nodes() == 1) {
if (num_possible_nodes() == 1)
use_alien_caches = 0;
numa_platform = 0;
}
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
@@ -1627,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
page = alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page)
return NULL;
@@ -3193,7 +3190,7 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
obj = kmem_getpages(cache, local_flags, -1);
obj = kmem_getpages(cache, local_flags, numa_node_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
@@ -3530,7 +3527,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
* variable to skip the call, which is mostly likely to be present in
* the cache.
*/
if (numa_platform && cache_free_alien(cachep, objp))
if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
if (likely(ac->avail < ac->limit)) {

查看文件

@@ -46,7 +46,7 @@
* NUMA support in SLOB is fairly simplistic, pushing most of the real
* logic down to the page allocator, and simply doing the node accounting
* on the upper levels. In the event that a node id is explicitly
* provided, alloc_pages_node() with the specified node id is used
* provided, alloc_pages_exact_node() with the specified node id is used
* instead. The common case (or when the node id isn't explicitly provided)
* will default to the current node, as per numa_node_id().
*
@@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
#ifdef CONFIG_NUMA
if (node != -1)
page = alloc_pages_node(node, gfp, order);
page = alloc_pages_exact_node(node, gfp, order);
else
#endif
page = alloc_pages(gfp, order);

查看文件

@@ -3765,7 +3765,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
to_cpumask(l->cpus));
}
if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
len < PAGE_SIZE - 60) {
len += sprintf(buf + len, " nodes=");
len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,

查看文件

@@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page)
/**
* add_to_swap - allocate swap space for a page
* @page: page we want to move to swap
* @gfp_mask: memory allocation flags
*
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
@@ -162,11 +161,11 @@ int add_to_swap(struct page *page)
return 1;
case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
swap_free(entry);
swapcache_free(entry, NULL);
continue;
default:
/* -ENOMEM radix-tree allocation failure */
swap_free(entry);
swapcache_free(entry, NULL);
return 0;
}
}
@@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page)
__delete_from_swap_cache(page);
spin_unlock_irq(&swapper_space.tree_lock);
mem_cgroup_uncharge_swapcache(page, entry);
swap_free(entry);
swapcache_free(entry, page);
page_cache_release(page);
}
@@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Swap entry may have been freed since our caller observed it.
*/
if (!swap_duplicate(entry))
err = swapcache_prepare(entry);
if (err == -EEXIST) /* seems racy */
continue;
if (err) /* swp entry is obsolete ? */
break;
/*
@@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Initiate read into locked page and return.
*/
lru_cache_add_anon(new_page);
swap_readpage(NULL, new_page);
swap_readpage(new_page);
return new_page;
}
ClearPageSwapBacked(new_page);
__clear_page_locked(new_page);
swap_free(entry);
swapcache_free(entry, NULL);
} while (err != -ENOMEM);
if (new_page)

查看文件

@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
/*
 * For reference count accounting in swap_map[].
 *
 * Selector passed to the swap_map[] helpers in this file to say which kind
 * of reference an operation acts on.  Internal use only.
 */
enum {
SWAP_MAP = 0, /* operate on the usage count held by swap users */
SWAP_CACHE, /* operate on the reference held by the swap cache */
};
/* Return the usage-count part of a swap_map[] entry (SWAP_COUNT_MASK bits). */
static inline int swap_count(unsigned short ent)
{
	int count = ent & SWAP_COUNT_MASK;

	return count;
}
/* Tell whether the SWAP_HAS_CACHE bit is set in a swap_map[] entry. */
static inline bool swap_has_cache(unsigned short ent)
{
	return (ent & SWAP_HAS_CACHE) != 0;
}
/*
 * Build a swap_map[] entry from a usage @count, OR-ing in SWAP_HAS_CACHE
 * when @has_cache is true.
 */
static inline unsigned short encode_swapmap(int count, bool has_cache)
{
	unsigned short ent = count;

	return has_cache ? (SWAP_HAS_CACHE | ent) : ent;
}
/* returnes 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
int type = si - swap_info;
swp_entry_t entry = swp_entry(type, offset);
struct page *page;
int ret = 0;
page = find_get_page(&swapper_space, entry.val);
if (!page)
return 0;
/*
* This function is called from scan_swap_map() and it's called
* by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
* We have to use trylock for avoiding deadlock. This is a special
* case and you should use try_to_free_swap() with explicit lock_page()
* in usual operations.
*/
if (trylock_page(page)) {
ret = try_to_free_swap(page);
unlock_page(page);
}
page_cache_release(page);
return ret;
}
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
* hold swap_lock while calling the unplug_fn. And swap_lock
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
static inline unsigned long scan_swap_map(struct swap_info_struct *si)
static inline unsigned long scan_swap_map(struct swap_info_struct *si,
int cache)
{
unsigned long offset;
unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
goto no_page;
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
spin_lock(&swap_lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
goto checks;
goto scan; /* check next one */
}
if (si->swap_map[offset])
goto scan;
@@ -285,7 +352,10 @@ checks:
si->lowest_bit = si->max;
si->highest_bit = 0;
}
si->swap_map[offset] = 1;
if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
si->swap_map[offset] = encode_swapmap(0, true);
else /* at suspend */
si->swap_map[offset] = encode_swapmap(1, false);
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;
@@ -351,6 +421,10 @@ scan:
spin_lock(&swap_lock);
goto checks;
}
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
spin_lock(&swap_lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
spin_lock(&swap_lock);
goto checks;
}
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
spin_lock(&swap_lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
continue;
swap_list.next = next;
offset = scan_swap_map(si);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_CACHE);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
return (swp_entry_t) {0};
}
/* The only caller of this function is now the suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
si = swap_info + type;
if (si->flags & SWP_WRITEOK) {
nr_swap_pages--;
offset = scan_swap_map(si);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, SWAP_MAP);
if (offset) {
spin_unlock(&swap_lock);
return swp_entry(type, offset);
@@ -471,25 +552,38 @@ out:
return NULL;
}
static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
static int swap_entry_free(struct swap_info_struct *p,
swp_entry_t ent, int cache)
{
unsigned long offset = swp_offset(ent);
int count = p->swap_map[offset];
int count = swap_count(p->swap_map[offset]);
bool has_cache;
if (count < SWAP_MAP_MAX) {
count--;
p->swap_map[offset] = count;
if (!count) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
if (p->prio > swap_info[swap_list.next].prio)
swap_list.next = p - swap_info;
nr_swap_pages++;
p->inuse_pages--;
mem_cgroup_uncharge_swap(ent);
has_cache = swap_has_cache(p->swap_map[offset]);
if (cache == SWAP_MAP) { /* dropping usage count of swap */
if (count < SWAP_MAP_MAX) {
count--;
p->swap_map[offset] = encode_swapmap(count, has_cache);
}
} else { /* dropping swap cache flag */
VM_BUG_ON(!has_cache);
p->swap_map[offset] = encode_swapmap(count, false);
}
/* return code. */
count = p->swap_map[offset];
/* free if no reference */
if (!count) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
if (p->prio > swap_info[swap_list.next].prio)
swap_list.next = p - swap_info;
nr_swap_pages++;
p->inuse_pages--;
mem_cgroup_uncharge_swap(ent);
}
return count;
}
@@ -504,11 +598,28 @@ void swap_free(swp_entry_t entry)
p = swap_info_get(entry);
if (p) {
swap_entry_free(p, entry);
swap_entry_free(p, entry, SWAP_MAP);
spin_unlock(&swap_lock);
}
}
/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 *
 * When @page is non-NULL it is first passed to
 * mem_cgroup_uncharge_swapcache(); afterwards the swap cache reference on
 * @entry is dropped.
 */
void swapcache_free(swp_entry_t entry, struct page *page)
{
	struct swap_info_struct *si;

	if (page != NULL)
		mem_cgroup_uncharge_swapcache(page, entry);

	si = swap_info_get(entry);
	if (si == NULL)
		return;

	/* presumably swap_info_get() returned with swap_lock held - confirm */
	swap_entry_free(si, entry, SWAP_CACHE);
	spin_unlock(&swap_lock);
}
/*
* How many references to page are currently swapped out?
*/
@@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page)
entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
/* Subtract the 1 for the swap cache itself */
count = p->swap_map[swp_offset(entry)] - 1;
count = swap_count(p->swap_map[swp_offset(entry)]);
spin_unlock(&swap_lock);
}
return count;
@@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry)
p = swap_info_get(entry);
if (p) {
if (swap_entry_free(p, entry) == 1) {
if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
page = find_get_page(&swapper_space, entry.val);
if (page && !trylock_page(page)) {
page_cache_release(page);
@@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
i = 1;
}
count = si->swap_map[i];
if (count && count != SWAP_MAP_BAD)
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
return i;
@@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type)
*/
shmem = 0;
swcount = *swap_map;
if (swcount > 1) {
if (swap_count(swcount)) {
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
else
retval = unuse_mm(start_mm, entry, page);
}
if (*swap_map > 1) {
if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type)
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
while (*swap_map > 1 && !retval && !shmem &&
while (swap_count(*swap_map) && !retval && !shmem &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type)
cond_resched();
swcount = *swap_map;
if (swcount <= 1)
if (!swap_count(swcount)) /* any usage ? */
;
else if (mm == &init_mm) {
set_start_mm = 1;
shmem = shmem_unuse(entry, page);
} else
retval = unuse_mm(mm, entry, page);
if (set_start_mm && *swap_map < swcount) {
if (set_start_mm &&
swap_count(*swap_map) < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
new_start_mm = mm;
@@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type)
}
/*
* How could swap count reach 0x7fff when the maximum
* pid is 0x7fff, and there's no way to repeat a swap
* page within an mm (except in shmem, where it's the
* shared object which takes the reference count)?
* We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
*
* How could swap count reach 0x7ffe ?
* There's no way to repeat a swap page within an mm
* (except in shmem, where it's the shared object which takes
* the reference count)?
* We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
* short is too small....)
* If that's wrong, then we should worry more about
* exit_mmap() and do_munmap() cases described above:
* we might be resetting SWAP_MAP_MAX too early here.
* We know "Undead"s can happen, they're okay, so don't
* report them; but do report if we reset SWAP_MAP_MAX.
*/
if (*swap_map == SWAP_MAP_MAX) {
/* We might release the lock_page() in unuse_mm(). */
if (!PageSwapCache(page) || page_private(page) != entry.val)
goto retry;
if (swap_count(*swap_map) == SWAP_MAP_MAX) {
spin_lock(&swap_lock);
*swap_map = 1;
*swap_map = encode_swapmap(0, true);
spin_unlock(&swap_lock);
reset_overflow = 1;
}
@@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type)
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
*/
if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
if (swap_count(*swap_map) &&
PageDirty(page) && PageSwapCache(page)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
@@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type)
* mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
retry:
unlock_page(page);
page_cache_release(page);
@@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val)
*
* Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
* "permanent", but will be reclaimed by the next swapoff.
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
* - swp_entry is migration entry -> EINVAL
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
*/
int swap_duplicate(swp_entry_t entry)
static int __swap_duplicate(swp_entry_t entry, bool cache)
{
struct swap_info_struct * p;
unsigned long offset, type;
int result = 0;
int result = -EINVAL;
int count;
bool has_cache;
if (is_migration_entry(entry))
return 1;
return -EINVAL;
type = swp_type(entry);
if (type >= nr_swapfiles)
@@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry)
offset = swp_offset(entry);
spin_lock(&swap_lock);
if (offset < p->max && p->swap_map[offset]) {
if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
p->swap_map[offset]++;
result = 1;
} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
if (unlikely(offset >= p->max))
goto unlock_out;
count = swap_count(p->swap_map[offset]);
has_cache = swap_has_cache(p->swap_map[offset]);
if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
/* set SWAP_HAS_CACHE if there is no cache and entry is used */
if (!has_cache && count) {
p->swap_map[offset] = encode_swapmap(count, true);
result = 0;
} else if (has_cache) /* someone added cache */
result = -EEXIST;
else if (!count) /* no users */
result = -ENOENT;
} else if (count || has_cache) {
if (count < SWAP_MAP_MAX - 1) {
p->swap_map[offset] = encode_swapmap(count + 1,
has_cache);
result = 0;
} else if (count <= SWAP_MAP_MAX) {
if (swap_overflow++ < 5)
printk(KERN_WARNING "swap_dup: swap entry overflow\n");
p->swap_map[offset] = SWAP_MAP_MAX;
result = 1;
printk(KERN_WARNING
"swap_dup: swap entry overflow\n");
p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
has_cache);
result = 0;
}
}
} else
result = -ENOENT; /* unused swap entry */
unlock_out:
spin_unlock(&swap_lock);
out:
return result;
@@ -1978,6 +2127,27 @@ bad_file:
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
/*
 * swap_duplicate - increase the reference count of a swap entry by 1.
 * @entry: swap entry whose count should be raised.
 *
 * Thin wrapper that calls __swap_duplicate() with SWAP_MAP.  The status
 * returned by __swap_duplicate() is discarded here, so callers receive
 * no indication of failure.
 */
void swap_duplicate(swp_entry_t entry)
{
__swap_duplicate(entry, SWAP_MAP);
}
/*
 * swapcache_prepare - claim the swap-cache reference of a swap entry.
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes; the value is whatever __swap_duplicate()
 * returns when called with SWAP_CACHE.  Returns 0 on success.
 * -EEXIST means there is already a swap cache for the entry.
 * -ENOENT means the entry is not in use.
 * -EINVAL means the entry is invalid or is a migration entry.
 * Note: unlike swap_duplicate(), the return code is passed to the caller.
 */
int swapcache_prepare(swp_entry_t entry)
{
return __swap_duplicate(entry, SWAP_CACHE);
}
struct swap_info_struct *
get_swap_info_struct(unsigned type)
@@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
if (si->swap_map[toff] == SWAP_MAP_BAD)
if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
/* Count contiguous allocated slots below our target */
@@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
/* Don't read in free or bad pages */
if (!si->swap_map[toff])
break;
if (si->swap_map[toff] == SWAP_MAP_BAD)
if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
}
spin_unlock(&swap_lock);

查看文件

@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
}
EXPORT_SYMBOL(truncate_inode_pages);
unsigned long __invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end, bool be_atomic)
/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
* This function only removes the unlocked pages, if you want to
* remove all the pages of one inode, you must call truncate_inode_pages.
*
* invalidate_mapping_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
pgoff_t next = start;
@@ -309,30 +322,10 @@ unlock:
break;
}
pagevec_release(&pvec);
if (likely(!be_atomic))
cond_resched();
cond_resched();
}
return ret;
}
/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
* This function only removes the unlocked pages, if you want to
* remove all the pages of one inode, you must call truncate_inode_pages.
*
* invalidate_mapping_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
return __invalidate_mapping_pages(mapping, start, end, false);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
/*

查看文件

@@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*
* get_user_pages_fast provides equivalent functionality to get_user_pages,
* operating on current and current->mm, with force=0 and vma=NULL. However
* unlike get_user_pages, it must be called without mmap_sem held.
*
* get_user_pages_fast may take mmap_sem and page table locks, so no
* assumptions can be made about lack of locking. get_user_pages_fast is to be
* implemented in a way that is advantageous (vs get_user_pages()) when the
* user memory area is already faulted in and present in ptes. However if the
* pages have to be faulted in, it may turn out to be slightly slower so
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*/
int __attribute__((weak)) get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)

查看文件

@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_swapcache(page, swap);
swap_free(swap);
swapcache_free(swap, page);
} else {
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
*
* lru_lock must not be held, interrupts must be enabled.
*/
#ifdef CONFIG_UNEVICTABLE_LRU
void putback_lru_page(struct page *page)
{
int lru;
@@ -568,20 +566,6 @@ redo:
put_page(page); /* drop ref from isolate */
}
#else /* CONFIG_UNEVICTABLE_LRU */
void putback_lru_page(struct page *page)
{
int lru;
VM_BUG_ON(PageLRU(page));
lru = !!TestClearPageActive(page) + page_is_file_cache(page);
lru_cache_add_lru(page, lru);
put_page(page);
}
#endif /* CONFIG_UNEVICTABLE_LRU */
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct pagevec freed_pvec;
int pgactivate = 0;
unsigned long nr_reclaimed = 0;
unsigned long vm_flags;
cond_resched();
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
}
referenced = page_referenced(page, 1, sc->mem_cgroup);
referenced = page_referenced(page, 1,
sc->mem_cgroup, &vm_flags);
/* In active use or really unfreeable? Activate it. */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
referenced && page_mapping_inuse(page))
@@ -943,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
continue;
switch (__isolate_lru_page(cursor_page, mode, file)) {
case 0:
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
nr_taken++;
scan++;
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&cursor_page->lru, src);
default:
break; /* ! on LRU or wrong list */
}
}
}
@@ -1061,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_scanned = 0;
unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
int lumpy_reclaim = 0;
/*
* If we need a large contiguous chunk of memory, or have
* trouble getting a small set of contiguous pages, we
* will reclaim both active and inactive pages.
*
* We use the same threshold as pageout congestion_wait below.
*/
if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
lumpy_reclaim = 1;
else if (sc->order && priority < DEF_PRIORITY - 2)
lumpy_reclaim = 1;
pagevec_init(&pvec, 1);
@@ -1073,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_freed;
unsigned long nr_active;
unsigned int count[NR_LRU_LISTS] = { 0, };
int mode = ISOLATE_INACTIVE;
/*
* If we need a large contiguous chunk of memory, or have
* trouble getting a small set of contiguous pages, we
* will reclaim both active and inactive pages.
*
* We use the same threshold as pageout congestion_wait below.
*/
if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
mode = ISOLATE_BOTH;
else if (sc->order && priority < DEF_PRIORITY - 2)
mode = ISOLATE_BOTH;
int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
nr_taken = sc->isolate_pages(sc->swap_cluster_max,
&page_list, &nr_scan, sc->order, mode,
@@ -1122,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
* but that should be acceptable to the caller
*/
if (nr_freed < nr_taken && !current_is_kswapd() &&
sc->order > PAGE_ALLOC_COSTLY_ORDER) {
lumpy_reclaim) {
congestion_wait(WRITE, HZ/10);
/*
@@ -1217,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
* But we had to alter page->flags anyway.
*/
/*
 * move_active_pages_to_lru - drain a private page list onto a zone LRU list.
 * @zone: zone whose lru[] lists and counters are updated
 * @list: private list of pages to move; it is empty when the function returns
 * @lru: index of the destination list in zone->lru[]
 *
 * Every page taken from @list is flagged PageLRU and moved to
 * zone->lru[@lru].list.  PageActive is cleared when @lru is not an active
 * list.  Pages are gathered in a pagevec and passed to __pagevec_release()
 * in batches; zone->lru_lock is dropped around each release and retaken
 * afterwards.
 *
 * NOTE(review): the unlock/lock pairing implies the caller enters and leaves
 * with zone->lru_lock held via spin_lock_irq() -- confirm against callers.
 */
static void move_active_pages_to_lru(struct zone *zone,
struct list_head *list,
enum lru_list lru)
{
unsigned long pgmoved = 0;
struct pagevec pvec;
struct page *page;
pagevec_init(&pvec, 1);
while (!list_empty(list)) {
page = lru_to_page(list);
prefetchw_prev_lru_page(page, list, flags);
/* Pages on the private list must be off the LRU and still active. */
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
VM_BUG_ON(!PageActive(page));
if (!is_active_lru(lru))
ClearPageActive(page); /* we are de-activating */
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
pgmoved++;
/*
 * Release the batch when pagevec_add() returns 0 (presumably the
 * pagevec has no room left -- verify) or when @list is now empty.
 * The release is done without zone->lru_lock held.
 */
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
if (buffer_heads_over_limit)
pagevec_strip(&pvec);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
/* Account all moved pages at once; PGDEACTIVATE only when de-activating. */
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct scan_control *sc, int priority, int file)
{
unsigned long pgmoved;
int pgdeactivate = 0;
unsigned long pgscanned;
unsigned long vm_flags;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
struct pagevec pvec;
enum lru_list lru;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
lru_add_drain();
@@ -1245,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
reclaim_stat->recent_scanned[!!file] += pgmoved;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
if (file)
__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
else
__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
spin_unlock_irq(&zone->lru_lock);
pgmoved = 0;
pgmoved = 0; /* count referenced (mapping) mapped pages */
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
@@ -1264,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
/* page_referenced clears PageReferenced */
if (page_mapping_inuse(page) &&
page_referenced(page, 0, sc->mem_cgroup))
page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
pgmoved++;
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
list_add(&page->lru, &l_active);
continue;
}
}
list_add(&page->lru, &l_inactive);
}
/*
* Move the pages to the [file or anon] inactive list.
* Move pages back to the lru list.
*/
pagevec_init(&pvec, 1);
lru = LRU_BASE + file * LRU_FILE;
spin_lock_irq(&zone->lru_lock);
/*
* Count referenced pages from currently used mappings as
* rotated, even though they are moved to the inactive list.
* This helps balance scan pressure between file and anonymous
* pages in get_scan_ratio.
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
* helps balance scan pressure between file and anonymous pages in
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[!!file] += pgmoved;
pgmoved = 0;
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
VM_BUG_ON(!PageActive(page));
ClearPageActive(page);
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
move_active_pages_to_lru(zone, &l_inactive,
LRU_BASE + file * LRU_FILE);
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
spin_unlock_irq(&zone->lru_lock);
pgdeactivate += pgmoved;
pgmoved = 0;
if (buffer_heads_over_limit)
pagevec_strip(&pvec);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
pgdeactivate += pgmoved;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
__count_vm_events(PGDEACTIVATE, pgdeactivate);
spin_unlock_irq(&zone->lru_lock);
if (buffer_heads_over_limit)
pagevec_strip(&pvec);
pagevec_release(&pvec);
}
static int inactive_anon_is_low_global(struct zone *zone)
@@ -1350,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
return low;
}
/*
 * Zone-wide check used for global reclaim: report whether the zone holds
 * more active file pages than inactive ones.
 *
 * Returns 1 when NR_ACTIVE_FILE exceeds NR_INACTIVE_FILE, 0 otherwise.
 */
static int inactive_file_is_low_global(struct zone *zone)
{
	unsigned long nr_active = zone_page_state(zone, NR_ACTIVE_FILE);
	unsigned long nr_inactive = zone_page_state(zone, NR_INACTIVE_FILE);

	return nr_inactive < nr_active;
}
/**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * Under streaming IO, memory pressure keeps deactivating active file
 * pages until more than half of the file pages sit on the inactive list.
 * From then on active file page aging is switched off, which protects
 * the system's working set from eviction.
 *
 * The ratio differs from the one used for anonymous pages because the
 * page cache uses a use-once replacement algorithm.
 *
 * For global reclaim the answer comes from the zone counters; otherwise
 * it is delegated to the memory cgroup in @sc.
 */
static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_inactive_file_is_low(sc->mem_cgroup);

	return inactive_file_is_low_global(zone);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct zone *zone, struct scan_control *sc, int priority)
{
int file = is_file_lru(lru);
if (lru == LRU_ACTIVE_FILE) {
if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
shrink_active_list(nr_to_scan, zone, sc, priority, file);
return 0;
}
@@ -1384,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
percent[0] = 0;
percent[1] = 100;
return;
}
anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1400,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */
if (unlikely(file + free <= zone->pages_high)) {
if (unlikely(file + free <= high_wmark_pages(zone))) {
percent[0] = 100;
percent[1] = 0;
return;
@@ -1455,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
percent[1] = 100 - percent[0];
}
/*
 * Batch up small scan requests.
 *
 * @nr_to_scan is added to the running total kept in @nr_saved_scan.
 * While that total stays below @swap_cluster_max it is stored back and
 * 0 is returned.  Once it reaches @swap_cluster_max the whole total is
 * returned and @nr_saved_scan is reset to 0.
 */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long pending = *nr_saved_scan + nr_to_scan;

	if (pending < swap_cluster_max) {
		/* Not enough yet: keep accumulating. */
		*nr_saved_scan = pending;
		return 0;
	}

	*nr_saved_scan = 0;
	return pending;
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
@@ -1468,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone,
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long swap_cluster_max = sc->swap_cluster_max;
int noswap = 0;
get_scan_ratio(zone, sc, percent);
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
noswap = 1;
percent[0] = 0;
percent[1] = 100;
} else
get_scan_ratio(zone, sc, percent);
for_each_evictable_lru(l) {
int file = is_file_lru(l);
unsigned long scan;
scan = zone_nr_pages(zone, sc, l);
if (priority) {
if (priority || noswap) {
scan >>= priority;
scan = (scan * percent[file]) / 100;
}
if (scanning_global_lru(sc)) {
zone->lru[l].nr_scan += scan;
nr[l] = zone->lru[l].nr_scan;
if (nr[l] >= swap_cluster_max)
zone->lru[l].nr_scan = 0;
else
nr[l] = 0;
} else
if (scanning_global_lru(sc))
nr[l] = nr_scan_try_batch(scan,
&zone->lru[l].nr_saved_scan,
swap_cluster_max);
else
nr[l] = scan;
}
@@ -1521,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(zone, sc))
if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
@@ -1532,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone,
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* We reclaim from a zone even if that zone is over pages_high. Because:
* We reclaim from a zone even if that zone is over high_wmark_pages(zone).
* Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
* b) The zones may be over pages_high but they must go *over* pages_high to
* satisfy the `incremental min' zone defense algorithm.
* b) The target zone may be at high_wmark_pages(zone) but the lower zones
* must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
* zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
@@ -1742,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at pages_high.
* they are all at high_wmark_pages(zone).
*
* Returns the number of pages which were actually freed.
*
@@ -1755,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > pages_high, but once a zone is found to have
* free_pages <= pages_high, we scan that zone and the lower zones regardless
* of the number of free pages in the lower zones. This interoperates with
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
* lower zones regardless of the number of free pages in the lower zones. This
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
@@ -1780,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
};
/*
* temp_priority is used to remember the scanning priority at which
* this zone was successfully refilled to free_pages == pages_high.
* this zone was successfully refilled to
* free_pages == high_wmark_pages(zone).
*/
int temp_priority[MAX_NR_ZONES];
@@ -1825,8 +1883,8 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
if (!zone_watermark_ok(zone, order, zone->pages_high,
0, 0)) {
if (!zone_watermark_ok(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
}
@@ -1860,8 +1918,8 @@ loop_again:
priority != DEF_PRIORITY)
continue;
if (!zone_watermark_ok(zone, order, zone->pages_high,
end_zone, 0))
if (!zone_watermark_ok(zone, order,
high_wmark_pages(zone), end_zone, 0))
all_zones_ok = 0;
temp_priority[i] = priority;
sc.nr_scanned = 0;
@@ -1870,8 +1928,8 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
if (!zone_watermark_ok(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2037,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
pgdat = zone->zone_pgdat;
if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
@@ -2084,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
l == LRU_ACTIVE_FILE))
continue;
zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
unsigned long nr_to_scan;
zone->lru[l].nr_scan = 0;
zone->lru[l].nr_saved_scan = 0;
nr_to_scan = min(nr_pages, lru_pages);
nr_reclaimed += shrink_list(l, nr_to_scan, zone,
sc, prio);
@@ -2290,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1;
*/
int sysctl_min_slab_ratio = 5;
/*
 * Number of file LRU pages in @zone that are not accounted as mapped:
 * (NR_INACTIVE_FILE + NR_ACTIVE_FILE) - NR_FILE_MAPPED, clamped at 0.
 */
static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
	unsigned long mapped = zone_page_state(zone, NR_FILE_MAPPED);
	unsigned long on_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
			       zone_page_state(zone, NR_ACTIVE_FILE);

	/*
	 * The mapped count can exceed what the file LRU lists hold,
	 * since tmpfs pages accounted as ANON may also be FILE_MAPPED.
	 * Clamp instead of letting the subtraction wrap.
	 */
	if (on_lru <= mapped)
		return 0;

	return on_lru - mapped;
}
/*
 * Estimate how many page cache pages of @zone can be reclaimed under the
 * current zone_reclaim_mode.  The result is never negative.
 */
static long zone_pagecache_reclaimable(struct zone *zone)
{
	long reclaimable;
	long excluded = 0;

	/*
	 * With RECLAIM_SWAP every file page is a candidate.  Without it,
	 * pages such as swapcache get in the way, and
	 * zone_unmapped_file_pages() gives the better estimate.
	 */
	if (zone_reclaim_mode & RECLAIM_SWAP)
		reclaimable = zone_page_state(zone, NR_FILE_PAGES);
	else
		reclaimable = zone_unmapped_file_pages(zone);

	/* Dirty pages do not count when we are not allowed to write them out. */
	if (!(zone_reclaim_mode & RECLAIM_WRITE))
		excluded += zone_page_state(zone, NR_FILE_DIRTY);

	/* Guard against underflow when more is excluded than is available. */
	if (unlikely(excluded > reclaimable))
		return 0;

	return reclaimable - excluded;
}
/*
* Try to free up some pages from this zone through reclaim.
*/
@@ -2324,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
if (zone_page_state(zone, NR_FILE_PAGES) -
zone_page_state(zone, NR_FILE_MAPPED) >
zone->min_unmapped_pages) {
if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
@@ -2384,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* if less than a specified percentage of the zone is used by
* unmapped file backed pages.
*/
if (zone_page_state(zone, NR_FILE_PAGES) -
zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
&& zone_page_state(zone, NR_SLAB_RECLAIMABLE)
<= zone->min_slab_pages)
return 0;
if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
if (zone_is_all_unreclaimable(zone))
return 0;
return ZONE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
return 0;
return ZONE_RECLAIM_NOSCAN;
/*
* Only run zone reclaim on the local zone or on zones that do not
@@ -2407,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
node_id = zone_to_nid(zone);
if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return 0;
return ZONE_RECLAIM_NOSCAN;
if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
return 0;
return ZONE_RECLAIM_NOSCAN;
ret = __zone_reclaim(zone, gfp_mask, order);
zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
}
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* page_evictable - test whether a page is evictable
* @page: the page to test
@@ -2665,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node)
sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}
#endif

查看文件

@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = {
"nr_active_anon",
"nr_inactive_file",
"nr_active_file",
#ifdef CONFIG_UNEVICTABLE_LRU
"nr_unevictable",
"nr_mlock",
#endif
"nr_anon_pages",
"nr_mapped",
"nr_file_pages",
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = {
TEXTS_FOR_ZONES("pgscan_kswapd")
TEXTS_FOR_ZONES("pgscan_direct")
#ifdef CONFIG_NUMA
"zone_reclaim_failed",
#endif
"pginodesteal",
"slabs_scanned",
"kswapd_steal",
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = {
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
"unevictable_pgs_culled",
"unevictable_pgs_scanned",
"unevictable_pgs_rescued",
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = {
"unevictable_pgs_stranded",
"unevictable_pgs_mlockfreed",
#endif
#endif
};
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
"\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
zone_page_state(zone, NR_FREE_PAGES),
zone->pages_min,
zone->pages_low,
zone->pages_high,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
zone->pages_scanned,
zone->lru[LRU_ACTIVE_ANON].nr_scan,
zone->lru[LRU_INACTIVE_ANON].nr_scan,
zone->lru[LRU_ACTIVE_FILE].nr_scan,
zone->lru[LRU_INACTIVE_FILE].nr_scan,
zone->spanned_pages,
zone->present_pages);