Merge commit 'v2.6.30-rc6' into perfcounters/core

Merge reason: this branch was on an -rc4 base, merge it up to -rc6
              to get the latest upstream fixes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Ingo Molnar
2009-05-18 07:37:44 +02:00
786 changed files with 17787 additions and 9563 deletions

View File

@@ -225,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT
config MMU_NOTIFIER
bool
config NOMMU_INITIAL_TRIM_EXCESS
int "Turn on mmap() excess space trimming before booting"
depends on !MMU
default 1
help
The NOMMU mmap() frequently needs to allocate large contiguous chunks
of memory on which to store mappings, but it can only ask the system
allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
more than it requires. To deal with this, mmap() is able to trim off
the excess and return it to the allocator.
If trimming is enabled, the excess is trimmed off and returned to the
system allocator, which can cause extra fragmentation, particularly
if there are a lot of transient processes.
If trimming is disabled, the excess is kept, but not used, which for
long-term mappings means that the space is wasted.
Trimming can be dynamically controlled through a sysctl option
(/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
excess pages there must be before trimming should occur, or zero if
no trimming is to occur.
This option specifies the initial value of this option. The default
of 1 says that all excess pages should be trimmed.
See Documentation/nommu-mmap.txt for more information.

View File

@@ -1024,9 +1024,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
return NULL;
pc = lookup_page_cgroup(page);
/*
* Used bit of swapcache is solid under page lock.
*/
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
if (mem && !css_tryget(&mem->css))
@@ -1040,6 +1038,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
mem = NULL;
rcu_read_unlock();
}
unlock_page_cgroup(pc);
return mem;
}
@@ -1618,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
}
/*
* A call to try to shrink memory usage under specified resource controller.
* This is typically used for page reclaiming for shmem for reducing side
* effect of page allocation from shmem, which is used by some mem_cgroup.
* A call to try to shrink memory usage on charge failure at shmem's swapin.
* Calling hierarchical_reclaim is not enough because we should update
* last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
* Moreover considering hierarchy, we should reclaim from the mem_over_limit,
* not from the memcg which this page would be charged to.
* try_charge_swapin does all of these works properly.
*/
int mem_cgroup_shrink_usage(struct page *page,
int mem_cgroup_shmem_charge_fallback(struct page *page,
struct mm_struct *mm,
gfp_t gfp_mask)
{
struct mem_cgroup *mem = NULL;
int progress = 0;
int retry = MEM_CGROUP_RECLAIM_RETRIES;
int ret;
if (mem_cgroup_disabled())
return 0;
if (page)
mem = try_get_mem_cgroup_from_swapcache(page);
if (!mem && mm)
mem = try_get_mem_cgroup_from_mm(mm);
if (unlikely(!mem))
return 0;
do {
progress = mem_cgroup_hierarchical_reclaim(mem,
gfp_mask, true, false);
progress += mem_cgroup_check_under_limit(mem);
} while (!progress && --retry);
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
css_put(&mem->css);
if (!retry)
return -ENOMEM;
return 0;
return ret;
}
static DEFINE_MUTEX(set_limit_mutex);

View File

@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0; /* retry the fault */
unlock_page(old_page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(old_page));
/*
* Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
page_cache_release(old_page);
if (!pte_same(*page_table, orig_pte))
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_mkwrite = 1;
}
@@ -2094,9 +2105,6 @@ gotten:
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
*
* do_no_page is protected similarly.
*/
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
}
put_page(dirty_page);
if (page_mkwrite) {
struct address_space *mapping = dirty_page->mapping;
set_page_dirty(dirty_page);
unlock_page(dirty_page);
page_cache_release(dirty_page);
if (mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
if (old_page)
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
page_cache_release(old_page);
}
page_cache_release(old_page);
}
return VM_FAULT_OOM;
unwritable_page:
@@ -2458,8 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
unlock_page(page);
goto out;
goto out_page;
}
/*
@@ -2521,6 +2553,7 @@ out:
out_nomap:
mem_cgroup_cancel_charge_swapin(ptr);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
page_cache_release(page);
return ret;
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int tmp;
unlock_page(page);
vmf.flags |= FAULT_FLAG_MKWRITE;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
anon = 1; /* no anon but release vmf.page */
goto out_unlocked;
}
lock_page(page);
/*
* XXX: this is not quite right (racy vs
* invalidate) to unlock and relock the page
* like this, however a better fix requires
* reworking page_mkwrite locking API, which
* is better done later.
*/
if (!page->mapping) {
ret = 0;
anon = 1; /* no anon but release vmf.page */
goto out;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(page);
if (!page->mapping) {
ret = 0; /* retry the fault */
unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
page_mkwrite = 1;
}
}
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(page_table, ptl);
out:
unlock_page(vmf.page);
out_unlocked:
if (anon)
page_cache_release(vmf.page);
else if (dirty_page) {
if (dirty_page) {
struct address_space *mapping = page->mapping;
if (set_page_dirty(dirty_page))
page_mkwrite = 1;
unlock_page(dirty_page);
put_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
set_page_dirty_balance(dirty_page, page_mkwrite);
put_page(dirty_page);
} else {
unlock_page(vmf.page);
if (anon)
page_cache_release(vmf.page);
}
return ret;
unwritable_page:
page_cache_release(page);
return ret;
}
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,

View File

@@ -86,7 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
struct percpu_counter vm_committed_as;
/*
* Check that a process has enough memory to allocate a new virtual
@@ -180,11 +180,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (mm)
allowed -= mm->total_vm / 32;
/*
* cast `allowed' as a signed long because vm_committed_space
* sometimes has a negative value
*/
if (atomic_long_read(&vm_committed_space) < (long)allowed)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
@@ -2491,4 +2487,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
*/
void __init mmap_init(void)
{
int ret;
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
}

View File

@@ -62,11 +62,11 @@ void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
unsigned long num_physpages;
atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
@@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
*/
void __init mmap_init(void)
{
int ret;
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
}
@@ -511,8 +515,6 @@ static void add_nommu_region(struct vm_region *region)
validate_nommu_regions();
BUG_ON(region->vm_start & ~PAGE_MASK);
parent = NULL;
p = &nommu_region_tree.rb_node;
while (*p) {
@@ -1847,12 +1849,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (mm)
allowed -= mm->total_vm / 32;
/*
* cast `allowed' as a signed long because vm_committed_space
* sometimes has a negative value
*/
if (atomic_long_read(&vm_committed_space) < (long)allowed)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);

View File

@@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
*/
static void __out_of_memory(gfp_t gfp_mask, int order)
{
if (sysctl_oom_kill_allocating_task) {
oom_kill_process(current, gfp_mask, order, 0, NULL,
"Out of memory (oom_kill_allocating_task)");
struct task_struct *p;
unsigned long points;
} else {
unsigned long points;
struct task_struct *p;
retry:
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
p = select_bad_process(&points, NULL);
if (PTR_ERR(p) == -1UL)
if (sysctl_oom_kill_allocating_task)
if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
"Out of memory (oom_kill_allocating_task)"))
return;
retry:
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
p = select_bad_process(&points, NULL);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
if (PTR_ERR(p) == -1UL)
return;
if (oom_kill_process(p, gfp_mask, order, points, NULL,
"Out of memory"))
goto retry;
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
if (oom_kill_process(p, gfp_mask, order, points, NULL,
"Out of memory"))
goto retry;
}
/*

View File

@@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
/*
@@ -2706,9 +2707,26 @@ static int zone_batchsize(struct zone *zone)
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
batch = (1 << (fls(batch + batch/2)-1)) - 1;
batch = rounddown_pow_of_two(batch + batch/2) - 1;
return batch;
#else
/* The deferral and batching of frees should be suppressed under NOMMU
* conditions.
*
* The problem is that NOMMU needs to be able to allocate large chunks
* of contiguous memory as there's no hardware page translation to
* assemble apparent contiguous memory from discontiguous pages.
*
* Queueing large contiguous runs of pages for batching, however,
* causes the pages to actually be freed in smaller chunks. As there
* can be a significant delay between the individual batches being
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
return 0;
#endif
}
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)

View File

@@ -57,14 +57,6 @@ static DEFINE_SPINLOCK(pdflush_lock);
*/
int nr_pdflush_threads = 0;
/*
* The max/min number of pdflush threads. R/W by sysctl at
* /proc/sys/vm/nr_pdflush_threads_max/min
*/
int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
/*
* The time at which the pdflush thread pool last went empty
*/
@@ -76,7 +68,7 @@ static unsigned long last_empty_jifs;
* Thread pool management algorithm:
*
* - The minimum and maximum number of pdflush instances are bound
* by nr_pdflush_threads_min and nr_pdflush_threads_max.
* by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
*
* - If there have been no idle pdflush instances for 1 second, create
* a new one.
@@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work)
* To throttle creation, we reset last_empty_jifs.
*/
if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
if (list_empty(&pdflush_list) &&
nr_pdflush_threads < nr_pdflush_threads_max) {
last_empty_jifs = jiffies;
nr_pdflush_threads++;
spin_unlock_irq(&pdflush_lock);
start_one_pdflush_thread();
spin_lock_irq(&pdflush_lock);
if (list_empty(&pdflush_list)) {
if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
last_empty_jifs = jiffies;
nr_pdflush_threads++;
spin_unlock_irq(&pdflush_lock);
start_one_pdflush_thread();
spin_lock_irq(&pdflush_lock);
}
}
}
@@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work)
*/
if (list_empty(&pdflush_list))
continue;
if (nr_pdflush_threads <= nr_pdflush_threads_min)
if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -266,9 +259,9 @@ static int __init pdflush_init(void)
* Pre-set nr_pdflush_threads... If we fail to create,
* the count will be decremented.
*/
nr_pdflush_threads = nr_pdflush_threads_min;
nr_pdflush_threads = MIN_PDFLUSH_THREADS;
for (i = 0; i < nr_pdflush_threads_min; i++)
for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
start_one_pdflush_thread();
return 0;
}

View File

@@ -1340,8 +1340,12 @@ repeat:
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
if (error == -ENOMEM) {
/* allow reclaim from this memory cgroup */
error = mem_cgroup_shrink_usage(swappage,
/*
* reclaim from proper memory cgroup and
* call memcg's OOM if needed.
*/
error = mem_cgroup_shmem_charge_fallback(
swappage,
current->mm,
gfp);
if (error) {

View File

@@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
EXPORT_SYMBOL(pagevec_lookup_tag);
#ifdef CONFIG_SMP
/*
* We tolerate a little inaccuracy to avoid ping-ponging the counter between
* CPUs
*/
#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
static DEFINE_PER_CPU(long, committed_space);
void vm_acct_memory(long pages)
{
long *local;
preempt_disable();
local = &__get_cpu_var(committed_space);
*local += pages;
if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
atomic_long_add(*local, &vm_committed_space);
*local = 0;
}
preempt_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
long *committed;
committed = &per_cpu(committed_space, (long)hcpu);
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
atomic_long_add(*committed, &vm_committed_space);
*committed = 0;
drain_cpu_pagevecs((long)hcpu);
}
return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */
/*
* Perform any setup for the swap system
*/
@@ -554,7 +511,4 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
#ifdef CONFIG_HOTPLUG_CPU
hotcpu_notifier(cpu_swap_callback, 0);
#endif
}

View File

@@ -402,6 +402,7 @@ overflow:
printk(KERN_WARNING
"vmap allocation for size %lu failed: "
"use vmalloc=<size> to increase size.\n", size);
kfree(va);
return ERR_PTR(-EBUSY);
}

View File

@@ -1471,7 +1471,7 @@ static void shrink_zone(int priority, struct zone *zone,
for_each_evictable_lru(l) {
int file = is_file_lru(l);
int scan;
unsigned long scan;
scan = zone_nr_pages(zone, sc, l);
if (priority) {