Merge tag 'v4.5-rc1' into x86/asm, to refresh the branch before merging new changes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
@@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
bdi->wb.memcg_css = mem_cgroup_root_css;
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
}
return ret;
@@ -61,6 +61,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
bool dequeued_page;

dequeued_page = false;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
/*
* Block others from accessing the 'page' while we get around
@@ -75,15 +76,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
continue;
}
#endif
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_delete(page);
__count_vm_event(BALLOON_DEFLATE);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
dequeued_page = true;
break;
}
}
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);

if (!dequeued_page) {
/*
@@ -1658,14 +1658,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
!compaction_deferred(zone, cc->order))
compact_zone(zone, cc);

if (cc->order > 0) {
if (zone_watermark_ok(zone, cc->order,
low_wmark_pages(zone), 0, 0))
compaction_defer_reset(zone, cc->order, false);
}

VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));

if (is_via_compact_memory(cc->order))
continue;

if (zone_watermark_ok(zone, cc->order,
low_wmark_pages(zone), 0, 0))
compaction_defer_reset(zone, cc->order, false);
}
}

@@ -1708,7 +1709,10 @@ static void compact_nodes(void)
/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/* This is the entry point for compacting all nodes via /proc/sys/vm */
/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
*/
int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
mm/debug.c | 12
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{1UL << PG_compound_lock, "compound_lock" },
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags,
void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
if (reason)
@@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm)
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm)
mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
mm/filemap.c | 129
@@ -11,6 +11,7 @@
|
||||
*/
|
||||
#include <linux/export.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/capability.h>
|
||||
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
|
||||
__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
|
||||
|
||||
if (shadow) {
|
||||
mapping->nrshadows++;
|
||||
mapping->nrexceptional++;
|
||||
/*
|
||||
* Make sure the nrshadows update is committed before
|
||||
* Make sure the nrexceptional update is committed before
|
||||
* the nrpages update so that final truncate racing
|
||||
* with reclaim does not see both counters 0 at the
|
||||
* same time and miss a shadow entry.
|
||||
@@ -204,7 +205,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
|
||||
__dec_zone_page_state(page, NR_FILE_PAGES);
|
||||
if (PageSwapBacked(page))
|
||||
__dec_zone_page_state(page, NR_SHMEM);
|
||||
BUG_ON(page_mapped(page));
|
||||
VM_BUG_ON_PAGE(page_mapped(page), page);
|
||||
|
||||
/*
|
||||
* At this point page must be either written or cleaned by truncate.
|
||||
@@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (dax_mapping(mapping) && mapping->nrexceptional) {
|
||||
err = dax_writeback_mapping_range(mapping, lstart, lend);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
if (mapping->nrpages) {
|
||||
err = __filemap_fdatawrite_range(mapping, lstart, lend,
|
||||
WB_SYNC_ALL);
|
||||
@@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
|
||||
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
|
||||
if (!radix_tree_exceptional_entry(p))
|
||||
return -EEXIST;
|
||||
|
||||
if (WARN_ON(dax_mapping(mapping)))
|
||||
return -EINVAL;
|
||||
|
||||
if (shadowp)
|
||||
*shadowp = p;
|
||||
mapping->nrshadows--;
|
||||
mapping->nrexceptional--;
|
||||
if (node)
|
||||
workingset_node_shadows_dec(node);
|
||||
}
|
||||
@@ -618,7 +629,7 @@ static int __add_to_page_cache_locked(struct page *page,
|
||||
|
||||
if (!huge) {
|
||||
error = mem_cgroup_try_charge(page, current->mm,
|
||||
gfp_mask, &memcg);
|
||||
gfp_mask, &memcg, false);
|
||||
if (error)
|
||||
return error;
|
||||
}
|
||||
@@ -626,7 +637,7 @@ static int __add_to_page_cache_locked(struct page *page,
|
||||
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
|
||||
if (error) {
|
||||
if (!huge)
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
return error;
|
||||
}
|
||||
|
||||
@@ -645,7 +656,7 @@ static int __add_to_page_cache_locked(struct page *page,
|
||||
__inc_zone_page_state(page, NR_FILE_PAGES);
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
if (!huge)
|
||||
mem_cgroup_commit_charge(page, memcg, false);
|
||||
mem_cgroup_commit_charge(page, memcg, false, false);
|
||||
trace_mm_filemap_add_to_page_cache(page);
|
||||
return 0;
|
||||
err_insert:
|
||||
@@ -653,7 +664,7 @@ err_insert:
|
||||
/* Leave page->index set: truncation relies upon it */
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
if (!huge)
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
page_cache_release(page);
|
||||
return error;
|
||||
}
|
||||
@@ -682,11 +693,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
|
||||
void *shadow = NULL;
|
||||
int ret;
|
||||
|
||||
__set_page_locked(page);
|
||||
__SetPageLocked(page);
|
||||
ret = __add_to_page_cache_locked(page, mapping, offset,
|
||||
gfp_mask, &shadow);
|
||||
if (unlikely(ret))
|
||||
__clear_page_locked(page);
|
||||
__ClearPageLocked(page);
|
||||
else {
|
||||
/*
|
||||
* The page might have been evicted from cache only
|
||||
@@ -809,6 +820,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
|
||||
*/
|
||||
void unlock_page(struct page *page)
|
||||
{
|
||||
page = compound_head(page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
clear_bit_unlock(PG_locked, &page->flags);
|
||||
smp_mb__after_atomic();
|
||||
@@ -873,18 +885,20 @@ EXPORT_SYMBOL_GPL(page_endio);
|
||||
*/
|
||||
void __lock_page(struct page *page)
|
||||
{
|
||||
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
|
||||
struct page *page_head = compound_head(page);
|
||||
DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
|
||||
|
||||
__wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
|
||||
__wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
EXPORT_SYMBOL(__lock_page);
|
||||
|
||||
int __lock_page_killable(struct page *page)
|
||||
{
|
||||
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
|
||||
struct page *page_head = compound_head(page);
|
||||
DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
|
||||
|
||||
return __wait_on_bit_lock(page_waitqueue(page), &wait,
|
||||
return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
|
||||
bit_wait_io, TASK_KILLABLE);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__lock_page_killable);
|
||||
@@ -1242,9 +1256,9 @@ repeat:
|
||||
if (radix_tree_deref_retry(page))
|
||||
goto restart;
|
||||
/*
|
||||
* A shadow entry of a recently evicted page,
|
||||
* or a swap entry from shmem/tmpfs. Return
|
||||
* it without attempting to raise page count.
|
||||
* A shadow entry of a recently evicted page, a swap
|
||||
* entry from shmem/tmpfs or a DAX entry. Return it
|
||||
* without attempting to raise page count.
|
||||
*/
|
||||
goto export;
|
||||
}
|
||||
@@ -1491,6 +1505,74 @@ repeat:
|
||||
}
|
||||
EXPORT_SYMBOL(find_get_pages_tag);
|
||||
|
||||
/**
|
||||
* find_get_entries_tag - find and return entries that match @tag
|
||||
* @mapping: the address_space to search
|
||||
* @start: the starting page cache index
|
||||
* @tag: the tag index
|
||||
* @nr_entries: the maximum number of entries
|
||||
* @entries: where the resulting entries are placed
|
||||
* @indices: the cache indices corresponding to the entries in @entries
|
||||
*
|
||||
* Like find_get_entries, except we only return entries which are tagged with
|
||||
* @tag.
|
||||
*/
|
||||
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
|
||||
int tag, unsigned int nr_entries,
|
||||
struct page **entries, pgoff_t *indices)
|
||||
{
|
||||
void **slot;
|
||||
unsigned int ret = 0;
|
||||
struct radix_tree_iter iter;
|
||||
|
||||
if (!nr_entries)
|
||||
return 0;
|
||||
|
||||
rcu_read_lock();
|
||||
restart:
|
||||
radix_tree_for_each_tagged(slot, &mapping->page_tree,
|
||||
&iter, start, tag) {
|
||||
struct page *page;
|
||||
repeat:
|
||||
page = radix_tree_deref_slot(slot);
|
||||
if (unlikely(!page))
|
||||
continue;
|
||||
if (radix_tree_exception(page)) {
|
||||
if (radix_tree_deref_retry(page)) {
|
||||
/*
|
||||
* Transient condition which can only trigger
|
||||
* when entry at index 0 moves out of or back
|
||||
* to root: none yet gotten, safe to restart.
|
||||
*/
|
||||
goto restart;
|
||||
}
|
||||
|
||||
/*
|
||||
* A shadow entry of a recently evicted page, a swap
|
||||
* entry from shmem/tmpfs or a DAX entry. Return it
|
||||
* without attempting to raise page count.
|
||||
*/
|
||||
goto export;
|
||||
}
|
||||
if (!page_cache_get_speculative(page))
|
||||
goto repeat;
|
||||
|
||||
/* Has the page moved? */
|
||||
if (unlikely(page != *slot)) {
|
||||
page_cache_release(page);
|
||||
goto repeat;
|
||||
}
|
||||
export:
|
||||
indices[ret] = iter.index;
|
||||
entries[ret] = page;
|
||||
if (++ret == nr_entries)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(find_get_entries_tag);
|
||||
|
||||
/*
|
||||
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
|
||||
* a _large_ part of the i/o request. Imagine the worst scenario:
|
||||
@@ -1812,19 +1894,18 @@ EXPORT_SYMBOL(generic_file_read_iter);
|
||||
* This adds the requested page to the page cache if it isn't already there,
|
||||
* and schedules an I/O to read in its contents from disk.
|
||||
*/
|
||||
static int page_cache_read(struct file *file, pgoff_t offset)
|
||||
static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
|
||||
{
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
do {
|
||||
page = page_cache_alloc_cold(mapping);
|
||||
page = __page_cache_alloc(gfp_mask|__GFP_COLD);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = add_to_page_cache_lru(page, mapping, offset,
|
||||
mapping_gfp_constraint(mapping, GFP_KERNEL));
|
||||
ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
|
||||
if (ret == 0)
|
||||
ret = mapping->a_ops->readpage(file, page);
|
||||
else if (ret == -EEXIST)
|
||||
@@ -2005,7 +2086,7 @@ no_cached_page:
|
||||
* We're only likely to ever get here if MADV_RANDOM is in
|
||||
* effect.
|
||||
*/
|
||||
error = page_cache_read(file, offset);
|
||||
error = page_cache_read(file, offset, vmf->gfp_mask);
|
||||
|
||||
/*
|
||||
* The page we want has now been added to the page cache.
|
||||
@@ -2682,11 +2763,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
ssize_t ret;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
ret = generic_write_checks(iocb, from);
|
||||
if (ret > 0)
|
||||
ret = __generic_file_write_iter(iocb, from);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
|
||||
if (ret > 0) {
|
||||
ssize_t err;
|
||||
|
mm/gup.c | 174
@@ -4,6 +4,7 @@
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/swap.h>
|
||||
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmd, unsigned int flags)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct dev_pagemap *pgmap = NULL;
|
||||
struct page *page;
|
||||
spinlock_t *ptl;
|
||||
pte_t *ptep, pte;
|
||||
@@ -98,7 +100,17 @@ retry:
|
||||
}
|
||||
|
||||
page = vm_normal_page(vma, address, pte);
|
||||
if (unlikely(!page)) {
|
||||
if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
|
||||
/*
|
||||
* Only return device mapping pages in the FOLL_GET case since
|
||||
* they are only valid while holding the pgmap reference.
|
||||
*/
|
||||
pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
|
||||
if (pgmap)
|
||||
page = pte_page(pte);
|
||||
else
|
||||
goto no_page;
|
||||
} else if (unlikely(!page)) {
|
||||
if (flags & FOLL_DUMP) {
|
||||
/* Avoid special (like zero) pages in core dumps */
|
||||
page = ERR_PTR(-EFAULT);
|
||||
@@ -116,8 +128,28 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & FOLL_GET)
|
||||
get_page_foll(page);
|
||||
if (flags & FOLL_SPLIT && PageTransCompound(page)) {
|
||||
int ret;
|
||||
get_page(page);
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
lock_page(page);
|
||||
ret = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (flags & FOLL_GET) {
|
||||
get_page(page);
|
||||
|
||||
/* drop the pgmap reference now that we hold the page */
|
||||
if (pgmap) {
|
||||
put_dev_pagemap(pgmap);
|
||||
pgmap = NULL;
|
||||
}
|
||||
}
|
||||
if (flags & FOLL_TOUCH) {
|
||||
if ((flags & FOLL_WRITE) &&
|
||||
!pte_dirty(pte) && !PageDirty(page))
|
||||
@@ -130,6 +162,10 @@ retry:
|
||||
mark_page_accessed(page);
|
||||
}
|
||||
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
|
||||
/* Do not mlock pte-mapped THP */
|
||||
if (PageTransCompound(page))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* The preliminary mapping check is mainly to avoid the
|
||||
* pointless overhead of lock_page on the ZERO_PAGE
|
||||
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
|
||||
}
|
||||
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
|
||||
return no_page_table(vma, flags);
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
if (flags & FOLL_SPLIT) {
|
||||
split_huge_page_pmd(vma, address, pmd);
|
||||
return follow_page_pte(vma, address, pmd, flags);
|
||||
}
|
||||
if (pmd_devmap(*pmd)) {
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (likely(pmd_trans_huge(*pmd))) {
|
||||
if (unlikely(pmd_trans_splitting(*pmd))) {
|
||||
spin_unlock(ptl);
|
||||
wait_split_huge_page(vma->anon_vma, pmd);
|
||||
} else {
|
||||
page = follow_trans_huge_pmd(vma, address,
|
||||
pmd, flags);
|
||||
spin_unlock(ptl);
|
||||
*page_mask = HPAGE_PMD_NR - 1;
|
||||
return page;
|
||||
}
|
||||
} else
|
||||
spin_unlock(ptl);
|
||||
page = follow_devmap_pmd(vma, address, pmd, flags);
|
||||
spin_unlock(ptl);
|
||||
if (page)
|
||||
return page;
|
||||
}
|
||||
return follow_page_pte(vma, address, pmd, flags);
|
||||
if (likely(!pmd_trans_huge(*pmd)))
|
||||
return follow_page_pte(vma, address, pmd, flags);
|
||||
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (unlikely(!pmd_trans_huge(*pmd))) {
|
||||
spin_unlock(ptl);
|
||||
return follow_page_pte(vma, address, pmd, flags);
|
||||
}
|
||||
if (flags & FOLL_SPLIT) {
|
||||
int ret;
|
||||
page = pmd_page(*pmd);
|
||||
if (is_huge_zero_page(page)) {
|
||||
spin_unlock(ptl);
|
||||
ret = 0;
|
||||
split_huge_pmd(vma, pmd, address);
|
||||
} else {
|
||||
get_page(page);
|
||||
spin_unlock(ptl);
|
||||
lock_page(page);
|
||||
ret = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
return ret ? ERR_PTR(ret) :
|
||||
follow_page_pte(vma, address, pmd, flags);
|
||||
}
|
||||
|
||||
page = follow_trans_huge_pmd(vma, address, pmd, flags);
|
||||
spin_unlock(ptl);
|
||||
*page_mask = HPAGE_PMD_NR - 1;
|
||||
return page;
|
||||
}
|
||||
|
||||
static int get_gate_page(struct mm_struct *mm, unsigned long address,
|
||||
@@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages);
|
||||
* @mm: mm_struct of target mm
|
||||
* @address: user address
|
||||
* @fault_flags:flags to pass down to handle_mm_fault()
|
||||
* @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
|
||||
* does not allow retry
|
||||
*
|
||||
* This is meant to be called in the specific scenario where for locking reasons
|
||||
* we try to access user memory in atomic context (within a pagefault_disable()
|
||||
@@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages);
|
||||
* The main difference with get_user_pages() is that this function will
|
||||
* unconditionally call handle_mm_fault() which will in turn perform all the
|
||||
* necessary SW fixup of the dirty and young bits in the PTE, while
|
||||
* handle_mm_fault() only guarantees to update these in the struct page.
|
||||
* get_user_pages() only guarantees to update these in the struct page.
|
||||
*
|
||||
* This is important for some architectures where those bits also gate the
|
||||
* access permission to the page because they are maintained in software. On
|
||||
* such architectures, gup() will not be enough to make a subsequent access
|
||||
* succeed.
|
||||
*
|
||||
* This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
|
||||
* This function will not return with an unlocked mmap_sem. So it has not the
|
||||
* same semantics wrt the @mm->mmap_sem as does filemap_fault().
|
||||
*/
|
||||
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long address, unsigned int fault_flags)
|
||||
unsigned long address, unsigned int fault_flags,
|
||||
bool *unlocked)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
vm_flags_t vm_flags;
|
||||
int ret;
|
||||
int ret, major = 0;
|
||||
|
||||
if (unlocked)
|
||||
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
|
||||
|
||||
retry:
|
||||
vma = find_extend_vma(mm, address);
|
||||
if (!vma || address < vma->vm_start)
|
||||
return -EFAULT;
|
||||
@@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
|
||||
return -EFAULT;
|
||||
|
||||
ret = handle_mm_fault(mm, vma, address, fault_flags);
|
||||
major |= ret & VM_FAULT_MAJOR;
|
||||
if (ret & VM_FAULT_ERROR) {
|
||||
if (ret & VM_FAULT_OOM)
|
||||
return -ENOMEM;
|
||||
@@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
|
||||
return -EFAULT;
|
||||
BUG();
|
||||
}
|
||||
|
||||
if (ret & VM_FAULT_RETRY) {
|
||||
down_read(&mm->mmap_sem);
|
||||
if (!(fault_flags & FAULT_FLAG_TRIED)) {
|
||||
*unlocked = true;
|
||||
fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
|
||||
fault_flags |= FAULT_FLAG_TRIED;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
if (tsk) {
|
||||
if (ret & VM_FAULT_MAJOR)
|
||||
if (major)
|
||||
tsk->maj_flt++;
|
||||
else
|
||||
tsk->min_flt++;
|
||||
@@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
|
||||
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
|
||||
if (vma->vm_flags & VM_LOCKONFAULT)
|
||||
gup_flags &= ~FOLL_POPULATE;
|
||||
|
||||
/*
|
||||
* We want to touch writable mappings with a write fault in order
|
||||
* to break COW, except for shared mappings because these don't COW
|
||||
@@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr)
|
||||
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
|
||||
* pages containing page tables.
|
||||
*
|
||||
* *) THP splits will broadcast an IPI, this can be achieved by overriding
|
||||
* pmdp_splitting_flush.
|
||||
*
|
||||
* *) ptes can be read atomically by the architecture.
|
||||
*
|
||||
* *) access_ok is sufficient to validate userspace address ranges.
|
||||
@@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
* for an example see gup_get_pte in arch/x86/mm/gup.c
|
||||
*/
|
||||
pte_t pte = READ_ONCE(*ptep);
|
||||
struct page *page;
|
||||
struct page *head, *page;
|
||||
|
||||
/*
|
||||
* Similar to the PMD case below, NUMA hinting must take slow
|
||||
@@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
|
||||
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
||||
page = pte_page(pte);
|
||||
head = compound_head(page);
|
||||
|
||||
if (!page_cache_get_speculative(page))
|
||||
if (!page_cache_get_speculative(head))
|
||||
goto pte_unmap;
|
||||
|
||||
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
||||
put_page(page);
|
||||
put_page(head);
|
||||
goto pte_unmap;
|
||||
}
|
||||
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
|
||||
@@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||
unsigned long end, int write, struct page **pages, int *nr)
|
||||
{
|
||||
struct page *head, *page, *tail;
|
||||
struct page *head, *page;
|
||||
int refs;
|
||||
|
||||
if (write && !pmd_write(orig))
|
||||
@@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||
refs = 0;
|
||||
head = pmd_page(orig);
|
||||
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
tail = page;
|
||||
do {
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
pages[*nr] = page;
|
||||
@@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Any tail pages need their mapcount reference taken before we
|
||||
* return. (This allows the THP code to bump their ref count when
|
||||
* they are split into base pages).
|
||||
*/
|
||||
while (refs--) {
|
||||
if (PageTail(tail))
|
||||
get_huge_page_tail(tail);
|
||||
tail++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||
unsigned long end, int write, struct page **pages, int *nr)
|
||||
{
|
||||
struct page *head, *page, *tail;
|
||||
struct page *head, *page;
|
||||
int refs;
|
||||
|
||||
if (write && !pud_write(orig))
|
||||
@@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||
refs = 0;
|
||||
head = pud_page(orig);
|
||||
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
tail = page;
|
||||
do {
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
pages[*nr] = page;
|
||||
@@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (refs--) {
|
||||
if (PageTail(tail))
|
||||
get_huge_page_tail(tail);
|
||||
tail++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
|
||||
struct page **pages, int *nr)
|
||||
{
|
||||
int refs;
|
||||
struct page *head, *page, *tail;
|
||||
struct page *head, *page;
|
||||
|
||||
if (write && !pgd_write(orig))
|
||||
return 0;
|
||||
@@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
|
||||
refs = 0;
|
||||
head = pgd_page(orig);
|
||||
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
|
||||
tail = page;
|
||||
do {
|
||||
VM_BUG_ON_PAGE(compound_head(page) != head, page);
|
||||
pages[*nr] = page;
|
||||
@@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (refs--) {
|
||||
if (PageTail(tail))
|
||||
get_huge_page_tail(tail);
|
||||
tail++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
|
||||
pmd_t pmd = READ_ONCE(*pmdp);
|
||||
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
|
||||
if (pmd_none(pmd))
|
||||
return 0;
|
||||
|
||||
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
|
||||
|
mm/huge_memory.c | 1683
(diff not shown because the file change is too large)
mm/hugetlb.c | 51
@@ -4,7 +4,6 @@
|
||||
*/
|
||||
#include <linux/list.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/sysctl.h>
|
||||
@@ -1268,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
|
||||
|
||||
/* we rely on prep_new_huge_page to set the destructor */
|
||||
set_compound_order(page, order);
|
||||
__SetPageHead(page);
|
||||
__ClearPageReserved(page);
|
||||
__SetPageHead(page);
|
||||
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
|
||||
/*
|
||||
* For gigantic hugepages allocated through bootmem at
|
||||
@@ -2549,25 +2548,6 @@ static void hugetlb_unregister_node(struct node *node)
|
||||
nhs->hugepages_kobj = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* hugetlb module exit: unregister hstate attributes from node devices
|
||||
* that have them.
|
||||
*/
|
||||
static void hugetlb_unregister_all_nodes(void)
|
||||
{
|
||||
int nid;
|
||||
|
||||
/*
|
||||
* disable node device registrations.
|
||||
*/
|
||||
register_hugetlbfs_with_node(NULL, NULL);
|
||||
|
||||
/*
|
||||
* remove hstate attributes from any nodes that have them.
|
||||
*/
|
||||
for (nid = 0; nid < nr_node_ids; nid++)
|
||||
hugetlb_unregister_node(node_devices[nid]);
|
||||
}
|
||||
|
||||
/*
|
||||
* Register hstate attributes for a single node device.
|
||||
@@ -2632,27 +2612,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void hugetlb_unregister_all_nodes(void) { }
|
||||
|
||||
static void hugetlb_register_all_nodes(void) { }
|
||||
|
||||
#endif
|
||||
|
||||
static void __exit hugetlb_exit(void)
|
||||
{
|
||||
struct hstate *h;
|
||||
|
||||
hugetlb_unregister_all_nodes();
|
||||
|
||||
for_each_hstate(h) {
|
||||
kobject_put(hstate_kobjs[hstate_index(h)]);
|
||||
}
|
||||
|
||||
kobject_put(hugepages_kobj);
|
||||
kfree(hugetlb_fault_mutex_table);
|
||||
}
|
||||
module_exit(hugetlb_exit);
|
||||
|
||||
static int __init hugetlb_init(void)
|
||||
{
|
||||
int i;
|
||||
@@ -2690,7 +2653,7 @@ static int __init hugetlb_init(void)
|
||||
mutex_init(&hugetlb_fault_mutex_table[i]);
|
||||
return 0;
|
||||
}
|
||||
module_init(hugetlb_init);
|
||||
subsys_initcall(hugetlb_init);
|
||||
|
||||
/* Should be called on processing a hugepagesz=... option */
|
||||
void __init hugetlb_add_hstate(unsigned int order)
|
||||
@@ -3139,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
|
||||
entry = huge_ptep_get(src_pte);
|
||||
ptepage = pte_page(entry);
|
||||
get_page(ptepage);
|
||||
page_dup_rmap(ptepage);
|
||||
page_dup_rmap(ptepage, true);
|
||||
set_huge_pte_at(dst, addr, dst_pte, entry);
|
||||
hugetlb_count_add(pages_per_huge_page(h), dst);
|
||||
}
|
||||
@@ -3223,7 +3186,7 @@ again:
|
||||
set_page_dirty(page);
|
||||
|
||||
hugetlb_count_sub(pages_per_huge_page(h), mm);
|
||||
page_remove_rmap(page);
|
||||
page_remove_rmap(page, true);
|
||||
force_flush = !__tlb_remove_page(tlb, page);
|
||||
if (force_flush) {
|
||||
address += sz;
|
||||
@@ -3452,7 +3415,7 @@ retry_avoidcopy:
|
||||
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
|
||||
set_huge_pte_at(mm, address, ptep,
|
||||
make_huge_pte(vma, new_page, 1));
|
||||
page_remove_rmap(old_page);
|
||||
page_remove_rmap(old_page, true);
|
||||
hugepage_add_new_anon_rmap(new_page, vma, address);
|
||||
/* Make the old page be freed below */
|
||||
new_page = old_page;
|
||||
@@ -3622,7 +3585,7 @@ retry:
|
||||
ClearPagePrivate(page);
|
||||
hugepage_add_new_anon_rmap(page, vma, address);
|
||||
} else
|
||||
page_dup_rmap(page);
|
||||
page_dup_rmap(page, true);
|
||||
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
|
||||
&& (vma->vm_flags & VM_SHARED)));
|
||||
set_huge_pte_at(mm, address, ptep, new_pte);
|
||||
@@ -3902,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
same_page:
|
||||
if (pages) {
|
||||
pages[i] = mem_map_offset(page, pfn_offset);
|
||||
get_page_foll(pages[i]);
|
||||
get_page(pages[i]);
|
||||
}
|
||||
|
||||
if (vmas)
|
||||
|
@@ -13,6 +13,7 @@
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
/*
|
||||
* The set of flags that only affect watermark checking and reclaim
|
||||
@@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page)
|
||||
set_page_count(page, 1);
|
||||
}
|
||||
|
||||
static inline void __get_page_tail_foll(struct page *page,
|
||||
bool get_page_head)
|
||||
{
|
||||
/*
|
||||
* If we're getting a tail page, the elevated page->_count is
|
||||
* required only in the head page and we will elevate the head
|
||||
* page->_count and tail page->_mapcount.
|
||||
*
|
||||
* We elevate page_tail->_mapcount for tail pages to force
|
||||
* page_tail->_count to be zero at all times to avoid getting
|
||||
* false positives from get_page_unless_zero() with
|
||||
* speculative page access (like in
|
||||
* page_cache_get_speculative()) on tail pages.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
|
||||
if (get_page_head)
|
||||
atomic_inc(&compound_head(page)->_count);
|
||||
get_huge_page_tail(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is meant to be called as the FOLL_GET operation of
|
||||
* follow_page() and it must be called while holding the proper PT
|
||||
* lock while the pte (or pmd_trans_huge) is still mapping the page.
|
||||
*/
|
||||
static inline void get_page_foll(struct page *page)
|
||||
{
|
||||
if (unlikely(PageTail(page)))
|
||||
/*
|
||||
* This is safe only because
|
||||
* __split_huge_page_refcount() can't run under
|
||||
* get_page_foll() because we hold the proper PT lock.
|
||||
*/
|
||||
__get_page_tail_foll(page, true);
|
||||
else {
|
||||
/*
|
||||
* Getting a normal page or the head of a compound page
|
||||
* requires to already have an elevated page->_count.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
|
||||
atomic_inc(&page->_count);
|
||||
}
|
||||
}
|
||||
|
||||
extern unsigned long highest_memmap_pfn;
|
||||
|
||||
/*
|
||||
@@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
|
||||
|
||||
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
extern unsigned long vma_address(struct page *page,
|
||||
struct vm_area_struct *vma);
|
||||
#endif
|
||||
/*
|
||||
* At what user virtual address is page expected in @vma?
|
||||
*/
|
||||
static inline unsigned long
|
||||
__vma_address(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
pgoff_t pgoff = page_to_pgoff(page);
|
||||
return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
static inline unsigned long
|
||||
vma_address(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long address = __vma_address(page, vma);
|
||||
|
||||
/* page should be within @vma mapping range */
|
||||
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#else /* !CONFIG_MMU */
|
||||
static inline void clear_page_mlock(struct page *page) { }
|
||||
static inline void mlock_vma_page(struct page *page) { }
|
||||
|
@@ -1,4 +1,5 @@
|
||||
KASAN_SANITIZE := n
|
||||
UBSAN_SANITIZE_kasan.o := n
|
||||
|
||||
CFLAGS_REMOVE_kasan.o = -pg
|
||||
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
|
||||
|
@@ -122,8 +122,7 @@
|
||||
#define BYTES_PER_POINTER sizeof(void *)
|
||||
|
||||
/* GFP bitmask for kmemleak internal allocations */
|
||||
#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
|
||||
__GFP_NOACCOUNT)) | \
|
||||
#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
|
||||
__GFP_NORETRY | __GFP_NOMEMALLOC | \
|
||||
__GFP_NOWARN)
|
||||
|
||||
|
mm/ksm.c | 89
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
|
||||
up_read(&mm->mmap_sem);
|
||||
}
|
||||
|
||||
static struct page *page_trans_compound_anon(struct page *page)
|
||||
{
|
||||
if (PageTransCompound(page)) {
|
||||
struct page *head = compound_head(page);
|
||||
/*
|
||||
* head may actually be splitted and freed from under
|
||||
* us but it's ok here.
|
||||
*/
|
||||
if (PageAnon(head))
|
||||
return head;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
|
||||
{
|
||||
struct mm_struct *mm = rmap_item->mm;
|
||||
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
|
||||
page = follow_page(vma, addr, FOLL_GET);
|
||||
if (IS_ERR_OR_NULL(page))
|
||||
goto out;
|
||||
if (PageAnon(page) || page_trans_compound_anon(page)) {
|
||||
if (PageAnon(page)) {
|
||||
flush_anon_page(vma, page, addr);
|
||||
flush_dcache_page(page);
|
||||
} else {
|
||||
@@ -740,8 +726,7 @@ static int remove_stable_node(struct stable_node *stable_node)
|
||||
|
||||
static int remove_all_stable_nodes(void)
|
||||
{
|
||||
struct stable_node *stable_node;
|
||||
struct list_head *this, *next;
|
||||
struct stable_node *stable_node, *next;
|
||||
int nid;
|
||||
int err = 0;
|
||||
|
||||
@@ -756,8 +741,7 @@ static int remove_all_stable_nodes(void)
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
list_for_each_safe(this, next, &migrate_nodes) {
|
||||
stable_node = list_entry(this, struct stable_node, list);
|
||||
list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
|
||||
if (remove_stable_node(stable_node))
|
||||
err = -EBUSY;
|
||||
cond_resched();
|
||||
@@ -958,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
|
||||
}
|
||||
|
||||
get_page(kpage);
|
||||
page_add_anon_rmap(kpage, vma, addr);
|
||||
page_add_anon_rmap(kpage, vma, addr, false);
|
||||
|
||||
flush_cache_page(vma, addr, pte_pfn(*ptep));
|
||||
ptep_clear_flush_notify(vma, addr, ptep);
|
||||
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
|
||||
|
||||
page_remove_rmap(page);
|
||||
page_remove_rmap(page, false);
|
||||
if (!page_mapped(page))
|
||||
try_to_free_swap(page);
|
||||
put_page(page);
|
||||
@@ -977,33 +961,6 @@ out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int page_trans_compound_anon_split(struct page *page)
|
||||
{
|
||||
int ret = 0;
|
||||
struct page *transhuge_head = page_trans_compound_anon(page);
|
||||
if (transhuge_head) {
|
||||
/* Get the reference on the head to split it. */
|
||||
if (get_page_unless_zero(transhuge_head)) {
|
||||
/*
|
||||
* Recheck we got the reference while the head
|
||||
* was still anonymous.
|
||||
*/
|
||||
if (PageAnon(transhuge_head))
|
||||
ret = split_huge_page(transhuge_head);
|
||||
else
|
||||
/*
|
||||
* Retry later if split_huge_page run
|
||||
* from under us.
|
||||
*/
|
||||
ret = 1;
|
||||
put_page(transhuge_head);
|
||||
} else
|
||||
/* Retry later if split_huge_page run from under us. */
|
||||
ret = 1;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* try_to_merge_one_page - take two pages and merge them into one
|
||||
* @vma: the vma that holds the pte pointing to page
|
||||
@@ -1022,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
|
||||
if (page == kpage) /* ksm page forked */
|
||||
return 0;
|
||||
|
||||
if (PageTransCompound(page) && page_trans_compound_anon_split(page))
|
||||
goto out;
|
||||
BUG_ON(PageTransCompound(page));
|
||||
if (!PageAnon(page))
|
||||
goto out;
|
||||
|
||||
@@ -1037,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
|
||||
*/
|
||||
if (!trylock_page(page))
|
||||
goto out;
|
||||
|
||||
if (PageTransCompound(page)) {
|
||||
err = split_huge_page(page);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this anonymous page is mapped only here, its pte may need
|
||||
* to be write-protected. If it's mapped elsewhere, all of its
|
||||
@@ -1052,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
|
||||
*/
|
||||
set_page_stable_node(page, NULL);
|
||||
mark_page_accessed(page);
|
||||
/*
|
||||
* Page reclaim just frees a clean page with no dirty
|
||||
* ptes: make sure that the ksm page would be swapped.
|
||||
*/
|
||||
if (!PageDirty(page))
|
||||
SetPageDirty(page);
|
||||
err = 0;
|
||||
} else if (pages_identical(page, kpage))
|
||||
err = replace_page(vma, page, kpage, orig_pte);
|
||||
@@ -1067,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
|
||||
}
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
unlock_page(page);
|
||||
out:
|
||||
return err;
|
||||
@@ -1583,13 +1551,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
|
||||
* so prune them once before each full scan.
|
||||
*/
|
||||
if (!ksm_merge_across_nodes) {
|
||||
struct stable_node *stable_node;
|
||||
struct list_head *this, *next;
|
||||
struct stable_node *stable_node, *next;
|
||||
struct page *page;
|
||||
|
||||
list_for_each_safe(this, next, &migrate_nodes) {
|
||||
stable_node = list_entry(this,
|
||||
struct stable_node, list);
|
||||
list_for_each_entry_safe(stable_node, next,
|
||||
&migrate_nodes, list) {
|
||||
page = get_ksm_page(stable_node, false);
|
||||
if (page)
|
||||
put_page(page);
|
||||
@@ -1639,8 +1605,7 @@ next_mm:
|
||||
cond_resched();
|
||||
continue;
|
||||
}
|
||||
if (PageAnon(*page) ||
|
||||
page_trans_compound_anon(*page)) {
|
||||
if (PageAnon(*page)) {
|
||||
flush_anon_page(vma, *page, ksm_scan.address);
|
||||
flush_dcache_page(*page);
|
||||
rmap_item = get_next_rmap_item(slot,
|
||||
@@ -1903,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
|
||||
|
||||
SetPageDirty(new_page);
|
||||
__SetPageUptodate(new_page);
|
||||
__set_page_locked(new_page);
|
||||
__SetPageLocked(new_page);
|
||||
}
|
||||
|
||||
return new_page;
|
||||
@@ -2012,8 +1977,7 @@ static void wait_while_offlining(void)
|
||||
static void ksm_check_stable_tree(unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
struct stable_node *stable_node;
|
||||
struct list_head *this, *next;
|
||||
struct stable_node *stable_node, *next;
|
||||
struct rb_node *node;
|
||||
int nid;
|
||||
|
||||
@@ -2034,8 +1998,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
list_for_each_safe(this, next, &migrate_nodes) {
|
||||
stable_node = list_entry(this, struct stable_node, list);
|
||||
list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
|
||||
if (stable_node->kpfn >= start_pfn &&
|
||||
stable_node->kpfn < end_pfn)
|
||||
remove_node_from_stable_tree(stable_node);
|
||||
|
@@ -12,7 +12,7 @@
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/memcontrol.h>
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
||||
static LIST_HEAD(list_lrus);
|
||||
static DEFINE_MUTEX(list_lrus_mutex);
|
||||
|
||||
@@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru)
|
||||
static void list_lru_unregister(struct list_lru *lru)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
||||
static inline bool list_lru_memcg_aware(struct list_lru *lru)
|
||||
{
|
||||
/*
|
||||
@@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
|
||||
{
|
||||
return &nlru->lru;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
||||
|
||||
bool list_lru_add(struct list_lru *lru, struct list_head *item)
|
||||
{
|
||||
@@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l)
|
||||
l->nr_items = 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
||||
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
|
||||
int begin, int end)
|
||||
{
|
||||
@@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
|
||||
static void memcg_destroy_list_lru(struct list_lru *lru)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
||||
|
||||
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
|
||||
struct lock_class_key *key)
|
||||
|
mm/madvise.c | 201
@@ -20,6 +20,9 @@
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
|
||||
#include <asm/tlb.h>
|
||||
|
||||
/*
|
||||
* Any behaviour which results in changes to the vma->vm_flags needs to
|
||||
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
|
||||
case MADV_REMOVE:
|
||||
case MADV_WILLNEED:
|
||||
case MADV_DONTNEED:
|
||||
case MADV_FREE:
|
||||
return 0;
|
||||
default:
|
||||
/* be safe, default to 1. list exceptions explicitly */
|
||||
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
|
||||
unsigned long end, struct mm_walk *walk)
|
||||
|
||||
{
|
||||
struct mmu_gather *tlb = walk->private;
|
||||
struct mm_struct *mm = tlb->mm;
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
spinlock_t *ptl;
|
||||
pte_t *orig_pte, *pte, ptent;
|
||||
struct page *page;
|
||||
int nr_swap = 0;
|
||||
unsigned long next;
|
||||
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_trans_huge(*pmd))
|
||||
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
|
||||
goto next;
|
||||
|
||||
if (pmd_trans_unstable(pmd))
|
||||
return 0;
|
||||
|
||||
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
for (; addr != end; pte++, addr += PAGE_SIZE) {
|
||||
ptent = *pte;
|
||||
|
||||
if (pte_none(ptent))
|
||||
continue;
|
||||
/*
|
||||
* If the pte has swp_entry, just clear page table to
|
||||
* prevent swap-in which is more expensive rather than
|
||||
* (page allocation + zeroing).
|
||||
*/
|
||||
if (!pte_present(ptent)) {
|
||||
swp_entry_t entry;
|
||||
|
||||
entry = pte_to_swp_entry(ptent);
|
||||
if (non_swap_entry(entry))
|
||||
continue;
|
||||
nr_swap--;
|
||||
free_swap_and_cache(entry);
|
||||
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
|
||||
continue;
|
||||
}
|
||||
|
||||
page = vm_normal_page(vma, addr, ptent);
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If pmd isn't transhuge but the page is THP and
|
||||
* is owned by only this process, split it and
|
||||
* deactivate all pages.
|
||||
*/
|
||||
if (PageTransCompound(page)) {
|
||||
if (page_mapcount(page) != 1)
|
||||
goto out;
|
||||
get_page(page);
|
||||
if (!trylock_page(page)) {
|
||||
put_page(page);
|
||||
goto out;
|
||||
}
|
||||
pte_unmap_unlock(orig_pte, ptl);
|
||||
if (split_huge_page(page)) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
goto out;
|
||||
}
|
||||
put_page(page);
|
||||
unlock_page(page);
|
||||
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
pte--;
|
||||
addr -= PAGE_SIZE;
|
||||
continue;
|
||||
}
|
||||
|
||||
VM_BUG_ON_PAGE(PageTransCompound(page), page);
|
||||
|
||||
if (PageSwapCache(page) || PageDirty(page)) {
|
||||
if (!trylock_page(page))
|
||||
continue;
|
||||
/*
|
||||
* If page is shared with others, we couldn't clear
|
||||
* PG_dirty of the page.
|
||||
*/
|
||||
if (page_mapcount(page) != 1) {
|
||||
unlock_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (PageSwapCache(page) && !try_to_free_swap(page)) {
|
||||
unlock_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
ClearPageDirty(page);
|
||||
unlock_page(page);
|
||||
}
|
||||
|
||||
if (pte_young(ptent) || pte_dirty(ptent)) {
|
||||
/*
|
||||
* Some of architecture(ex, PPC) don't update TLB
|
||||
* with set_pte_at and tlb_remove_tlb_entry so for
|
||||
* the portability, remap the pte with old|clean
|
||||
* after pte clearing.
|
||||
*/
|
||||
ptent = ptep_get_and_clear_full(mm, addr, pte,
|
||||
tlb->fullmm);
|
||||
|
||||
ptent = pte_mkold(ptent);
|
||||
ptent = pte_mkclean(ptent);
|
||||
set_pte_at(mm, addr, pte, ptent);
|
||||
if (PageActive(page))
|
||||
deactivate_page(page);
|
||||
tlb_remove_tlb_entry(tlb, pte, addr);
|
||||
}
|
||||
}
|
||||
out:
|
||||
if (nr_swap) {
|
||||
if (current->mm == mm)
|
||||
sync_mm_rss(mm);
|
||||
|
||||
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
|
||||
}
|
||||
arch_leave_lazy_mmu_mode();
|
||||
pte_unmap_unlock(orig_pte, ptl);
|
||||
cond_resched();
|
||||
next:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void madvise_free_page_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end)
|
||||
{
|
||||
struct mm_walk free_walk = {
|
||||
.pmd_entry = madvise_free_pte_range,
|
||||
.mm = vma->vm_mm,
|
||||
.private = tlb,
|
||||
};
|
||||
|
||||
tlb_start_vma(tlb, vma);
|
||||
walk_page_range(addr, end, &free_walk);
|
||||
tlb_end_vma(tlb, vma);
|
||||
}
|
||||
|
||||
static int madvise_free_single_vma(struct vm_area_struct *vma,
|
||||
unsigned long start_addr, unsigned long end_addr)
|
||||
{
|
||||
unsigned long start, end;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
|
||||
return -EINVAL;
|
||||
|
||||
/* MADV_FREE works for only anon vma at the moment */
|
||||
if (!vma_is_anonymous(vma))
|
||||
return -EINVAL;
|
||||
|
||||
start = max(vma->vm_start, start_addr);
|
||||
if (start >= vma->vm_end)
|
||||
return -EINVAL;
|
||||
end = min(vma->vm_end, end_addr);
|
||||
if (end <= vma->vm_start)
|
||||
return -EINVAL;
|
||||
|
||||
lru_add_drain();
|
||||
tlb_gather_mmu(&tlb, mm, start, end);
|
||||
update_hiwater_rss(mm);
|
||||
|
||||
mmu_notifier_invalidate_range_start(mm, start, end);
|
||||
madvise_free_page_range(&tlb, vma, start, end);
|
||||
mmu_notifier_invalidate_range_end(mm, start, end);
|
||||
tlb_finish_mmu(&tlb, start, end);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long madvise_free(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
*prev = vma;
|
||||
return madvise_free_single_vma(vma, start, end);
|
||||
}
|
||||
|
||||
/*
|
||||
* Application no longer needs these pages. If the pages are dirty,
|
||||
* it's OK to just throw them away. The app will be more careful about
|
||||
@@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
return madvise_remove(vma, prev, start, end);
|
||||
case MADV_WILLNEED:
|
||||
return madvise_willneed(vma, prev, start, end);
|
||||
case MADV_FREE:
|
||||
/*
|
||||
* XXX: In this implementation, MADV_FREE works like
|
||||
* MADV_DONTNEED on swapless system or full swap.
|
||||
*/
|
||||
if (get_nr_swap_pages() > 0)
|
||||
return madvise_free(vma, prev, start, end);
|
||||
/* passthrough */
|
||||
case MADV_DONTNEED:
|
||||
return madvise_dontneed(vma, prev, start, end);
|
||||
default:
|
||||
@@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior)
|
||||
case MADV_REMOVE:
|
||||
case MADV_WILLNEED:
|
||||
case MADV_DONTNEED:
|
||||
case MADV_FREE:
|
||||
#ifdef CONFIG_KSM
|
||||
case MADV_MERGEABLE:
|
||||
case MADV_UNMERGEABLE:
|
||||
|
@@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
||||
{
|
||||
unsigned long i;
|
||||
|
||||
for (i = 0; i < type->cnt; i++) {
|
||||
phys_addr_t rgnbase = type->regions[i].base;
|
||||
phys_addr_t rgnsize = type->regions[i].size;
|
||||
if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
|
||||
for (i = 0; i < type->cnt; i++)
|
||||
if (memblock_addrs_overlap(base, size, type->regions[i].base,
|
||||
type->regions[i].size))
|
||||
break;
|
||||
}
|
||||
|
||||
return i < type->cnt;
|
||||
}
|
||||
|
||||
@@ -528,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type,
|
||||
bool insert = false;
|
||||
phys_addr_t obase = base;
|
||||
phys_addr_t end = base + memblock_cap_size(base, &size);
|
||||
int i, nr_new;
|
||||
int idx, nr_new;
|
||||
struct memblock_region *rgn;
|
||||
|
||||
if (!size)
|
||||
return 0;
|
||||
@@ -552,8 +550,7 @@ repeat:
|
||||
base = obase;
|
||||
nr_new = 0;
|
||||
|
||||
for (i = 0; i < type->cnt; i++) {
|
||||
struct memblock_region *rgn = &type->regions[i];
|
||||
for_each_memblock_type(type, rgn) {
|
||||
phys_addr_t rbase = rgn->base;
|
||||
phys_addr_t rend = rbase + rgn->size;
|
||||
|
||||
@@ -572,7 +569,7 @@ repeat:
|
||||
WARN_ON(flags != rgn->flags);
|
||||
nr_new++;
|
||||
if (insert)
|
||||
memblock_insert_region(type, i++, base,
|
||||
memblock_insert_region(type, idx++, base,
|
||||
rbase - base, nid,
|
||||
flags);
|
||||
}
|
||||
@@ -584,7 +581,7 @@ repeat:
|
||||
if (base < end) {
|
||||
nr_new++;
|
||||
if (insert)
|
||||
memblock_insert_region(type, i, base, end - base,
|
||||
memblock_insert_region(type, idx, base, end - base,
|
||||
nid, flags);
|
||||
}
|
||||
|
||||
@@ -651,7 +648,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
|
||||
int *start_rgn, int *end_rgn)
|
||||
{
|
||||
phys_addr_t end = base + memblock_cap_size(base, &size);
|
||||
int i;
|
||||
int idx;
|
||||
struct memblock_region *rgn;
|
||||
|
||||
*start_rgn = *end_rgn = 0;
|
||||
|
||||
@@ -663,8 +661,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
|
||||
if (memblock_double_array(type, base, size) < 0)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < type->cnt; i++) {
|
||||
struct memblock_region *rgn = &type->regions[i];
|
||||
for_each_memblock_type(type, rgn) {
|
||||
phys_addr_t rbase = rgn->base;
|
||||
phys_addr_t rend = rbase + rgn->size;
|
||||
|
||||
@@ -681,7 +678,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
|
||||
rgn->base = base;
|
||||
rgn->size -= base - rbase;
|
||||
type->total_size -= base - rbase;
|
||||
memblock_insert_region(type, i, rbase, base - rbase,
|
||||
memblock_insert_region(type, idx, rbase, base - rbase,
|
||||
memblock_get_region_node(rgn),
|
||||
rgn->flags);
|
||||
} else if (rend > end) {
|
||||
@@ -692,14 +689,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
|
||||
rgn->base = end;
|
||||
rgn->size -= end - rbase;
|
||||
type->total_size -= end - rbase;
|
||||
memblock_insert_region(type, i--, rbase, end - rbase,
|
||||
memblock_insert_region(type, idx--, rbase, end - rbase,
|
||||
memblock_get_region_node(rgn),
|
||||
rgn->flags);
|
||||
} else {
|
||||
/* @rgn is fully contained, record it */
|
||||
if (!*end_rgn)
|
||||
*start_rgn = i;
|
||||
*end_rgn = i + 1;
|
||||
*start_rgn = idx;
|
||||
*end_rgn = idx + 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -822,6 +819,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
|
||||
return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
|
||||
}
|
||||
|
||||
/**
|
||||
* memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP.
|
||||
* @base: the base phys addr of the region
|
||||
* @size: the size of the region
|
||||
*
|
||||
* Return 0 on success, -errno on failure.
|
||||
*/
|
||||
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
|
||||
{
|
||||
return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
|
||||
}
|
||||
|
||||
/**
|
||||
* __next_reserved_mem_region - next function for for_each_reserved_region()
|
||||
@@ -913,6 +921,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
|
||||
if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
|
||||
continue;
|
||||
|
||||
/* skip nomap memory unless we were asked for it explicitly */
|
||||
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
|
||||
continue;
|
||||
|
||||
if (!type_b) {
|
||||
if (out_start)
|
||||
*out_start = m_start;
|
||||
@@ -1022,6 +1034,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
|
||||
if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
|
||||
continue;
|
||||
|
||||
/* skip nomap memory unless we were asked for it explicitly */
|
||||
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
|
||||
continue;
|
||||
|
||||
if (!type_b) {
|
||||
if (out_start)
|
||||
*out_start = m_start;
|
||||
@@ -1509,16 +1525,25 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
|
||||
return -1;
|
||||
}
|
||||
|
||||
int __init memblock_is_reserved(phys_addr_t addr)
|
||||
bool __init memblock_is_reserved(phys_addr_t addr)
|
||||
{
|
||||
return memblock_search(&memblock.reserved, addr) != -1;
|
||||
}
|
||||
|
||||
int __init_memblock memblock_is_memory(phys_addr_t addr)
|
||||
bool __init_memblock memblock_is_memory(phys_addr_t addr)
|
||||
{
|
||||
return memblock_search(&memblock.memory, addr) != -1;
|
||||
}
|
||||
|
||||
int __init_memblock memblock_is_map_memory(phys_addr_t addr)
|
||||
{
|
||||
int i = memblock_search(&memblock.memory, addr);
|
||||
|
||||
if (i == -1)
|
||||
return false;
|
||||
return !memblock_is_nomap(&memblock.memory.regions[i]);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
|
||||
unsigned long *start_pfn, unsigned long *end_pfn)
|
||||
@@ -1613,12 +1638,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
|
||||
{
|
||||
unsigned long long base, size;
|
||||
unsigned long flags;
|
||||
int i;
|
||||
int idx;
|
||||
struct memblock_region *rgn;
|
||||
|
||||
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
|
||||
|
||||
for (i = 0; i < type->cnt; i++) {
|
||||
struct memblock_region *rgn = &type->regions[i];
|
||||
for_each_memblock_type(type, rgn) {
|
||||
char nid_buf[32] = "";
|
||||
|
||||
base = rgn->base;
|
||||
@@ -1630,7 +1655,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
|
||||
memblock_get_region_node(rgn));
|
||||
#endif
|
||||
pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
|
||||
name, i, base, base + size - 1, size, nid_buf, flags);
|
||||
name, idx, base, base + size - 1, size, nid_buf, flags);
|
||||
}
|
||||
}
|
||||
|
||||
|
1218
mm/memcontrol.c
1218
mm/memcontrol.c
檔案差異因為檔案過大而無法顯示
載入差異
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
|
||||
if (PageHuge(head))
|
||||
return get_page_unless_zero(head);
|
||||
|
||||
/*
|
||||
* Thp tail page has special refcounting rule (refcount of tail pages
|
||||
* is stored in ->_mapcount,) so we can't call get_page_unless_zero()
|
||||
* directly for tail pages.
|
||||
*/
|
||||
if (PageTransHuge(head)) {
|
||||
if (!PageHuge(head) && PageTransHuge(head)) {
|
||||
/*
|
||||
* Non anonymous thp exists only in allocation/free time. We
|
||||
* can't handle such a case correctly, so let's give it up.
|
||||
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
|
||||
page_to_pfn(page));
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (get_page_unless_zero(head)) {
|
||||
if (PageTail(page))
|
||||
get_page(page);
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return get_page_unless_zero(page);
|
||||
return get_page_unless_zero(head);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_hwpoison_page);
|
||||
|
||||
/**
|
||||
* put_hwpoison_page() - Put refcount for memory error handling:
|
||||
* @page: raw error page (hit by memory error)
|
||||
*/
|
||||
void put_hwpoison_page(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
|
||||
if (PageHuge(head)) {
|
||||
put_page(head);
|
||||
return;
|
||||
}
|
||||
|
||||
if (PageTransHuge(head))
|
||||
if (page != head)
|
||||
put_page(head);
|
||||
|
||||
put_page(page);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(put_hwpoison_page);
|
||||
|
||||
/*
|
||||
* Do all that is necessary to remove user space mappings. Unmap
|
||||
* the pages and send SIGBUS to the processes if the data was dirty.
|
||||
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
}
|
||||
|
||||
if (!PageHuge(p) && PageTransHuge(hpage)) {
|
||||
lock_page(hpage);
|
||||
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
|
||||
unlock_page(hpage);
|
||||
if (!PageAnon(hpage))
|
||||
pr_err("MCE: %#lx: non anonymous thp\n", pfn);
|
||||
else
|
||||
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
put_hwpoison_page(p);
|
||||
return -EBUSY;
|
||||
}
|
||||
unlock_page(hpage);
|
||||
get_hwpoison_page(p);
|
||||
put_hwpoison_page(hpage);
|
||||
VM_BUG_ON_PAGE(!page_count(p), p);
|
||||
hpage = compound_head(p);
|
||||
}
|
||||
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
/*
|
||||
* We ignore non-LRU pages for good reasons.
|
||||
* - PG_locked is only well defined for LRU pages and a few others
|
||||
* - to avoid races with __set_page_locked()
|
||||
* - to avoid races with __SetPageLocked()
|
||||
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
|
||||
* The check (unnecessarily) ignores LRU pages being isolated and
|
||||
* walked by the page reclaim code, however that's not a big loss.
|
||||
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
|
||||
* Did it turn free?
|
||||
*/
|
||||
ret = __get_any_page(page, pfn, 0);
|
||||
if (!PageLRU(page)) {
|
||||
if (ret == 1 && !PageLRU(page)) {
|
||||
/* Drop page reference which is from __get_any_page() */
|
||||
put_hwpoison_page(page);
|
||||
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||
@@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int soft_offline_in_use_page(struct page *page, int flags)
|
||||
{
|
||||
int ret;
|
||||
struct page *hpage = compound_head(page);
|
||||
|
||||
if (!PageHuge(page) && PageTransHuge(hpage)) {
|
||||
lock_page(hpage);
|
||||
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
|
||||
unlock_page(hpage);
|
||||
if (!PageAnon(hpage))
|
||||
pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
|
||||
else
|
||||
pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
|
||||
put_hwpoison_page(hpage);
|
||||
return -EBUSY;
|
||||
}
|
||||
unlock_page(hpage);
|
||||
get_hwpoison_page(page);
|
||||
put_hwpoison_page(hpage);
|
||||
}
|
||||
|
||||
if (PageHuge(page))
|
||||
ret = soft_offline_huge_page(page, flags);
|
||||
else
|
||||
ret = __soft_offline_page(page, flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void soft_offline_free_page(struct page *page)
|
||||
{
|
||||
if (PageHuge(page)) {
|
||||
struct page *hpage = compound_head(page);
|
||||
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
if (!dequeue_hwpoisoned_huge_page(hpage))
|
||||
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||
} else {
|
||||
if (!TestSetPageHWPoison(page))
|
||||
num_poisoned_pages_inc();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* soft_offline_page - Soft offline a page.
|
||||
* @page: page to offline
|
||||
@@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags)
|
||||
{
|
||||
int ret;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct page *hpage = compound_head(page);
|
||||
|
||||
if (PageHWPoison(page)) {
|
||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||
@@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags)
|
||||
put_hwpoison_page(page);
|
||||
return -EBUSY;
|
||||
}
|
||||
if (!PageHuge(page) && PageTransHuge(hpage)) {
|
||||
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
|
||||
pr_info("soft offline: %#lx: failed to split THP\n",
|
||||
pfn);
|
||||
if (flags & MF_COUNT_INCREASED)
|
||||
put_hwpoison_page(page);
|
||||
return -EBUSY;
|
||||
}
|
||||
}
|
||||
|
||||
get_online_mems();
|
||||
|
||||
ret = get_any_page(page, pfn, flags);
|
||||
put_online_mems();
|
||||
if (ret > 0) { /* for in-use pages */
|
||||
if (PageHuge(page))
|
||||
ret = soft_offline_huge_page(page, flags);
|
||||
else
|
||||
ret = __soft_offline_page(page, flags);
|
||||
} else if (ret == 0) { /* for free pages */
|
||||
if (PageHuge(page)) {
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
if (!dequeue_hwpoisoned_huge_page(hpage))
|
||||
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||
} else {
|
||||
if (!TestSetPageHWPoison(page))
|
||||
num_poisoned_pages_inc();
|
||||
}
|
||||
}
|
||||
|
||||
if (ret > 0)
|
||||
ret = soft_offline_in_use_page(page, flags);
|
||||
else if (ret == 0)
|
||||
soft_offline_free_page(page);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
151
mm/memory.c
151
mm/memory.c
@@ -50,6 +50,7 @@
|
||||
#include <linux/export.h>
|
||||
#include <linux/delayacct.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
pgtable_t new = pte_alloc_one(mm, address);
|
||||
int wait_split_huge_page;
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
|
||||
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
wait_split_huge_page = 0;
|
||||
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
|
||||
atomic_long_inc(&mm->nr_ptes);
|
||||
pmd_populate(mm, pmd, new);
|
||||
new = NULL;
|
||||
} else if (unlikely(pmd_trans_splitting(*pmd)))
|
||||
wait_split_huge_page = 1;
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
if (new)
|
||||
pte_free(mm, new);
|
||||
if (wait_split_huge_page)
|
||||
wait_split_huge_page(vma->anon_vma, pmd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
|
||||
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
|
||||
pmd_populate_kernel(&init_mm, pmd, new);
|
||||
new = NULL;
|
||||
} else
|
||||
VM_BUG_ON(pmd_trans_splitting(*pmd));
|
||||
}
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
if (new)
|
||||
pte_free_kernel(&init_mm, new);
|
||||
@@ -832,10 +827,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
} else if (is_migration_entry(entry)) {
|
||||
page = migration_entry_to_page(entry);
|
||||
|
||||
if (PageAnon(page))
|
||||
rss[MM_ANONPAGES]++;
|
||||
else
|
||||
rss[MM_FILEPAGES]++;
|
||||
rss[mm_counter(page)]++;
|
||||
|
||||
if (is_write_migration_entry(entry) &&
|
||||
is_cow_mapping(vm_flags)) {
|
||||
@@ -873,11 +865,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
page = vm_normal_page(vma, addr, pte);
|
||||
if (page) {
|
||||
get_page(page);
|
||||
page_dup_rmap(page);
|
||||
if (PageAnon(page))
|
||||
rss[MM_ANONPAGES]++;
|
||||
else
|
||||
rss[MM_FILEPAGES]++;
|
||||
page_dup_rmap(page, false);
|
||||
rss[mm_counter(page)]++;
|
||||
}
|
||||
|
||||
out_set_pte:
|
||||
@@ -961,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
|
||||
src_pmd = pmd_offset(src_pud, addr);
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_trans_huge(*src_pmd)) {
|
||||
if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
|
||||
int err;
|
||||
VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
|
||||
err = copy_huge_pmd(dst_mm, src_mm,
|
||||
@@ -1113,9 +1102,8 @@ again:
|
||||
tlb_remove_tlb_entry(tlb, pte, addr);
|
||||
if (unlikely(!page))
|
||||
continue;
|
||||
if (PageAnon(page))
|
||||
rss[MM_ANONPAGES]--;
|
||||
else {
|
||||
|
||||
if (!PageAnon(page)) {
|
||||
if (pte_dirty(ptent)) {
|
||||
force_flush = 1;
|
||||
set_page_dirty(page);
|
||||
@@ -1123,9 +1111,9 @@ again:
|
||||
if (pte_young(ptent) &&
|
||||
likely(!(vma->vm_flags & VM_SEQ_READ)))
|
||||
mark_page_accessed(page);
|
||||
rss[MM_FILEPAGES]--;
|
||||
}
|
||||
page_remove_rmap(page);
|
||||
rss[mm_counter(page)]--;
|
||||
page_remove_rmap(page, false);
|
||||
if (unlikely(page_mapcount(page) < 0))
|
||||
print_bad_pte(vma, addr, ptent, page);
|
||||
if (unlikely(!__tlb_remove_page(tlb, page))) {
|
||||
@@ -1146,11 +1134,7 @@ again:
|
||||
struct page *page;
|
||||
|
||||
page = migration_entry_to_page(entry);
|
||||
|
||||
if (PageAnon(page))
|
||||
rss[MM_ANONPAGES]--;
|
||||
else
|
||||
rss[MM_FILEPAGES]--;
|
||||
rss[mm_counter(page)]--;
|
||||
}
|
||||
if (unlikely(!free_swap_and_cache(entry)))
|
||||
print_bad_pte(vma, addr, ptent, NULL);
|
||||
@@ -1193,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||||
pmd = pmd_offset(pud, addr);
|
||||
do {
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
|
||||
if (next - addr != HPAGE_PMD_SIZE) {
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
|
||||
@@ -1204,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||||
BUG();
|
||||
}
|
||||
#endif
|
||||
split_huge_page_pmd(vma, addr, pmd);
|
||||
split_huge_pmd(vma, pmd, addr);
|
||||
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
|
||||
goto next;
|
||||
/* fall through */
|
||||
@@ -1460,7 +1444,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
|
||||
/* Ok, finally just insert the thing.. */
|
||||
get_page(page);
|
||||
inc_mm_counter_fast(mm, MM_FILEPAGES);
|
||||
inc_mm_counter_fast(mm, mm_counter_file(page));
|
||||
page_add_file_rmap(page);
|
||||
set_pte_at(mm, addr, pte, mk_pte(page, prot));
|
||||
|
||||
@@ -1517,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
EXPORT_SYMBOL(vm_insert_page);
|
||||
|
||||
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
|
||||
unsigned long pfn, pgprot_t prot)
|
||||
pfn_t pfn, pgprot_t prot)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
int retval;
|
||||
@@ -1533,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
|
||||
goto out_unlock;
|
||||
|
||||
/* Ok, finally just insert the thing.. */
|
||||
entry = pte_mkspecial(pfn_pte(pfn, prot));
|
||||
if (pfn_t_devmap(pfn))
|
||||
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
|
||||
else
|
||||
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
|
||||
set_pte_at(mm, addr, pte, entry);
|
||||
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
|
||||
|
||||
@@ -1601,17 +1588,17 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
|
||||
|
||||
if (addr < vma->vm_start || addr >= vma->vm_end)
|
||||
return -EFAULT;
|
||||
if (track_pfn_insert(vma, &pgprot, pfn))
|
||||
if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
|
||||
return -EINVAL;
|
||||
|
||||
ret = insert_pfn(vma, addr, pfn, pgprot);
|
||||
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(vm_insert_pfn_prot);
|
||||
|
||||
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
||||
unsigned long pfn)
|
||||
pfn_t pfn)
|
||||
{
|
||||
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
|
||||
|
||||
@@ -1625,10 +1612,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
||||
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
|
||||
* without pte special, it would there be refcounted as a normal page.
|
||||
*/
|
||||
if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
|
||||
if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
|
||||
struct page *page;
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
page = pfn_t_to_page(pfn);
|
||||
return insert_page(vma, addr, page, vma->vm_page_prot);
|
||||
}
|
||||
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
|
||||
@@ -1970,6 +1957,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
|
||||
copy_user_highpage(dst, src, va, vma);
|
||||
}
|
||||
|
||||
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
|
||||
{
|
||||
struct file *vm_file = vma->vm_file;
|
||||
|
||||
if (vm_file)
|
||||
return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
|
||||
|
||||
/*
|
||||
* Special mappings (e.g. VDSO) do not have any file so fake
|
||||
* a default GFP_KERNEL for them.
|
||||
*/
|
||||
return GFP_KERNEL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify the address space that the page is about to become writable so that
|
||||
* it can prohibit this or wait for the page to get into an appropriate state.
|
||||
@@ -1985,6 +1986,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
|
||||
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
|
||||
vmf.pgoff = page->index;
|
||||
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
|
||||
vmf.gfp_mask = __get_fault_gfp_mask(vma);
|
||||
vmf.page = page;
|
||||
vmf.cow_page = NULL;
|
||||
|
||||
@@ -2104,7 +2106,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
cow_user_page(new_page, old_page, address, vma);
|
||||
}
|
||||
|
||||
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
|
||||
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
|
||||
goto oom_free_new;
|
||||
|
||||
__SetPageUptodate(new_page);
|
||||
@@ -2118,7 +2120,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
if (likely(pte_same(*page_table, orig_pte))) {
|
||||
if (old_page) {
|
||||
if (!PageAnon(old_page)) {
|
||||
dec_mm_counter_fast(mm, MM_FILEPAGES);
|
||||
dec_mm_counter_fast(mm,
|
||||
mm_counter_file(old_page));
|
||||
inc_mm_counter_fast(mm, MM_ANONPAGES);
|
||||
}
|
||||
} else {
|
||||
@@ -2134,8 +2137,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* thread doing COW.
|
||||
*/
|
||||
ptep_clear_flush_notify(vma, address, page_table);
|
||||
page_add_new_anon_rmap(new_page, vma, address);
|
||||
mem_cgroup_commit_charge(new_page, memcg, false);
|
||||
page_add_new_anon_rmap(new_page, vma, address, false);
|
||||
mem_cgroup_commit_charge(new_page, memcg, false, false);
|
||||
lru_cache_add_active_or_unevictable(new_page, vma);
|
||||
/*
|
||||
* We call the notify macro here because, when using secondary
|
||||
@@ -2167,14 +2170,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* mapcount is visible. So transitively, TLBs to
|
||||
* old page will be flushed before it can be reused.
|
||||
*/
|
||||
page_remove_rmap(old_page);
|
||||
page_remove_rmap(old_page, false);
|
||||
}
|
||||
|
||||
/* Free the old page.. */
|
||||
new_page = old_page;
|
||||
page_copied = 1;
|
||||
} else {
|
||||
mem_cgroup_cancel_charge(new_page, memcg);
|
||||
mem_cgroup_cancel_charge(new_page, memcg, false);
|
||||
}
|
||||
|
||||
if (new_page)
|
||||
@@ -2189,7 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
*/
|
||||
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
|
||||
lock_page(old_page); /* LRU manipulation */
|
||||
munlock_vma_page(old_page);
|
||||
if (PageMlocked(old_page))
|
||||
munlock_vma_page(old_page);
|
||||
unlock_page(old_page);
|
||||
}
|
||||
page_cache_release(old_page);
|
||||
@@ -2549,7 +2553,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
goto out_page;
|
||||
}
|
||||
|
||||
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
|
||||
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
goto out_page;
|
||||
}
|
||||
@@ -2583,7 +2587,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
|
||||
flags &= ~FAULT_FLAG_WRITE;
|
||||
ret |= VM_FAULT_WRITE;
|
||||
exclusive = 1;
|
||||
exclusive = RMAP_EXCLUSIVE;
|
||||
}
|
||||
flush_icache_page(vma, page);
|
||||
if (pte_swp_soft_dirty(orig_pte))
|
||||
@@ -2591,15 +2595,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
set_pte_at(mm, address, page_table, pte);
|
||||
if (page == swapcache) {
|
||||
do_page_add_anon_rmap(page, vma, address, exclusive);
|
||||
mem_cgroup_commit_charge(page, memcg, true);
|
||||
mem_cgroup_commit_charge(page, memcg, true, false);
|
||||
} else { /* ksm created a completely new copy */
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
mem_cgroup_commit_charge(page, memcg, false);
|
||||
page_add_new_anon_rmap(page, vma, address, false);
|
||||
mem_cgroup_commit_charge(page, memcg, false, false);
|
||||
lru_cache_add_active_or_unevictable(page, vma);
|
||||
}
|
||||
|
||||
swap_free(entry);
|
||||
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
||||
if (mem_cgroup_swap_full(page) ||
|
||||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
||||
try_to_free_swap(page);
|
||||
unlock_page(page);
|
||||
if (page != swapcache) {
|
||||
@@ -2629,7 +2634,7 @@ unlock:
|
||||
out:
|
||||
return ret;
|
||||
out_nomap:
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
pte_unmap_unlock(page_table, ptl);
|
||||
out_page:
|
||||
unlock_page(page);
|
||||
@@ -2723,7 +2728,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
if (!page)
|
||||
goto oom;
|
||||
|
||||
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
|
||||
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
|
||||
goto oom_free_page;
|
||||
|
||||
/*
|
||||
@@ -2744,15 +2749,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
/* Deliver the page fault to userland, check inside PT lock */
|
||||
if (userfaultfd_missing(vma)) {
|
||||
pte_unmap_unlock(page_table, ptl);
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
page_cache_release(page);
|
||||
return handle_userfault(vma, address, flags,
|
||||
VM_UFFD_MISSING);
|
||||
}
|
||||
|
||||
inc_mm_counter_fast(mm, MM_ANONPAGES);
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
mem_cgroup_commit_charge(page, memcg, false);
|
||||
page_add_new_anon_rmap(page, vma, address, false);
|
||||
mem_cgroup_commit_charge(page, memcg, false, false);
|
||||
lru_cache_add_active_or_unevictable(page, vma);
|
||||
setpte:
|
||||
set_pte_at(mm, address, page_table, entry);
|
||||
@@ -2763,7 +2768,7 @@ unlock:
|
||||
pte_unmap_unlock(page_table, ptl);
|
||||
return 0;
|
||||
release:
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
page_cache_release(page);
|
||||
goto unlock;
|
||||
oom_free_page:
|
||||
@@ -2788,6 +2793,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
|
||||
vmf.pgoff = pgoff;
|
||||
vmf.flags = flags;
|
||||
vmf.page = NULL;
|
||||
vmf.gfp_mask = __get_fault_gfp_mask(vma);
|
||||
vmf.cow_page = cow_page;
|
||||
|
||||
ret = vma->vm_ops->fault(vma, &vmf);
|
||||
@@ -2839,9 +2845,9 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
|
||||
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
|
||||
if (anon) {
|
||||
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
page_add_new_anon_rmap(page, vma, address, false);
|
||||
} else {
|
||||
inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
|
||||
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
|
||||
page_add_file_rmap(page);
|
||||
}
|
||||
set_pte_at(vma->vm_mm, address, pte, entry);
|
||||
@@ -2954,6 +2960,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
|
||||
vmf.pgoff = pgoff;
|
||||
vmf.max_pgoff = max_pgoff;
|
||||
vmf.flags = flags;
|
||||
vmf.gfp_mask = __get_fault_gfp_mask(vma);
|
||||
vma->vm_ops->map_pages(vma, &vmf);
|
||||
}
|
||||
|
||||
@@ -3014,7 +3021,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
if (!new_page)
|
||||
return VM_FAULT_OOM;
|
||||
|
||||
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
|
||||
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
|
||||
page_cache_release(new_page);
|
||||
return VM_FAULT_OOM;
|
||||
}
|
||||
@@ -3043,7 +3050,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
goto uncharge_out;
|
||||
}
|
||||
do_set_pte(vma, address, new_page, pte, true, true);
|
||||
mem_cgroup_commit_charge(new_page, memcg, false);
|
||||
mem_cgroup_commit_charge(new_page, memcg, false, false);
|
||||
lru_cache_add_active_or_unevictable(new_page, vma);
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
if (fault_page) {
|
||||
@@ -3058,7 +3065,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
}
|
||||
return ret;
|
||||
uncharge_out:
|
||||
mem_cgroup_cancel_charge(new_page, memcg);
|
||||
mem_cgroup_cancel_charge(new_page, memcg, false);
|
||||
page_cache_release(new_page);
|
||||
return ret;
|
||||
}
|
||||
@@ -3110,7 +3117,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
|
||||
* release semantics to prevent the compiler from undoing this copying.
|
||||
*/
|
||||
mapping = fault_page->mapping;
|
||||
mapping = page_rmapping(fault_page);
|
||||
unlock_page(fault_page);
|
||||
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
|
||||
/*
|
||||
@@ -3212,6 +3219,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* TODO: handle PTE-mapped THP */
|
||||
if (PageCompound(page)) {
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
|
||||
* much anyway since they can be in shared cache state. This misses
|
||||
@@ -3384,17 +3397,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
int ret;
|
||||
|
||||
barrier();
|
||||
if (pmd_trans_huge(orig_pmd)) {
|
||||
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
|
||||
unsigned int dirty = flags & FAULT_FLAG_WRITE;
|
||||
|
||||
/*
|
||||
* If the pmd is splitting, return and retry the
|
||||
* the fault. Alternative: wait until the split
|
||||
* is done, and goto retry.
|
||||
*/
|
||||
if (pmd_trans_splitting(orig_pmd))
|
||||
return 0;
|
||||
|
||||
if (pmd_protnone(orig_pmd))
|
||||
return do_huge_pmd_numa_page(mm, vma, address,
|
||||
orig_pmd, pmd);
|
||||
@@ -3421,7 +3426,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unlikely(__pte_alloc(mm, vma, pmd, address)))
|
||||
return VM_FAULT_OOM;
|
||||
/* if an huge pmd materialized from under us just retry later */
|
||||
if (unlikely(pmd_trans_huge(*pmd)))
|
||||
if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
|
||||
return 0;
|
||||
/*
|
||||
* A regular pmd is established and it can't morph into a huge pmd
|
||||
|
@@ -17,6 +17,7 @@
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/vmalloc.h>
|
||||
@@ -131,7 +132,8 @@ static struct resource *register_memory_resource(u64 start, u64 size)
|
||||
{
|
||||
struct resource *res;
|
||||
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
|
||||
BUG_ON(!res);
|
||||
if (!res)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
res->name = "System RAM";
|
||||
res->start = start;
|
||||
@@ -140,7 +142,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
|
||||
if (request_resource(&iomem_resource, res) < 0) {
|
||||
pr_debug("System RAM resource %pR cannot be added\n", res);
|
||||
kfree(res);
|
||||
res = NULL;
|
||||
return ERR_PTR(-EEXIST);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
@@ -505,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
|
||||
unsigned long i;
|
||||
int err = 0;
|
||||
int start_sec, end_sec;
|
||||
struct vmem_altmap *altmap;
|
||||
|
||||
/* during initialize mem_map, align hot-added range to section */
|
||||
start_sec = pfn_to_section_nr(phys_start_pfn);
|
||||
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
|
||||
|
||||
altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
|
||||
if (altmap) {
|
||||
/*
|
||||
* Validate altmap is within bounds of the total request
|
||||
*/
|
||||
if (altmap->base_pfn != phys_start_pfn
|
||||
|| vmem_altmap_offset(altmap) > nr_pages) {
|
||||
pr_warn_once("memory add fail, invalid altmap\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
altmap->alloc = 0;
|
||||
}
|
||||
|
||||
for (i = start_sec; i <= end_sec; i++) {
|
||||
err = __add_section(nid, zone, section_nr_to_pfn(i));
|
||||
|
||||
@@ -730,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
|
||||
pgdat_resize_unlock(zone->zone_pgdat, &flags);
|
||||
}
|
||||
|
||||
static int __remove_section(struct zone *zone, struct mem_section *ms)
|
||||
static int __remove_section(struct zone *zone, struct mem_section *ms,
|
||||
unsigned long map_offset)
|
||||
{
|
||||
unsigned long start_pfn;
|
||||
int scn_nr;
|
||||
@@ -747,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
|
||||
start_pfn = section_nr_to_pfn(scn_nr);
|
||||
__remove_zone(zone, start_pfn);
|
||||
|
||||
sparse_remove_one_section(zone, ms);
|
||||
sparse_remove_one_section(zone, ms, map_offset);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -766,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
unsigned long i;
|
||||
int sections_to_remove;
|
||||
resource_size_t start, size;
|
||||
int ret = 0;
|
||||
unsigned long map_offset = 0;
|
||||
int sections_to_remove, ret = 0;
|
||||
|
||||
/* In the ZONE_DEVICE case device driver owns the memory region */
|
||||
if (is_dev_zone(zone)) {
|
||||
struct page *page = pfn_to_page(phys_start_pfn);
|
||||
struct vmem_altmap *altmap;
|
||||
|
||||
altmap = to_vmem_altmap((unsigned long) page);
|
||||
if (altmap)
|
||||
map_offset = vmem_altmap_offset(altmap);
|
||||
} else {
|
||||
resource_size_t start, size;
|
||||
|
||||
start = phys_start_pfn << PAGE_SHIFT;
|
||||
size = nr_pages * PAGE_SIZE;
|
||||
|
||||
ret = release_mem_region_adjustable(&iomem_resource, start,
|
||||
size);
|
||||
if (ret) {
|
||||
resource_size_t endres = start + size - 1;
|
||||
|
||||
pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
|
||||
&start, &endres, ret);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We can only remove entire sections
|
||||
@@ -776,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
|
||||
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
|
||||
BUG_ON(nr_pages % PAGES_PER_SECTION);
|
||||
|
||||
start = phys_start_pfn << PAGE_SHIFT;
|
||||
size = nr_pages * PAGE_SIZE;
|
||||
|
||||
/* in the ZONE_DEVICE case device driver owns the memory region */
|
||||
if (!is_dev_zone(zone))
|
||||
ret = release_mem_region_adjustable(&iomem_resource, start, size);
|
||||
if (ret) {
|
||||
resource_size_t endres = start + size - 1;
|
||||
|
||||
pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
|
||||
&start, &endres, ret);
|
||||
}
|
||||
|
||||
sections_to_remove = nr_pages / PAGES_PER_SECTION;
|
||||
for (i = 0; i < sections_to_remove; i++) {
|
||||
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
|
||||
ret = __remove_section(zone, __pfn_to_section(pfn));
|
||||
|
||||
ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
|
||||
map_offset = 0;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
@@ -1312,8 +1342,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
|
||||
int ret;
|
||||
|
||||
res = register_memory_resource(start, size);
|
||||
if (!res)
|
||||
return -EEXIST;
|
||||
if (IS_ERR(res))
|
||||
return PTR_ERR(res);
|
||||
|
||||
ret = add_memory_resource(nid, res);
|
||||
if (ret < 0)
|
||||
|
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
|
||||
struct page *page;
|
||||
struct queue_pages *qp = walk->private;
|
||||
unsigned long flags = qp->flags;
|
||||
int nid;
|
||||
int nid, ret;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
|
||||
split_huge_page_pmd(vma, addr, pmd);
|
||||
if (pmd_trans_unstable(pmd))
|
||||
return 0;
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
ptl = pmd_lock(walk->mm, pmd);
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
page = pmd_page(*pmd);
|
||||
if (is_huge_zero_page(page)) {
|
||||
spin_unlock(ptl);
|
||||
split_huge_pmd(vma, pmd, addr);
|
||||
} else {
|
||||
get_page(page);
|
||||
spin_unlock(ptl);
|
||||
lock_page(page);
|
||||
ret = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
if (ret)
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
}
|
||||
|
||||
retry:
|
||||
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
|
||||
for (; addr != end; pte++, addr += PAGE_SIZE) {
|
||||
if (!pte_present(*pte))
|
||||
@@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
|
||||
nid = page_to_nid(page);
|
||||
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
|
||||
continue;
|
||||
if (PageTail(page) && PageAnon(page)) {
|
||||
get_page(page);
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
lock_page(page);
|
||||
ret = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
/* Failed to split -- skip. */
|
||||
if (ret) {
|
||||
pte = pte_offset_map_lock(walk->mm, pmd,
|
||||
addr, &ptl);
|
||||
continue;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
|
||||
migrate_page_add(page, qp->pagelist, flags);
|
||||
@@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
|
||||
|
||||
if (flags & MPOL_MF_LAZY) {
|
||||
/* Similar to task_numa_work, skip inaccessible VMAs */
|
||||
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
|
||||
if (vma_migratable(vma) &&
|
||||
vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
|
||||
change_prot_numa(vma, start, endvma);
|
||||
return 1;
|
||||
}
|
||||
@@ -2142,12 +2177,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
||||
*
|
||||
* Remember policies even when nobody has shared memory mapped.
|
||||
* The policies are kept in Red-Black tree linked from the inode.
|
||||
* They are protected by the sp->lock spinlock, which should be held
|
||||
* They are protected by the sp->lock rwlock, which should be held
|
||||
* for any accesses to the tree.
|
||||
*/
|
||||
|
||||
/* lookup first element intersecting start-end */
|
||||
/* Caller holds sp->lock */
|
||||
/*
|
||||
* lookup first element intersecting start-end. Caller holds sp->lock for
|
||||
* reading or for writing
|
||||
*/
|
||||
static struct sp_node *
|
||||
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
|
||||
{
|
||||
@@ -2178,8 +2215,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
|
||||
return rb_entry(n, struct sp_node, nd);
|
||||
}
|
||||
|
||||
/* Insert a new shared policy into the list. */
|
||||
/* Caller holds sp->lock */
|
||||
/*
|
||||
* Insert a new shared policy into the list. Caller holds sp->lock for
|
||||
* writing.
|
||||
*/
|
||||
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
|
||||
{
|
||||
struct rb_node **p = &sp->root.rb_node;
|
||||
@@ -2211,13 +2250,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
|
||||
|
||||
if (!sp->root.rb_node)
|
||||
return NULL;
|
||||
spin_lock(&sp->lock);
|
||||
read_lock(&sp->lock);
|
||||
sn = sp_lookup(sp, idx, idx+1);
|
||||
if (sn) {
|
||||
mpol_get(sn->policy);
|
||||
pol = sn->policy;
|
||||
}
|
||||
spin_unlock(&sp->lock);
|
||||
read_unlock(&sp->lock);
|
||||
return pol;
|
||||
}
|
||||
|
||||
@@ -2360,7 +2399,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
|
||||
int ret = 0;
|
||||
|
||||
restart:
|
||||
spin_lock(&sp->lock);
|
||||
write_lock(&sp->lock);
|
||||
n = sp_lookup(sp, start, end);
|
||||
/* Take care of old policies in the same range. */
|
||||
while (n && n->start < end) {
|
||||
@@ -2393,7 +2432,7 @@ restart:
|
||||
}
|
||||
if (new)
|
||||
sp_insert(sp, new);
|
||||
spin_unlock(&sp->lock);
|
||||
write_unlock(&sp->lock);
|
||||
ret = 0;
|
||||
|
||||
err_out:
|
||||
@@ -2405,7 +2444,7 @@ err_out:
|
||||
return ret;
|
||||
|
||||
alloc_new:
|
||||
spin_unlock(&sp->lock);
|
||||
write_unlock(&sp->lock);
|
||||
ret = -ENOMEM;
|
||||
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
|
||||
if (!n_new)
|
||||
@@ -2431,7 +2470,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
|
||||
int ret;
|
||||
|
||||
sp->root = RB_ROOT; /* empty tree == default mempolicy */
|
||||
spin_lock_init(&sp->lock);
|
||||
rwlock_init(&sp->lock);
|
||||
|
||||
if (mpol) {
|
||||
struct vm_area_struct pvma;
|
||||
@@ -2497,14 +2536,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
|
||||
|
||||
if (!p->root.rb_node)
|
||||
return;
|
||||
spin_lock(&p->lock);
|
||||
write_lock(&p->lock);
|
||||
next = rb_first(&p->root);
|
||||
while (next) {
|
||||
n = rb_entry(next, struct sp_node, nd);
|
||||
next = rb_next(&n->nd);
|
||||
sp_delete(p, n);
|
||||
}
|
||||
spin_unlock(&p->lock);
|
||||
write_unlock(&p->lock);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
|
21
mm/migrate.c
21
mm/migrate.c
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
|
||||
if (PageAnon(new))
|
||||
hugepage_add_anon_rmap(new, vma, addr);
|
||||
else
|
||||
page_dup_rmap(new);
|
||||
page_dup_rmap(new, true);
|
||||
} else if (PageAnon(new))
|
||||
page_add_anon_rmap(new, vma, addr);
|
||||
page_add_anon_rmap(new, vma, addr, false);
|
||||
else
|
||||
page_add_file_rmap(new);
|
||||
|
||||
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (unlikely(PageTransHuge(page)))
|
||||
if (unlikely(split_huge_page(page)))
|
||||
if (unlikely(PageTransHuge(page))) {
|
||||
lock_page(page);
|
||||
rc = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
if (rc)
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = __unmap_and_move(page, newpage, force, mode);
|
||||
if (rc == MIGRATEPAGE_SUCCESS)
|
||||
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
||||
HPAGE_PMD_ORDER);
|
||||
if (!new_page)
|
||||
goto out_fail;
|
||||
prep_transhuge_page(new_page);
|
||||
|
||||
isolated = numamigrate_isolate_page(pgdat, page);
|
||||
if (!isolated) {
|
||||
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
||||
flush_tlb_range(vma, mmun_start, mmun_end);
|
||||
|
||||
/* Prepare a page as a migration target */
|
||||
__set_page_locked(new_page);
|
||||
__SetPageLocked(new_page);
|
||||
SetPageSwapBacked(new_page);
|
||||
|
||||
/* anon mapping, we can simply copy page->mapping to the new page: */
|
||||
@@ -1815,7 +1820,7 @@ fail_putback:
|
||||
* guarantee the copy is visible before the pagetable update.
|
||||
*/
|
||||
flush_cache_range(vma, mmun_start, mmun_end);
|
||||
page_add_anon_rmap(new_page, vma, mmun_start);
|
||||
page_add_anon_rmap(new_page, vma, mmun_start, true);
|
||||
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
|
||||
set_pmd_at(mm, mmun_start, pmd, entry);
|
||||
flush_tlb_range(vma, mmun_start, mmun_end);
|
||||
@@ -1826,14 +1831,14 @@ fail_putback:
|
||||
flush_tlb_range(vma, mmun_start, mmun_end);
|
||||
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
|
||||
update_mmu_cache_pmd(vma, address, &entry);
|
||||
page_remove_rmap(new_page);
|
||||
page_remove_rmap(new_page, true);
|
||||
goto fail_putback;
|
||||
}
|
||||
|
||||
mlock_migrate_page(new_page, page);
|
||||
set_page_memcg(new_page, page_memcg(page));
|
||||
set_page_memcg(page, NULL);
|
||||
page_remove_rmap(page);
|
||||
page_remove_rmap(page, true);
|
||||
|
||||
spin_unlock(ptl);
|
||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
||||
|
@@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||
unsigned char *vec = walk->private;
|
||||
int nr = (end - addr) >> PAGE_SHIFT;
|
||||
|
||||
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
|
||||
ptl = pmd_trans_huge_lock(pmd, vma);
|
||||
if (ptl) {
|
||||
memset(vec, 1, nr);
|
||||
spin_unlock(ptl);
|
||||
goto out;
|
||||
|
31
mm/mlock.c
31
mm/mlock.c
@@ -24,13 +24,13 @@
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
int can_do_mlock(void)
|
||||
bool can_do_mlock(void)
|
||||
{
|
||||
if (rlimit(RLIMIT_MEMLOCK) != 0)
|
||||
return 1;
|
||||
return true;
|
||||
if (capable(CAP_IPC_LOCK))
|
||||
return 1;
|
||||
return 0;
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL(can_do_mlock);
|
||||
|
||||
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
|
||||
/* Serialize with page migration */
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
|
||||
|
||||
if (!TestSetPageMlocked(page)) {
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK,
|
||||
hpage_nr_pages(page));
|
||||
@@ -172,12 +175,14 @@ static void __munlock_isolation_failed(struct page *page)
|
||||
*/
|
||||
unsigned int munlock_vma_page(struct page *page)
|
||||
{
|
||||
unsigned int nr_pages;
|
||||
int nr_pages;
|
||||
struct zone *zone = page_zone(page);
|
||||
|
||||
/* For try_to_munlock() and to serialize with page migration */
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
|
||||
/*
|
||||
* Serialize with any parallel __split_huge_page_refcount() which
|
||||
* might otherwise copy PageMlocked to part of the tail pages before
|
||||
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
|
||||
if (!page || page_zone_id(page) != zoneid)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Do not use pagevec for PTE-mapped THP,
|
||||
* munlock_vma_pages_range() will handle them.
|
||||
*/
|
||||
if (PageTransCompound(page))
|
||||
break;
|
||||
|
||||
get_page(page);
|
||||
/*
|
||||
* Increase the address that will be returned *before* the
|
||||
@@ -425,7 +437,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
|
||||
|
||||
while (start < end) {
|
||||
struct page *page = NULL;
|
||||
struct page *page;
|
||||
unsigned int page_mask;
|
||||
unsigned long page_increm;
|
||||
struct pagevec pvec;
|
||||
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
&page_mask);
|
||||
|
||||
if (page && !IS_ERR(page)) {
|
||||
if (PageTransHuge(page)) {
|
||||
if (PageTransTail(page)) {
|
||||
VM_BUG_ON_PAGE(PageMlocked(page), page);
|
||||
put_page(page); /* follow_page_mask() */
|
||||
} else if (PageTransHuge(page)) {
|
||||
lock_page(page);
|
||||
/*
|
||||
* Any THP page found by follow_page_mask() may
|
||||
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
goto next;
|
||||
}
|
||||
}
|
||||
/* It's a bug to munlock in the middle of a THP page */
|
||||
VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
|
||||
page_increm = 1 + page_mask;
|
||||
start += page_increm * PAGE_SIZE;
|
||||
next:
|
||||
|
106
mm/mmap.c
106
mm/mmap.c
@@ -58,6 +58,18 @@
|
||||
#define arch_rebalance_pgtables(addr, len) (addr)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
|
||||
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
|
||||
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
|
||||
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
|
||||
#endif
|
||||
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
|
||||
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
|
||||
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
|
||||
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
|
||||
#endif
|
||||
|
||||
|
||||
static void unmap_region(struct mm_struct *mm,
|
||||
struct vm_area_struct *vma, struct vm_area_struct *prev,
|
||||
unsigned long start, unsigned long end);
|
||||
@@ -1208,24 +1220,6 @@ none:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
void vm_stat_account(struct mm_struct *mm, unsigned long flags,
|
||||
struct file *file, long pages)
|
||||
{
|
||||
const unsigned long stack_flags
|
||||
= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
|
||||
|
||||
mm->total_vm += pages;
|
||||
|
||||
if (file) {
|
||||
mm->shared_vm += pages;
|
||||
if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
|
||||
mm->exec_vm += pages;
|
||||
} else if (flags & stack_flags)
|
||||
mm->stack_vm += pages;
|
||||
}
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
/*
|
||||
* If a hint addr is less than mmap_min_addr change hint to be as
|
||||
* low as possible but still greater than mmap_min_addr
|
||||
@@ -1544,19 +1538,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
unsigned long charged = 0;
|
||||
|
||||
/* Check against address space limit. */
|
||||
if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
|
||||
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
|
||||
unsigned long nr_pages;
|
||||
|
||||
/*
|
||||
* MAP_FIXED may remove pages of mappings that intersects with
|
||||
* requested mapping. Account for the pages it would unmap.
|
||||
*/
|
||||
if (!(vm_flags & MAP_FIXED))
|
||||
return -ENOMEM;
|
||||
|
||||
nr_pages = count_vma_pages_range(mm, addr, addr + len);
|
||||
|
||||
if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
|
||||
if (!may_expand_vm(mm, vm_flags,
|
||||
(len >> PAGE_SHIFT) - nr_pages))
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@@ -1655,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
out:
|
||||
perf_event_mmap(vma);
|
||||
|
||||
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
|
||||
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
|
||||
vma == get_gate_vma(current->mm)))
|
||||
@@ -2102,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
|
||||
unsigned long new_start, actual_size;
|
||||
|
||||
/* address space limit tests */
|
||||
if (!may_expand_vm(mm, grow))
|
||||
if (!may_expand_vm(mm, vma->vm_flags, grow))
|
||||
return -ENOMEM;
|
||||
|
||||
/* Stack limit test */
|
||||
@@ -2199,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mm->locked_vm += grow;
|
||||
vm_stat_account(mm, vma->vm_flags,
|
||||
vma->vm_file, grow);
|
||||
vm_stat_account(mm, vma->vm_flags, grow);
|
||||
anon_vma_interval_tree_pre_update_vma(vma);
|
||||
vma->vm_end = address;
|
||||
anon_vma_interval_tree_post_update_vma(vma);
|
||||
@@ -2275,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma,
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mm->locked_vm += grow;
|
||||
vm_stat_account(mm, vma->vm_flags,
|
||||
vma->vm_file, grow);
|
||||
vm_stat_account(mm, vma->vm_flags, grow);
|
||||
anon_vma_interval_tree_pre_update_vma(vma);
|
||||
vma->vm_start = address;
|
||||
vma->vm_pgoff -= grow;
|
||||
@@ -2390,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT)
|
||||
nr_accounted += nrpages;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
|
||||
vm_stat_account(mm, vma->vm_flags, -nrpages);
|
||||
vma = remove_vma(vma);
|
||||
} while (vma);
|
||||
vm_unacct_memory(nr_accounted);
|
||||
@@ -2760,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
|
||||
}
|
||||
|
||||
/* Check against address space limits *after* clearing old maps... */
|
||||
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
|
||||
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
|
||||
return -ENOMEM;
|
||||
|
||||
if (mm->map_count > sysctl_max_map_count)
|
||||
@@ -2795,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
|
||||
out:
|
||||
perf_event_mmap(vma);
|
||||
mm->total_vm += len >> PAGE_SHIFT;
|
||||
mm->data_vm += len >> PAGE_SHIFT;
|
||||
if (flags & VM_LOCKED)
|
||||
mm->locked_vm += (len >> PAGE_SHIFT);
|
||||
vma->vm_flags |= VM_SOFTDIRTY;
|
||||
@@ -2986,16 +2977,28 @@ out:
|
||||
* Return true if the calling process may expand its vm space by the passed
|
||||
* number of pages
|
||||
*/
|
||||
int may_expand_vm(struct mm_struct *mm, unsigned long npages)
|
||||
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
|
||||
{
|
||||
unsigned long cur = mm->total_vm; /* pages */
|
||||
unsigned long lim;
|
||||
if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
|
||||
return false;
|
||||
|
||||
lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
|
||||
if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
|
||||
(VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
|
||||
return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
|
||||
|
||||
if (cur + npages > lim)
|
||||
return 0;
|
||||
return 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
|
||||
{
|
||||
mm->total_vm += npages;
|
||||
|
||||
if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
|
||||
mm->exec_vm += npages;
|
||||
else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
|
||||
mm->stack_vm += npages;
|
||||
else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
|
||||
mm->data_vm += npages;
|
||||
}
|
||||
|
||||
static int special_mapping_fault(struct vm_area_struct *vma,
|
||||
@@ -3082,7 +3085,7 @@ static struct vm_area_struct *__install_special_mapping(
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
mm->total_vm += len >> PAGE_SHIFT;
|
||||
vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
|
||||
|
||||
perf_event_mmap(vma);
|
||||
|
||||
@@ -3186,10 +3189,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
|
||||
* mapping->flags avoid to take the same lock twice, if more than one
|
||||
* vma in this mm is backed by the same anon_vma or address_space.
|
||||
*
|
||||
* We can take all the locks in random order because the VM code
|
||||
* taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
|
||||
* takes more than one of them in a row. Secondly we're protected
|
||||
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
|
||||
* We take locks in following order, accordingly to comment at beginning
|
||||
* of mm/rmap.c:
|
||||
* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
|
||||
* hugetlb mapping);
|
||||
* - all i_mmap_rwsem locks;
|
||||
* - all anon_vma->rwseml
|
||||
*
|
||||
* We can take all locks within these types randomly because the VM code
|
||||
* doesn't nest them and we protected from parallel mm_take_all_locks() by
|
||||
* mm_all_locks_mutex.
|
||||
*
|
||||
* mm_take_all_locks() and mm_drop_all_locks are expensive operations
|
||||
* that may have to take thousand of locks.
|
||||
@@ -3208,7 +3217,16 @@ int mm_take_all_locks(struct mm_struct *mm)
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
if (signal_pending(current))
|
||||
goto out_unlock;
|
||||
if (vma->vm_file && vma->vm_file->f_mapping)
|
||||
if (vma->vm_file && vma->vm_file->f_mapping &&
|
||||
is_vm_hugetlb_page(vma))
|
||||
vm_lock_mapping(mm, vma->vm_file->f_mapping);
|
||||
}
|
||||
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
if (signal_pending(current))
|
||||
goto out_unlock;
|
||||
if (vma->vm_file && vma->vm_file->f_mapping &&
|
||||
!is_vm_hugetlb_page(vma))
|
||||
vm_lock_mapping(mm, vma->vm_file->f_mapping);
|
||||
}
|
||||
|
||||
|
@@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
|
||||
int memmap_valid_within(unsigned long pfn,
|
||||
bool memmap_valid_within(unsigned long pfn,
|
||||
struct page *page, struct zone *zone)
|
||||
{
|
||||
if (page_to_pfn(page) != pfn)
|
||||
return 0;
|
||||
return false;
|
||||
|
||||
if (page_zone(page) != zone)
|
||||
return 0;
|
||||
return false;
|
||||
|
||||
return 1;
|
||||
return true;
|
||||
}
|
||||
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
|
||||
|
||||
|
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
|
||||
unsigned long this_pages;
|
||||
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
|
||||
if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
|
||||
&& pmd_none_or_clear_bad(pmd))
|
||||
continue;
|
||||
|
||||
/* invoke the mmu notifier if the pmd is populated */
|
||||
@@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
|
||||
mmu_notifier_invalidate_range_start(mm, mni_start, end);
|
||||
}
|
||||
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
|
||||
if (next - addr != HPAGE_PMD_SIZE)
|
||||
split_huge_page_pmd(vma, addr, pmd);
|
||||
split_huge_pmd(vma, pmd, addr);
|
||||
else {
|
||||
int nr_ptes = change_huge_pmd(vma, pmd, addr,
|
||||
newprot, prot_numa);
|
||||
@@ -278,6 +279,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
|
||||
* even if read-only so there is no need to account for them here
|
||||
*/
|
||||
if (newflags & VM_WRITE) {
|
||||
/* Check space limits when area turns into data. */
|
||||
if (!may_expand_vm(mm, newflags, nrpages) &&
|
||||
may_expand_vm(mm, oldflags, nrpages))
|
||||
return -ENOMEM;
|
||||
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
|
||||
VM_SHARED|VM_NORESERVE))) {
|
||||
charged = nrpages;
|
||||
@@ -334,8 +339,8 @@ success:
|
||||
populate_vma_page_range(vma, start, end, NULL);
|
||||
}
|
||||
|
||||
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
|
||||
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
|
||||
vm_stat_account(mm, oldflags, -nrpages);
|
||||
vm_stat_account(mm, newflags, nrpages);
|
||||
perf_event_mmap(vma);
|
||||
return 0;
|
||||
|
||||
|
22
mm/mremap.c
22
mm/mremap.c
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
|
||||
if (!new_pmd)
|
||||
break;
|
||||
if (pmd_trans_huge(*old_pmd)) {
|
||||
int err = 0;
|
||||
if (extent == HPAGE_PMD_SIZE) {
|
||||
bool moved;
|
||||
VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
|
||||
vma);
|
||||
/* See comment in move_ptes() */
|
||||
if (need_rmap_locks)
|
||||
anon_vma_lock_write(vma->anon_vma);
|
||||
err = move_huge_pmd(vma, new_vma, old_addr,
|
||||
moved = move_huge_pmd(vma, new_vma, old_addr,
|
||||
new_addr, old_end,
|
||||
old_pmd, new_pmd);
|
||||
if (need_rmap_locks)
|
||||
anon_vma_unlock_write(vma->anon_vma);
|
||||
if (moved) {
|
||||
need_flush = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (err > 0) {
|
||||
need_flush = true;
|
||||
continue;
|
||||
} else if (!err) {
|
||||
split_huge_page_pmd(vma, old_addr, old_pmd);
|
||||
}
|
||||
split_huge_pmd(vma, old_pmd, old_addr);
|
||||
VM_BUG_ON(pmd_trans_huge(*old_pmd));
|
||||
}
|
||||
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
|
||||
@@ -317,7 +316,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
* If this were a serious issue, we'd add a flag to do_munmap().
|
||||
*/
|
||||
hiwater_vm = mm->hiwater_vm;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
|
||||
vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
|
||||
|
||||
/* Tell pfnmap has moved from this vma */
|
||||
if (unlikely(vma->vm_flags & VM_PFNMAP))
|
||||
@@ -383,7 +382,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
|
||||
return ERR_PTR(-EAGAIN);
|
||||
}
|
||||
|
||||
if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
|
||||
if (!may_expand_vm(mm, vma->vm_flags,
|
||||
(new_len - old_len) >> PAGE_SHIFT))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT) {
|
||||
@@ -545,7 +545,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
goto out;
|
||||
}
|
||||
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
|
||||
vm_stat_account(mm, vma->vm_flags, pages);
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += pages;
|
||||
locked = true;
|
||||
|
@@ -560,7 +560,7 @@ void __init mmap_init(void)
|
||||
|
||||
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
|
||||
VM_BUG_ON(ret);
|
||||
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
|
||||
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||
*/
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
|
||||
mark_oom_victim(victim);
|
||||
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
|
||||
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
||||
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
|
||||
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
|
||||
K(get_mm_counter(victim->mm, MM_FILEPAGES)));
|
||||
K(get_mm_counter(victim->mm, MM_FILEPAGES)),
|
||||
K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
|
||||
task_unlock(victim);
|
||||
|
||||
/*
|
||||
|
@@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
|
||||
unsigned long nr_pages;
|
||||
|
||||
nr_pages = zone_page_state(zone, NR_FREE_PAGES);
|
||||
nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
|
||||
/*
|
||||
* Pages reserved for the kernel should not be considered
|
||||
* dirtyable, to prevent a situation where reclaim has to
|
||||
* clean pages in order to balance the zones.
|
||||
*/
|
||||
nr_pages -= min(nr_pages, zone->totalreserve_pages);
|
||||
|
||||
nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
|
||||
nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
|
||||
@@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void)
|
||||
unsigned long x;
|
||||
|
||||
x = global_page_state(NR_FREE_PAGES);
|
||||
x -= min(x, dirty_balance_reserve);
|
||||
/*
|
||||
* Pages reserved for the kernel should not be considered
|
||||
* dirtyable, to prevent a situation where reclaim has to
|
||||
* clean pages in order to balance the zones.
|
||||
*/
|
||||
x -= min(x, totalreserve_pages);
|
||||
|
||||
x += global_page_state(NR_INACTIVE_FILE);
|
||||
x += global_page_state(NR_ACTIVE_FILE);
|
||||
|
205
mm/page_alloc.c
205
mm/page_alloc.c
@@ -43,6 +43,7 @@
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/vmstat.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/stop_machine.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/pfn.h>
|
||||
@@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
|
||||
unsigned long totalram_pages __read_mostly;
|
||||
unsigned long totalreserve_pages __read_mostly;
|
||||
unsigned long totalcma_pages __read_mostly;
|
||||
/*
|
||||
* When calculating the number of globally allowed dirty pages, there
|
||||
* is a certain number of per-zone reserves that should not be
|
||||
* considered dirtyable memory. This is the sum of those reserves
|
||||
* over all existing zones that contribute dirtyable memory.
|
||||
*/
|
||||
unsigned long dirty_balance_reserve __read_mostly;
|
||||
|
||||
int percpu_pagelist_fraction;
|
||||
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
|
||||
@@ -229,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
static void free_compound_page(struct page *page);
|
||||
compound_page_dtor * const compound_page_dtors[] = {
|
||||
NULL,
|
||||
free_compound_page,
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
free_huge_page,
|
||||
#endif
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
free_transhuge_page,
|
||||
#endif
|
||||
};
|
||||
|
||||
int min_free_kbytes = 1024;
|
||||
@@ -457,7 +453,7 @@ out:
|
||||
* This usage means that zero-order pages may not be compound.
|
||||
*/
|
||||
|
||||
static void free_compound_page(struct page *page)
|
||||
void free_compound_page(struct page *page)
|
||||
{
|
||||
__free_pages_ok(page, compound_order(page));
|
||||
}
|
||||
@@ -473,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
|
||||
for (i = 1; i < nr_pages; i++) {
|
||||
struct page *p = page + i;
|
||||
set_page_count(p, 0);
|
||||
p->mapping = TAIL_MAPPING;
|
||||
set_compound_head(p, page);
|
||||
}
|
||||
atomic_set(compound_mapcount_ptr(page), -1);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_PAGEALLOC
|
||||
@@ -739,7 +737,7 @@ static inline int free_pages_check(struct page *page)
|
||||
const char *bad_reason = NULL;
|
||||
unsigned long bad_flags = 0;
|
||||
|
||||
if (unlikely(page_mapcount(page)))
|
||||
if (unlikely(atomic_read(&page->_mapcount) != -1))
|
||||
bad_reason = "nonzero mapcount";
|
||||
if (unlikely(page->mapping != NULL))
|
||||
bad_reason = "non-NULL mapping";
|
||||
@@ -812,7 +810,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
|
||||
do {
|
||||
int mt; /* migratetype of the to-be-freed page */
|
||||
|
||||
page = list_entry(list->prev, struct page, lru);
|
||||
page = list_last_entry(list, struct page, lru);
|
||||
/* must delete as __free_one_page list manipulates */
|
||||
list_del(&page->lru);
|
||||
|
||||
@@ -863,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
switch (page - head_page) {
|
||||
case 1:
|
||||
/* the first tail page: ->mapping is compound_mapcount() */
|
||||
if (unlikely(compound_mapcount(page))) {
|
||||
bad_page(page, "nonzero compound_mapcount", 0);
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
/*
|
||||
* the second tail page: ->mapping is
|
||||
* page_deferred_list().next -- ignore value.
|
||||
*/
|
||||
break;
|
||||
default:
|
||||
if (page->mapping != TAIL_MAPPING) {
|
||||
bad_page(page, "corrupted mapping in tail page", 0);
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (unlikely(!PageTail(page))) {
|
||||
bad_page(page, "PageTail not set", 0);
|
||||
goto out;
|
||||
@@ -873,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
|
||||
}
|
||||
ret = 0;
|
||||
out:
|
||||
page->mapping = NULL;
|
||||
clear_compound_head(page);
|
||||
return ret;
|
||||
}
|
||||
@@ -1336,7 +1356,7 @@ static inline int check_new_page(struct page *page)
|
||||
const char *bad_reason = NULL;
|
||||
unsigned long bad_flags = 0;
|
||||
|
||||
if (unlikely(page_mapcount(page)))
|
||||
if (unlikely(atomic_read(&page->_mapcount) != -1))
|
||||
bad_reason = "nonzero mapcount";
|
||||
if (unlikely(page->mapping != NULL))
|
||||
bad_reason = "non-NULL mapping";
|
||||
@@ -1417,11 +1437,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
|
||||
/* Find a page of the appropriate size in the preferred list */
|
||||
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
|
||||
area = &(zone->free_area[current_order]);
|
||||
if (list_empty(&area->free_list[migratetype]))
|
||||
continue;
|
||||
|
||||
page = list_entry(area->free_list[migratetype].next,
|
||||
page = list_first_entry_or_null(&area->free_list[migratetype],
|
||||
struct page, lru);
|
||||
if (!page)
|
||||
continue;
|
||||
list_del(&page->lru);
|
||||
rmv_page_order(page);
|
||||
area->nr_free--;
|
||||
@@ -1700,12 +1719,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
|
||||
for (order = 0; order < MAX_ORDER; order++) {
|
||||
struct free_area *area = &(zone->free_area[order]);
|
||||
|
||||
if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
|
||||
page = list_first_entry_or_null(
|
||||
&area->free_list[MIGRATE_HIGHATOMIC],
|
||||
struct page, lru);
|
||||
if (!page)
|
||||
continue;
|
||||
|
||||
page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
|
||||
struct page, lru);
|
||||
|
||||
/*
|
||||
* It should never happen but changes to locking could
|
||||
* inadvertently allow a per-cpu drain to add pages
|
||||
@@ -1753,7 +1772,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
||||
if (fallback_mt == -1)
|
||||
continue;
|
||||
|
||||
page = list_entry(area->free_list[fallback_mt].next,
|
||||
page = list_first_entry(&area->free_list[fallback_mt],
|
||||
struct page, lru);
|
||||
if (can_steal)
|
||||
steal_suitable_fallback(zone, page, start_migratetype);
|
||||
@@ -1788,7 +1807,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
||||
* Call me with the zone->lock already held.
|
||||
*/
|
||||
static struct page *__rmqueue(struct zone *zone, unsigned int order,
|
||||
int migratetype, gfp_t gfp_flags)
|
||||
int migratetype)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
@@ -1818,7 +1837,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
||||
|
||||
spin_lock(&zone->lock);
|
||||
for (i = 0; i < count; ++i) {
|
||||
struct page *page = __rmqueue(zone, order, migratetype, 0);
|
||||
struct page *page = __rmqueue(zone, order, migratetype);
|
||||
if (unlikely(page == NULL))
|
||||
break;
|
||||
|
||||
@@ -1988,7 +2007,7 @@ void mark_free_pages(struct zone *zone)
|
||||
unsigned long pfn, max_zone_pfn;
|
||||
unsigned long flags;
|
||||
unsigned int order, t;
|
||||
struct list_head *curr;
|
||||
struct page *page;
|
||||
|
||||
if (zone_is_empty(zone))
|
||||
return;
|
||||
@@ -1998,17 +2017,17 @@ void mark_free_pages(struct zone *zone)
|
||||
max_zone_pfn = zone_end_pfn(zone);
|
||||
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
|
||||
if (pfn_valid(pfn)) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
if (!swsusp_page_is_forbidden(page))
|
||||
swsusp_unset_page_free(page);
|
||||
}
|
||||
|
||||
for_each_migratetype_order(order, t) {
|
||||
list_for_each(curr, &zone->free_area[order].free_list[t]) {
|
||||
list_for_each_entry(page,
|
||||
&zone->free_area[order].free_list[t], lru) {
|
||||
unsigned long i;
|
||||
|
||||
pfn = page_to_pfn(list_entry(curr, struct page, lru));
|
||||
pfn = page_to_pfn(page);
|
||||
for (i = 0; i < (1UL << order); i++)
|
||||
swsusp_set_page_free(pfn_to_page(pfn + i));
|
||||
}
|
||||
@@ -2212,9 +2231,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
|
||||
}
|
||||
|
||||
if (cold)
|
||||
page = list_entry(list->prev, struct page, lru);
|
||||
page = list_last_entry(list, struct page, lru);
|
||||
else
|
||||
page = list_entry(list->next, struct page, lru);
|
||||
page = list_first_entry(list, struct page, lru);
|
||||
|
||||
list_del(&page->lru);
|
||||
pcp->count--;
|
||||
@@ -2241,7 +2260,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
|
||||
trace_mm_page_alloc_zone_locked(page, order, migratetype);
|
||||
}
|
||||
if (!page)
|
||||
page = __rmqueue(zone, order, migratetype, gfp_flags);
|
||||
page = __rmqueue(zone, order, migratetype);
|
||||
spin_unlock(&zone->lock);
|
||||
if (!page)
|
||||
goto failed;
|
||||
@@ -2740,8 +2759,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|
||||
goto out;
|
||||
}
|
||||
/* Exhausted what can be done so it's blamo time */
|
||||
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
|
||||
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
|
||||
*did_some_progress = 1;
|
||||
|
||||
if (gfp_mask & __GFP_NOFAIL) {
|
||||
page = get_page_from_freelist(gfp_mask, order,
|
||||
ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
|
||||
/*
|
||||
* fallback to ignore cpuset restriction if our nodes
|
||||
* are depleted
|
||||
*/
|
||||
if (!page)
|
||||
page = get_page_from_freelist(gfp_mask, order,
|
||||
ALLOC_NO_WATERMARKS, ac);
|
||||
}
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&oom_lock);
|
||||
return page;
|
||||
@@ -2876,28 +2908,6 @@ retry:
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is called in the allocator slow-path if the allocation request is of
|
||||
* sufficient urgency to ignore watermarks and take other desperate measures
|
||||
*/
|
||||
static inline struct page *
|
||||
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
|
||||
const struct alloc_context *ac)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
do {
|
||||
page = get_page_from_freelist(gfp_mask, order,
|
||||
ALLOC_NO_WATERMARKS, ac);
|
||||
|
||||
if (!page && gfp_mask & __GFP_NOFAIL)
|
||||
wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
|
||||
HZ/50);
|
||||
} while (!page && (gfp_mask & __GFP_NOFAIL));
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
|
||||
{
|
||||
struct zoneref *z;
|
||||
@@ -3042,28 +3052,36 @@ retry:
|
||||
* allocations are system rather than user orientated
|
||||
*/
|
||||
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
|
||||
|
||||
page = __alloc_pages_high_priority(gfp_mask, order, ac);
|
||||
|
||||
if (page) {
|
||||
page = get_page_from_freelist(gfp_mask, order,
|
||||
ALLOC_NO_WATERMARKS, ac);
|
||||
if (page)
|
||||
goto got_pg;
|
||||
}
|
||||
}
|
||||
|
||||
/* Caller is not willing to reclaim, we can't balance anything */
|
||||
if (!can_direct_reclaim) {
|
||||
/*
|
||||
* All existing users of the deprecated __GFP_NOFAIL are
|
||||
* blockable, so warn of any new users that actually allow this
|
||||
* type of allocation to fail.
|
||||
* All existing users of the __GFP_NOFAIL are blockable, so warn
|
||||
* of any new users that actually allow this type of allocation
|
||||
* to fail.
|
||||
*/
|
||||
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
|
||||
goto nopage;
|
||||
}
|
||||
|
||||
/* Avoid recursion of direct reclaim */
|
||||
if (current->flags & PF_MEMALLOC)
|
||||
if (current->flags & PF_MEMALLOC) {
|
||||
/*
|
||||
* __GFP_NOFAIL request from this context is rather bizarre
|
||||
* because we cannot reclaim anything and only can loop waiting
|
||||
* for somebody to do a work for us.
|
||||
*/
|
||||
if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
|
||||
cond_resched();
|
||||
goto retry;
|
||||
}
|
||||
goto nopage;
|
||||
}
|
||||
|
||||
/* Avoid allocations with no watermarks from looping endlessly */
|
||||
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
|
||||
@@ -3402,7 +3420,8 @@ EXPORT_SYMBOL(__free_page_frag);
|
||||
|
||||
/*
|
||||
* alloc_kmem_pages charges newly allocated pages to the kmem resource counter
|
||||
* of the current memory cgroup.
|
||||
* of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
|
||||
* equivalent to alloc_pages.
|
||||
*
|
||||
* It should be used when the caller would like to use kmalloc, but since the
|
||||
* allocation is large, it has to fall back to the page allocator.
|
||||
@@ -4147,8 +4166,7 @@ static void set_zonelist_order(void)
|
||||
|
||||
static void build_zonelists(pg_data_t *pgdat)
|
||||
{
|
||||
int j, node, load;
|
||||
enum zone_type i;
|
||||
int i, node, load;
|
||||
nodemask_t used_mask;
|
||||
int local_node, prev_node;
|
||||
struct zonelist *zonelist;
|
||||
@@ -4168,7 +4186,7 @@ static void build_zonelists(pg_data_t *pgdat)
|
||||
nodes_clear(used_mask);
|
||||
|
||||
memset(node_order, 0, sizeof(node_order));
|
||||
j = 0;
|
||||
i = 0;
|
||||
|
||||
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
|
||||
/*
|
||||
@@ -4185,12 +4203,12 @@ static void build_zonelists(pg_data_t *pgdat)
|
||||
if (order == ZONELIST_ORDER_NODE)
|
||||
build_zonelists_in_node_order(pgdat, node);
|
||||
else
|
||||
node_order[j++] = node; /* remember order */
|
||||
node_order[i++] = node; /* remember order */
|
||||
}
|
||||
|
||||
if (order == ZONELIST_ORDER_ZONE) {
|
||||
/* calculate node order -- i.e., DMA last! */
|
||||
build_zonelists_in_zone_order(pgdat, j);
|
||||
build_zonelists_in_zone_order(pgdat, i);
|
||||
}
|
||||
|
||||
build_thisnode_zonelists(pgdat);
|
||||
@@ -4468,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
|
||||
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
|
||||
unsigned long start_pfn, enum memmap_context context)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
|
||||
unsigned long end_pfn = start_pfn + size;
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
unsigned long pfn;
|
||||
struct zone *z;
|
||||
unsigned long nr_initialised = 0;
|
||||
|
||||
if (highest_memmap_pfn < end_pfn - 1)
|
||||
highest_memmap_pfn = end_pfn - 1;
|
||||
|
||||
z = &pgdat->node_zones[zone];
|
||||
/*
|
||||
* Honor reservation requested by the driver for this ZONE_DEVICE
|
||||
* memory
|
||||
*/
|
||||
if (altmap && start_pfn == altmap->base_pfn)
|
||||
start_pfn += altmap->reserve;
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
|
||||
/*
|
||||
* There can be holes in boot-time mem_map[]s
|
||||
@@ -5956,20 +5980,12 @@ static void calculate_totalreserve_pages(void)
|
||||
|
||||
if (max > zone->managed_pages)
|
||||
max = zone->managed_pages;
|
||||
|
||||
zone->totalreserve_pages = max;
|
||||
|
||||
reserve_pages += max;
|
||||
/*
|
||||
* Lowmem reserves are not available to
|
||||
* GFP_HIGHUSER page cache allocations and
|
||||
* kswapd tries to balance zones to their high
|
||||
* watermark. As a result, neither should be
|
||||
* regarded as dirtyable memory, to prevent a
|
||||
* situation where reclaim has to clean pages
|
||||
* in order to balance the zones.
|
||||
*/
|
||||
zone->dirty_balance_reserve = max;
|
||||
}
|
||||
}
|
||||
dirty_balance_reserve = reserve_pages;
|
||||
totalreserve_pages = reserve_pages;
|
||||
}
|
||||
|
||||
@@ -6724,8 +6740,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* In case of -EBUSY, we'd like to know which page causes problem.
|
||||
* So, just fall through. We will check it in test_pages_isolated().
|
||||
*/
|
||||
ret = __alloc_contig_migrate_range(&cc, start, end);
|
||||
if (ret)
|
||||
if (ret && ret != -EBUSY)
|
||||
goto done;
|
||||
|
||||
/*
|
||||
@@ -6752,12 +6772,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
|
||||
outer_start = start;
|
||||
while (!PageBuddy(pfn_to_page(outer_start))) {
|
||||
if (++order >= MAX_ORDER) {
|
||||
ret = -EBUSY;
|
||||
goto done;
|
||||
outer_start = start;
|
||||
break;
|
||||
}
|
||||
outer_start &= ~0UL << order;
|
||||
}
|
||||
|
||||
if (outer_start != start) {
|
||||
order = page_order(pfn_to_page(outer_start));
|
||||
|
||||
/*
|
||||
* outer_start page could be small order buddy page and
|
||||
* it doesn't include start page. Adjust outer_start
|
||||
* in this case to report failed page properly
|
||||
* on tracepoint in test_pages_isolated()
|
||||
*/
|
||||
if (outer_start + (1UL << order) <= start)
|
||||
outer_start = start;
|
||||
}
|
||||
|
||||
/* Make sure the range is really isolated. */
|
||||
if (test_pages_isolated(outer_start, end, false)) {
|
||||
pr_info("%s: [%lx, %lx) PFNs busy\n",
|
||||
|
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page,
|
||||
unsigned long addr, void *arg)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
spinlock_t *ptl;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
bool referenced = false;
|
||||
|
||||
if (unlikely(PageTransHuge(page))) {
|
||||
pmd = page_check_address_pmd(page, mm, addr,
|
||||
PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
|
||||
if (pmd) {
|
||||
referenced = pmdp_clear_young_notify(vma, addr, pmd);
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
|
||||
return SWAP_AGAIN;
|
||||
|
||||
if (pte) {
|
||||
referenced = ptep_clear_young_notify(vma, addr, pte);
|
||||
pte_unmap(pte);
|
||||
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||
referenced = pmdp_clear_young_notify(vma, addr, pmd);
|
||||
} else {
|
||||
pte = page_check_address(page, mm, addr, &ptl, 0);
|
||||
if (pte) {
|
||||
referenced = ptep_clear_young_notify(vma, addr, pte);
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
}
|
||||
/* unexpected pmd-mapped page? */
|
||||
WARN_ON_ONCE(1);
|
||||
}
|
||||
|
||||
spin_unlock(ptl);
|
||||
|
||||
if (referenced) {
|
||||
clear_page_idle(page);
|
||||
/*
|
||||
|
@@ -9,6 +9,9 @@
|
||||
#include <linux/hugetlb.h>
|
||||
#include "internal.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/page_isolation.h>
|
||||
|
||||
static int set_migratetype_isolate(struct page *page,
|
||||
bool skip_hwpoisoned_pages)
|
||||
{
|
||||
@@ -162,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
unsigned long undo_pfn;
|
||||
struct page *page;
|
||||
|
||||
BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
|
||||
BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
|
||||
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
|
||||
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
|
||||
|
||||
for (pfn = start_pfn;
|
||||
pfn < end_pfn;
|
||||
@@ -193,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
{
|
||||
unsigned long pfn;
|
||||
struct page *page;
|
||||
BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
|
||||
BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
|
||||
|
||||
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
|
||||
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
|
||||
|
||||
for (pfn = start_pfn;
|
||||
pfn < end_pfn;
|
||||
pfn += pageblock_nr_pages) {
|
||||
@@ -212,7 +217,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
*
|
||||
* Returns 1 if all pages in the range are isolated.
|
||||
*/
|
||||
static int
|
||||
static unsigned long
|
||||
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
|
||||
bool skip_hwpoisoned_pages)
|
||||
{
|
||||
@@ -237,9 +242,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (pfn < end_pfn)
|
||||
return 0;
|
||||
return 1;
|
||||
|
||||
return pfn;
|
||||
}
|
||||
|
||||
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
||||
@@ -248,7 +252,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
||||
unsigned long pfn, flags;
|
||||
struct page *page;
|
||||
struct zone *zone;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
|
||||
@@ -266,10 +269,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
||||
/* Check all pages are free or marked as ISOLATED */
|
||||
zone = page_zone(page);
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
|
||||
pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
|
||||
skip_hwpoisoned_pages);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
return ret ? 0 : -EBUSY;
|
||||
|
||||
trace_test_pages_isolated(start_pfn, end_pfn, pfn);
|
||||
|
||||
return pfn < end_pfn ? -EBUSY : 0;
|
||||
}
|
||||
|
||||
struct page *alloc_migrate_target(struct page *page, unsigned long private,
|
||||
|
@@ -58,7 +58,7 @@ again:
|
||||
if (!walk->pte_entry)
|
||||
continue;
|
||||
|
||||
split_huge_page_pmd_mm(walk->mm, addr, pmd);
|
||||
split_huge_pmd(walk->vma, pmd, addr);
|
||||
if (pmd_trans_unstable(pmd))
|
||||
goto again;
|
||||
err = walk_pte_range(pmd, addr, next, walk);
|
||||
|
18
mm/percpu.c
18
mm/percpu.c
@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
|
||||
/**
|
||||
* pcpu_mem_free - free memory
|
||||
* @ptr: memory to free
|
||||
* @size: size of the area
|
||||
*
|
||||
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
|
||||
*/
|
||||
static void pcpu_mem_free(void *ptr, size_t size)
|
||||
static void pcpu_mem_free(void *ptr)
|
||||
{
|
||||
if (size <= PAGE_SIZE)
|
||||
kfree(ptr);
|
||||
else
|
||||
vfree(ptr);
|
||||
kvfree(ptr);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -463,8 +459,8 @@ out_unlock:
|
||||
* pcpu_mem_free() might end up calling vfree() which uses
|
||||
* IRQ-unsafe lock and thus can't be called under pcpu_lock.
|
||||
*/
|
||||
pcpu_mem_free(old, old_size);
|
||||
pcpu_mem_free(new, new_size);
|
||||
pcpu_mem_free(old);
|
||||
pcpu_mem_free(new);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
|
||||
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
|
||||
sizeof(chunk->map[0]));
|
||||
if (!chunk->map) {
|
||||
pcpu_mem_free(chunk, pcpu_chunk_struct_size);
|
||||
pcpu_mem_free(chunk);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
|
||||
{
|
||||
if (!chunk)
|
||||
return;
|
||||
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
|
||||
pcpu_mem_free(chunk, pcpu_chunk_struct_size);
|
||||
pcpu_mem_free(chunk->map);
|
||||
pcpu_mem_free(chunk);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
|
||||
{
|
||||
pmd_t pmd;
|
||||
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
|
||||
VM_BUG_ON(!pmd_trans_huge(*pmdp));
|
||||
VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
|
||||
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
|
||||
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
|
||||
return pmd;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
|
||||
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
pmd_t pmd = pmd_mksplitting(*pmdp);
|
||||
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
|
||||
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
|
||||
/* tlb flush only to serialize against gup-fast */
|
||||
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
|
||||
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
|
||||
pgtable_t pgtable)
|
||||
@@ -176,13 +164,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
|
||||
|
||||
/* FIFO */
|
||||
pgtable = pmd_huge_pte(mm, pmdp);
|
||||
if (list_empty(&pgtable->lru))
|
||||
pmd_huge_pte(mm, pmdp) = NULL;
|
||||
else {
|
||||
pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
|
||||
struct page, lru);
|
||||
pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
|
||||
struct page, lru);
|
||||
if (pmd_huge_pte(mm, pmdp))
|
||||
list_del(&pgtable->lru);
|
||||
}
|
||||
return pgtable;
|
||||
}
|
||||
#endif
|
||||
|
@@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
|
||||
goto free_proc_pages;
|
||||
}
|
||||
|
||||
mm = mm_access(task, PTRACE_MODE_ATTACH);
|
||||
mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
|
||||
if (!mm || IS_ERR(mm)) {
|
||||
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
|
||||
/*
|
||||
|
@@ -17,6 +17,7 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm_inline.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
@@ -32,8 +33,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(file_ra_state_init);
|
||||
|
||||
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
|
||||
|
||||
/*
|
||||
* see if a page needs releasing upon read_cache_pages() failure
|
||||
* - the caller of read_cache_pages() may have set PG_private or PG_fscache
|
||||
@@ -64,7 +63,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
|
||||
struct page *victim;
|
||||
|
||||
while (!list_empty(pages)) {
|
||||
victim = list_to_page(pages);
|
||||
victim = lru_to_page(pages);
|
||||
list_del(&victim->lru);
|
||||
read_cache_pages_invalidate_page(mapping, victim);
|
||||
}
|
||||
@@ -87,7 +86,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
|
||||
int ret = 0;
|
||||
|
||||
while (!list_empty(pages)) {
|
||||
page = list_to_page(pages);
|
||||
page = lru_to_page(pages);
|
||||
list_del(&page->lru);
|
||||
if (add_to_page_cache_lru(page, mapping, page->index,
|
||||
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
|
||||
@@ -125,7 +124,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
|
||||
}
|
||||
|
||||
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
|
||||
struct page *page = list_to_page(pages);
|
||||
struct page *page = lru_to_page(pages);
|
||||
list_del(&page->lru);
|
||||
if (!add_to_page_cache_lru(page, mapping, page->index,
|
||||
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
|
||||
|
401
mm/rmap.c
401
mm/rmap.c
@@ -23,21 +23,22 @@
|
||||
* inode->i_mutex (while writing or truncating, not reading or faulting)
|
||||
* mm->mmap_sem
|
||||
* page->flags PG_locked (lock_page)
|
||||
* mapping->i_mmap_rwsem
|
||||
* anon_vma->rwsem
|
||||
* mm->page_table_lock or pte_lock
|
||||
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
|
||||
* swap_lock (in swap_duplicate, swap_info_get)
|
||||
* mmlist_lock (in mmput, drain_mmlist and others)
|
||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
|
||||
* mapping->tree_lock (widely used)
|
||||
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||
* mapping->tree_lock (widely used, in set_page_dirty,
|
||||
* in arch-dependent flush_dcache_mmap_lock,
|
||||
* within bdi.wb->list_lock in __sync_single_inode)
|
||||
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
|
||||
* mapping->i_mmap_rwsem
|
||||
* anon_vma->rwsem
|
||||
* mm->page_table_lock or pte_lock
|
||||
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
|
||||
* swap_lock (in swap_duplicate, swap_info_get)
|
||||
* mmlist_lock (in mmput, drain_mmlist and others)
|
||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
|
||||
* mapping->tree_lock (widely used)
|
||||
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||
* mapping->tree_lock (widely used, in set_page_dirty,
|
||||
* in arch-dependent flush_dcache_mmap_lock,
|
||||
* within bdi.wb->list_lock in __sync_single_inode)
|
||||
*
|
||||
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
|
||||
* ->tasklist_lock
|
||||
@@ -428,8 +429,10 @@ static void anon_vma_ctor(void *data)
|
||||
void __init anon_vma_init(void)
|
||||
{
|
||||
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
|
||||
0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
|
||||
anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
|
||||
0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
|
||||
anon_vma_ctor);
|
||||
anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
|
||||
SLAB_PANIC|SLAB_ACCOUNT);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -565,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
|
||||
anon_vma_unlock_read(anon_vma);
|
||||
}
|
||||
|
||||
/*
|
||||
* At what user virtual address is page expected in @vma?
|
||||
*/
|
||||
static inline unsigned long
|
||||
__vma_address(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
pgoff_t pgoff = page_to_pgoff(page);
|
||||
return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
inline unsigned long
|
||||
vma_address(struct page *page, struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long address = __vma_address(page, vma);
|
||||
|
||||
/* page should be within @vma mapping range */
|
||||
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
|
||||
|
||||
return address;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
static void percpu_flush_tlb_batch_pages(void *data)
|
||||
{
|
||||
@@ -817,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
/*
|
||||
* Check that @page is mapped at @address into @mm. In contrast to
|
||||
* page_check_address(), this function can handle transparent huge pages.
|
||||
*
|
||||
* On success returns true with pte mapped and locked. For PMD-mapped
|
||||
* transparent huge pages *@ptep is set to NULL.
|
||||
*/
|
||||
bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
|
||||
unsigned long address, pmd_t **pmdp,
|
||||
pte_t **ptep, spinlock_t **ptlp)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
|
||||
if (unlikely(PageHuge(page))) {
|
||||
/* when pud is not present, pte will be NULL */
|
||||
pte = huge_pte_offset(mm, address);
|
||||
if (!pte)
|
||||
return false;
|
||||
|
||||
ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
|
||||
pmd = NULL;
|
||||
goto check_pte;
|
||||
}
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
return false;
|
||||
pud = pud_offset(pgd, address);
|
||||
if (!pud_present(*pud))
|
||||
return false;
|
||||
pmd = pmd_offset(pud, address);
|
||||
|
||||
if (pmd_trans_huge(*pmd)) {
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (!pmd_present(*pmd))
|
||||
goto unlock_pmd;
|
||||
if (unlikely(!pmd_trans_huge(*pmd))) {
|
||||
spin_unlock(ptl);
|
||||
goto map_pte;
|
||||
}
|
||||
|
||||
if (pmd_page(*pmd) != page)
|
||||
goto unlock_pmd;
|
||||
|
||||
pte = NULL;
|
||||
goto found;
|
||||
unlock_pmd:
|
||||
spin_unlock(ptl);
|
||||
return false;
|
||||
} else {
|
||||
pmd_t pmde = *pmd;
|
||||
|
||||
barrier();
|
||||
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
|
||||
return false;
|
||||
}
|
||||
map_pte:
|
||||
pte = pte_offset_map(pmd, address);
|
||||
if (!pte_present(*pte)) {
|
||||
pte_unmap(pte);
|
||||
return false;
|
||||
}
|
||||
|
||||
ptl = pte_lockptr(mm, pmd);
|
||||
check_pte:
|
||||
spin_lock(ptl);
|
||||
|
||||
if (!pte_present(*pte)) {
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* THP can be referenced by any subpage */
|
||||
if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
return false;
|
||||
}
|
||||
found:
|
||||
*ptep = pte;
|
||||
*pmdp = pmd;
|
||||
*ptlp = ptl;
|
||||
return true;
|
||||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
struct page_referenced_arg {
|
||||
int mapcount;
|
||||
int referenced;
|
||||
@@ -830,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
|
||||
unsigned long address, void *arg)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct page_referenced_arg *pra = arg;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
int referenced = 0;
|
||||
struct page_referenced_arg *pra = arg;
|
||||
|
||||
if (unlikely(PageTransHuge(page))) {
|
||||
pmd_t *pmd;
|
||||
if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
|
||||
return SWAP_AGAIN;
|
||||
|
||||
/*
|
||||
* rmap might return false positives; we must filter
|
||||
* these out using page_check_address_pmd().
|
||||
*/
|
||||
pmd = page_check_address_pmd(page, mm, address,
|
||||
PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
|
||||
if (!pmd)
|
||||
return SWAP_AGAIN;
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
spin_unlock(ptl);
|
||||
pra->vm_flags |= VM_LOCKED;
|
||||
return SWAP_FAIL; /* To break the loop */
|
||||
}
|
||||
|
||||
/* go ahead even if the pmd is pmd_trans_splitting() */
|
||||
if (pmdp_clear_flush_young_notify(vma, address, pmd))
|
||||
referenced++;
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
if (pte)
|
||||
pte_unmap(pte);
|
||||
spin_unlock(ptl);
|
||||
} else {
|
||||
pte_t *pte;
|
||||
|
||||
/*
|
||||
* rmap might return false positives; we must filter
|
||||
* these out using page_check_address().
|
||||
*/
|
||||
pte = page_check_address(page, mm, address, &ptl, 0);
|
||||
if (!pte)
|
||||
return SWAP_AGAIN;
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
pra->vm_flags |= VM_LOCKED;
|
||||
return SWAP_FAIL; /* To break the loop */
|
||||
}
|
||||
pra->vm_flags |= VM_LOCKED;
|
||||
return SWAP_FAIL; /* To break the loop */
|
||||
}
|
||||
|
||||
if (pte) {
|
||||
if (ptep_clear_flush_young_notify(vma, address, pte)) {
|
||||
/*
|
||||
* Don't treat a reference through a sequentially read
|
||||
@@ -884,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
|
||||
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
|
||||
referenced++;
|
||||
}
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
pte_unmap(pte);
|
||||
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||
if (pmdp_clear_flush_young_notify(vma, address, pmd))
|
||||
referenced++;
|
||||
} else {
|
||||
/* unexpected pmd-mapped page? */
|
||||
WARN_ON_ONCE(1);
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
|
||||
if (referenced)
|
||||
clear_page_idle(page);
|
||||
@@ -933,7 +987,7 @@ int page_referenced(struct page *page,
|
||||
int ret;
|
||||
int we_locked = 0;
|
||||
struct page_referenced_arg pra = {
|
||||
.mapcount = page_mapcount(page),
|
||||
.mapcount = total_mapcount(page),
|
||||
.memcg = memcg,
|
||||
};
|
||||
struct rmap_walk_control rwc = {
|
||||
@@ -1122,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
|
||||
* over the call to page_add_new_anon_rmap.
|
||||
*/
|
||||
BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
|
||||
BUG_ON(page->index != linear_page_index(vma, address));
|
||||
BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1131,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page,
|
||||
* @page: the page to add the mapping to
|
||||
* @vma: the vm area in which the mapping is added
|
||||
* @address: the user virtual address mapped
|
||||
* @compound: charge the page as compound or small page
|
||||
*
|
||||
* The caller needs to hold the pte lock, and the page must be locked in
|
||||
* the anon_vma case: to serialize mapping,index checking after setting,
|
||||
@@ -1138,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page,
|
||||
* (but PageKsm is never downgraded to PageAnon).
|
||||
*/
|
||||
void page_add_anon_rmap(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
struct vm_area_struct *vma, unsigned long address, bool compound)
|
||||
{
|
||||
do_page_add_anon_rmap(page, vma, address, 0);
|
||||
do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1149,29 +1204,44 @@ void page_add_anon_rmap(struct page *page,
|
||||
* Everybody else should continue to use page_add_anon_rmap above.
|
||||
*/
|
||||
void do_page_add_anon_rmap(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address, int exclusive)
|
||||
struct vm_area_struct *vma, unsigned long address, int flags)
|
||||
{
|
||||
int first = atomic_inc_and_test(&page->_mapcount);
|
||||
bool compound = flags & RMAP_COMPOUND;
|
||||
bool first;
|
||||
|
||||
if (compound) {
|
||||
atomic_t *mapcount;
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
|
||||
mapcount = compound_mapcount_ptr(page);
|
||||
first = atomic_inc_and_test(mapcount);
|
||||
} else {
|
||||
first = atomic_inc_and_test(&page->_mapcount);
|
||||
}
|
||||
|
||||
if (first) {
|
||||
int nr = compound ? hpage_nr_pages(page) : 1;
|
||||
/*
|
||||
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
|
||||
* these counters are not modified in interrupt context, and
|
||||
* pte lock(a spinlock) is held, which implies preemption
|
||||
* disabled.
|
||||
*/
|
||||
if (PageTransHuge(page))
|
||||
if (compound) {
|
||||
__inc_zone_page_state(page,
|
||||
NR_ANON_TRANSPARENT_HUGEPAGES);
|
||||
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
|
||||
hpage_nr_pages(page));
|
||||
}
|
||||
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
|
||||
}
|
||||
if (unlikely(PageKsm(page)))
|
||||
return;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
|
||||
/* address might be in next vma when migration races vma_adjust */
|
||||
if (first)
|
||||
__page_set_anon_rmap(page, vma, address, exclusive);
|
||||
__page_set_anon_rmap(page, vma, address,
|
||||
flags & RMAP_EXCLUSIVE);
|
||||
else
|
||||
__page_check_anon_rmap(page, vma, address);
|
||||
}
|
||||
@@ -1181,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page,
|
||||
* @page: the page to add the mapping to
|
||||
* @vma: the vm area in which the mapping is added
|
||||
* @address: the user virtual address mapped
|
||||
* @compound: charge the page as compound or small page
|
||||
*
|
||||
* Same as page_add_anon_rmap but must only be called on *new* pages.
|
||||
* This means the inc-and-test can be bypassed.
|
||||
* Page does not have to be locked.
|
||||
*/
|
||||
void page_add_new_anon_rmap(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
struct vm_area_struct *vma, unsigned long address, bool compound)
|
||||
{
|
||||
int nr = compound ? hpage_nr_pages(page) : 1;
|
||||
|
||||
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
|
||||
SetPageSwapBacked(page);
|
||||
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
|
||||
if (PageTransHuge(page))
|
||||
if (compound) {
|
||||
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
|
||||
/* increment count (starts at -1) */
|
||||
atomic_set(compound_mapcount_ptr(page), 0);
|
||||
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
|
||||
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
|
||||
hpage_nr_pages(page));
|
||||
} else {
|
||||
/* Anon THP always mapped first with PMD */
|
||||
VM_BUG_ON_PAGE(PageTransCompound(page), page);
|
||||
/* increment count (starts at -1) */
|
||||
atomic_set(&page->_mapcount, 0);
|
||||
}
|
||||
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
|
||||
__page_set_anon_rmap(page, vma, address, 1);
|
||||
}
|
||||
|
||||
@@ -1223,14 +1303,17 @@ static void page_remove_file_rmap(struct page *page)
|
||||
|
||||
memcg = mem_cgroup_begin_page_stat(page);
|
||||
|
||||
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
|
||||
if (unlikely(PageHuge(page))) {
|
||||
/* hugetlb pages are always mapped with pmds */
|
||||
atomic_dec(compound_mapcount_ptr(page));
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* page still mapped by someone else? */
|
||||
if (!atomic_add_negative(-1, &page->_mapcount))
|
||||
goto out;
|
||||
|
||||
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
|
||||
if (unlikely(PageHuge(page)))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
|
||||
* these counters are not modified in interrupt context, and
|
||||
@@ -1245,41 +1328,79 @@ out:
|
||||
mem_cgroup_end_page_stat(memcg);
|
||||
}
|
||||
|
||||
/**
|
||||
* page_remove_rmap - take down pte mapping from a page
|
||||
* @page: page to remove mapping from
|
||||
*
|
||||
* The caller needs to hold the pte lock.
|
||||
*/
|
||||
void page_remove_rmap(struct page *page)
|
||||
static void page_remove_anon_compound_rmap(struct page *page)
|
||||
{
|
||||
if (!PageAnon(page)) {
|
||||
page_remove_file_rmap(page);
|
||||
return;
|
||||
}
|
||||
int i, nr;
|
||||
|
||||
/* page still mapped by someone else? */
|
||||
if (!atomic_add_negative(-1, &page->_mapcount))
|
||||
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
|
||||
return;
|
||||
|
||||
/* Hugepages are not counted in NR_ANON_PAGES for now. */
|
||||
if (unlikely(PageHuge(page)))
|
||||
return;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
|
||||
return;
|
||||
|
||||
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
|
||||
|
||||
if (TestClearPageDoubleMap(page)) {
|
||||
/*
|
||||
* Subpages can be mapped with PTEs too. Check how many of
|
||||
* themi are still mapped.
|
||||
*/
|
||||
for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
|
||||
if (atomic_add_negative(-1, &page[i]._mapcount))
|
||||
nr++;
|
||||
}
|
||||
} else {
|
||||
nr = HPAGE_PMD_NR;
|
||||
}
|
||||
|
||||
if (unlikely(PageMlocked(page)))
|
||||
clear_page_mlock(page);
|
||||
|
||||
if (nr) {
|
||||
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
|
||||
deferred_split_huge_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* page_remove_rmap - take down pte mapping from a page
|
||||
* @page: page to remove mapping from
|
||||
* @compound: uncharge the page as compound or small page
|
||||
*
|
||||
* The caller needs to hold the pte lock.
|
||||
*/
|
||||
void page_remove_rmap(struct page *page, bool compound)
|
||||
{
|
||||
if (!PageAnon(page)) {
|
||||
VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
|
||||
page_remove_file_rmap(page);
|
||||
return;
|
||||
}
|
||||
|
||||
if (compound)
|
||||
return page_remove_anon_compound_rmap(page);
|
||||
|
||||
/* page still mapped by someone else? */
|
||||
if (!atomic_add_negative(-1, &page->_mapcount))
|
||||
return;
|
||||
|
||||
/*
|
||||
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
|
||||
* these counters are not modified in interrupt context, and
|
||||
* pte lock(a spinlock) is held, which implies preemption disabled.
|
||||
*/
|
||||
if (PageTransHuge(page))
|
||||
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
|
||||
|
||||
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
|
||||
-hpage_nr_pages(page));
|
||||
__dec_zone_page_state(page, NR_ANON_PAGES);
|
||||
|
||||
if (unlikely(PageMlocked(page)))
|
||||
clear_page_mlock(page);
|
||||
|
||||
if (PageTransCompound(page))
|
||||
deferred_split_huge_page(compound_head(page));
|
||||
|
||||
/*
|
||||
* It would be tidy to reset the PageAnon mapping here,
|
||||
* but that might overwrite a racing page_add_anon_rmap
|
||||
@@ -1291,6 +1412,11 @@ void page_remove_rmap(struct page *page)
|
||||
*/
|
||||
}
|
||||
|
||||
struct rmap_private {
|
||||
enum ttu_flags flags;
|
||||
int lazyfreed;
|
||||
};
|
||||
|
||||
/*
|
||||
* @arg: enum ttu_flags will be passed to this argument
|
||||
*/
|
||||
@@ -1302,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
pte_t pteval;
|
||||
spinlock_t *ptl;
|
||||
int ret = SWAP_AGAIN;
|
||||
enum ttu_flags flags = (enum ttu_flags)arg;
|
||||
struct rmap_private *rp = arg;
|
||||
enum ttu_flags flags = rp->flags;
|
||||
|
||||
/* munlock has nothing to gain from examining un-locked vmas */
|
||||
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
|
||||
@@ -1362,10 +1489,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
if (PageHuge(page)) {
|
||||
hugetlb_count_sub(1 << compound_order(page), mm);
|
||||
} else {
|
||||
if (PageAnon(page))
|
||||
dec_mm_counter(mm, MM_ANONPAGES);
|
||||
else
|
||||
dec_mm_counter(mm, MM_FILEPAGES);
|
||||
dec_mm_counter(mm, mm_counter(page));
|
||||
}
|
||||
set_pte_at(mm, address, pte,
|
||||
swp_entry_to_pte(make_hwpoison_entry(page)));
|
||||
@@ -1375,10 +1499,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
* interest anymore. Simply discard the pte, vmscan
|
||||
* will take care of the rest.
|
||||
*/
|
||||
if (PageAnon(page))
|
||||
dec_mm_counter(mm, MM_ANONPAGES);
|
||||
else
|
||||
dec_mm_counter(mm, MM_FILEPAGES);
|
||||
dec_mm_counter(mm, mm_counter(page));
|
||||
} else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
|
||||
swp_entry_t entry;
|
||||
pte_t swp_pte;
|
||||
@@ -1400,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
* See handle_pte_fault() ...
|
||||
*/
|
||||
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
|
||||
|
||||
if (!PageDirty(page) && (flags & TTU_LZFREE)) {
|
||||
/* It's a freeable page by MADV_FREE */
|
||||
dec_mm_counter(mm, MM_ANONPAGES);
|
||||
rp->lazyfreed++;
|
||||
goto discard;
|
||||
}
|
||||
|
||||
if (swap_duplicate(entry) < 0) {
|
||||
set_pte_at(mm, address, pte, pteval);
|
||||
ret = SWAP_FAIL;
|
||||
@@ -1418,9 +1547,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
set_pte_at(mm, address, pte, swp_pte);
|
||||
} else
|
||||
dec_mm_counter(mm, MM_FILEPAGES);
|
||||
dec_mm_counter(mm, mm_counter_file(page));
|
||||
|
||||
page_remove_rmap(page);
|
||||
discard:
|
||||
page_remove_rmap(page, PageHuge(page));
|
||||
page_cache_release(page);
|
||||
|
||||
out_unmap:
|
||||
@@ -1472,9 +1602,14 @@ static int page_not_mapped(struct page *page)
|
||||
int try_to_unmap(struct page *page, enum ttu_flags flags)
|
||||
{
|
||||
int ret;
|
||||
struct rmap_private rp = {
|
||||
.flags = flags,
|
||||
.lazyfreed = 0,
|
||||
};
|
||||
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = try_to_unmap_one,
|
||||
.arg = (void *)flags,
|
||||
.arg = &rp,
|
||||
.done = page_not_mapped,
|
||||
.anon_lock = page_lock_anon_vma_read,
|
||||
};
|
||||
@@ -1494,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
|
||||
|
||||
ret = rmap_walk(page, &rwc);
|
||||
|
||||
if (ret != SWAP_MLOCK && !page_mapped(page))
|
||||
if (ret != SWAP_MLOCK && !page_mapped(page)) {
|
||||
ret = SWAP_SUCCESS;
|
||||
if (rp.lazyfreed && !PageDirty(page))
|
||||
ret = SWAP_LZFREE;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1517,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
|
||||
int try_to_munlock(struct page *page)
|
||||
{
|
||||
int ret;
|
||||
struct rmap_private rp = {
|
||||
.flags = TTU_MUNLOCK,
|
||||
.lazyfreed = 0,
|
||||
};
|
||||
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = try_to_unmap_one,
|
||||
.arg = (void *)TTU_MUNLOCK,
|
||||
.arg = &rp,
|
||||
.done = page_not_mapped,
|
||||
.anon_lock = page_lock_anon_vma_read,
|
||||
|
||||
@@ -1702,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page,
|
||||
BUG_ON(!PageLocked(page));
|
||||
BUG_ON(!anon_vma);
|
||||
/* address might be in next vma when migration races vma_adjust */
|
||||
first = atomic_inc_and_test(&page->_mapcount);
|
||||
first = atomic_inc_and_test(compound_mapcount_ptr(page));
|
||||
if (first)
|
||||
__hugepage_set_anon_rmap(page, vma, address, 0);
|
||||
}
|
||||
@@ -1711,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
{
|
||||
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
|
||||
atomic_set(&page->_mapcount, 0);
|
||||
atomic_set(compound_mapcount_ptr(page), 0);
|
||||
__hugepage_set_anon_rmap(page, vma, address, 1);
|
||||
}
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
135
mm/shmem.c
135
mm/shmem.c
@@ -359,6 +359,87 @@ static int shmem_free_swap(struct address_space *mapping,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine (in bytes) how many of the shmem object's pages mapped by the
|
||||
* given offsets are swapped out.
|
||||
*
|
||||
* This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
|
||||
* as long as the inode doesn't go away and racy results are not a problem.
|
||||
*/
|
||||
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end)
|
||||
{
|
||||
struct radix_tree_iter iter;
|
||||
void **slot;
|
||||
struct page *page;
|
||||
unsigned long swapped = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
restart:
|
||||
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
|
||||
if (iter.index >= end)
|
||||
break;
|
||||
|
||||
page = radix_tree_deref_slot(slot);
|
||||
|
||||
/*
|
||||
* This should only be possible to happen at index 0, so we
|
||||
* don't need to reset the counter, nor do we risk infinite
|
||||
* restarts.
|
||||
*/
|
||||
if (radix_tree_deref_retry(page))
|
||||
goto restart;
|
||||
|
||||
if (radix_tree_exceptional_entry(page))
|
||||
swapped++;
|
||||
|
||||
if (need_resched()) {
|
||||
cond_resched_rcu();
|
||||
start = iter.index + 1;
|
||||
goto restart;
|
||||
}
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return swapped << PAGE_SHIFT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine (in bytes) how many of the shmem object's pages mapped by the
|
||||
* given vma is swapped out.
|
||||
*
|
||||
* This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
|
||||
* as long as the inode doesn't go away and racy results are not a problem.
|
||||
*/
|
||||
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
|
||||
{
|
||||
struct inode *inode = file_inode(vma->vm_file);
|
||||
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
unsigned long swapped;
|
||||
|
||||
/* Be careful as we don't hold info->lock */
|
||||
swapped = READ_ONCE(info->swapped);
|
||||
|
||||
/*
|
||||
* The easier cases are when the shmem object has nothing in swap, or
|
||||
* the vma maps it whole. Then we can simply use the stats that we
|
||||
* already track.
|
||||
*/
|
||||
if (!swapped)
|
||||
return 0;
|
||||
|
||||
if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
|
||||
return swapped << PAGE_SHIFT;
|
||||
|
||||
/* Here comes the more involved part */
|
||||
return shmem_partial_swap_usage(mapping,
|
||||
linear_page_index(vma, vma->vm_start),
|
||||
linear_page_index(vma, vma->vm_end));
|
||||
}
|
||||
|
||||
/*
|
||||
* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
|
||||
*/
|
||||
@@ -620,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode)
|
||||
list_del_init(&info->swaplist);
|
||||
mutex_unlock(&shmem_swaplist_mutex);
|
||||
}
|
||||
} else
|
||||
kfree(info->symlink);
|
||||
}
|
||||
|
||||
simple_xattrs_free(&info->xattrs);
|
||||
WARN_ON(inode->i_blocks);
|
||||
@@ -729,7 +809,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
|
||||
* the shmem_swaplist_mutex which might hold up shmem_writepage().
|
||||
* Charged back to the user (not to caller) when swap account is used.
|
||||
*/
|
||||
error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
|
||||
error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
|
||||
false);
|
||||
if (error)
|
||||
goto out;
|
||||
/* No radix_tree_preload: swap entry keeps a place for page in tree */
|
||||
@@ -752,9 +833,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
|
||||
if (error) {
|
||||
if (error != -ENOMEM)
|
||||
error = 0;
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
} else
|
||||
mem_cgroup_commit_charge(page, memcg, true);
|
||||
mem_cgroup_commit_charge(page, memcg, true, false);
|
||||
out:
|
||||
unlock_page(page);
|
||||
page_cache_release(page);
|
||||
@@ -830,6 +911,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
|
||||
if (!swap.val)
|
||||
goto redirty;
|
||||
|
||||
if (mem_cgroup_try_charge_swap(page, swap))
|
||||
goto free_swap;
|
||||
|
||||
/*
|
||||
* Add inode to shmem_unuse()'s list of swapped-out inodes,
|
||||
* if it's not already there. Do it now before the page is
|
||||
@@ -858,6 +942,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
|
||||
}
|
||||
|
||||
mutex_unlock(&shmem_swaplist_mutex);
|
||||
free_swap:
|
||||
swapcache_free(swap);
|
||||
redirty:
|
||||
set_page_dirty(page);
|
||||
@@ -1004,7 +1089,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
|
||||
copy_highpage(newpage, oldpage);
|
||||
flush_dcache_page(newpage);
|
||||
|
||||
__set_page_locked(newpage);
|
||||
__SetPageLocked(newpage);
|
||||
SetPageUptodate(newpage);
|
||||
SetPageSwapBacked(newpage);
|
||||
set_page_private(newpage, swap_index);
|
||||
@@ -1137,7 +1222,8 @@ repeat:
|
||||
goto failed;
|
||||
}
|
||||
|
||||
error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
|
||||
error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
|
||||
false);
|
||||
if (!error) {
|
||||
error = shmem_add_to_page_cache(page, mapping, index,
|
||||
swp_to_radix_entry(swap));
|
||||
@@ -1154,14 +1240,14 @@ repeat:
|
||||
* "repeat": reading a hole and writing should succeed.
|
||||
*/
|
||||
if (error) {
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
delete_from_swap_cache(page);
|
||||
}
|
||||
}
|
||||
if (error)
|
||||
goto failed;
|
||||
|
||||
mem_cgroup_commit_charge(page, memcg, true);
|
||||
mem_cgroup_commit_charge(page, memcg, true, false);
|
||||
|
||||
spin_lock(&info->lock);
|
||||
info->swapped--;
|
||||
@@ -1196,11 +1282,12 @@ repeat:
|
||||
}
|
||||
|
||||
__SetPageSwapBacked(page);
|
||||
__set_page_locked(page);
|
||||
__SetPageLocked(page);
|
||||
if (sgp == SGP_WRITE)
|
||||
__SetPageReferenced(page);
|
||||
|
||||
error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
|
||||
error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
|
||||
false);
|
||||
if (error)
|
||||
goto decused;
|
||||
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
|
||||
@@ -1210,10 +1297,10 @@ repeat:
|
||||
radix_tree_preload_end();
|
||||
}
|
||||
if (error) {
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
goto decused;
|
||||
}
|
||||
mem_cgroup_commit_charge(page, memcg, false);
|
||||
mem_cgroup_commit_charge(page, memcg, false, false);
|
||||
lru_cache_add_anon(page);
|
||||
|
||||
spin_lock(&info->lock);
|
||||
@@ -1814,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
|
||||
if (whence != SEEK_DATA && whence != SEEK_HOLE)
|
||||
return generic_file_llseek_size(file, offset, whence,
|
||||
MAX_LFS_FILESIZE, i_size_read(inode));
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
/* We're holding i_mutex so we can access i_size directly */
|
||||
|
||||
if (offset < 0)
|
||||
@@ -1838,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
|
||||
|
||||
if (offset >= 0)
|
||||
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
return offset;
|
||||
}
|
||||
|
||||
@@ -2003,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
|
||||
if (seals & ~(unsigned int)F_ALL_SEALS)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
if (info->seals & F_SEAL_SEAL) {
|
||||
error = -EPERM;
|
||||
@@ -2026,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
|
||||
error = 0;
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(shmem_add_seals);
|
||||
@@ -2076,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
|
||||
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
|
||||
if (mode & FALLOC_FL_PUNCH_HOLE) {
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
@@ -2189,7 +2276,7 @@ undone:
|
||||
inode->i_private = NULL;
|
||||
spin_unlock(&inode->i_lock);
|
||||
out:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
return error;
|
||||
}
|
||||
|
||||
@@ -2461,14 +2548,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
|
||||
info = SHMEM_I(inode);
|
||||
inode->i_size = len-1;
|
||||
if (len <= SHORT_SYMLINK_LEN) {
|
||||
info->symlink = kmemdup(symname, len, GFP_KERNEL);
|
||||
if (!info->symlink) {
|
||||
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
|
||||
if (!inode->i_link) {
|
||||
iput(inode);
|
||||
return -ENOMEM;
|
||||
}
|
||||
inode->i_op = &shmem_short_symlink_operations;
|
||||
inode->i_link = info->symlink;
|
||||
} else {
|
||||
inode_nohighmem(inode);
|
||||
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
|
||||
if (error) {
|
||||
iput(inode);
|
||||
@@ -2476,7 +2563,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
|
||||
}
|
||||
inode->i_mapping->a_ops = &shmem_aops;
|
||||
inode->i_op = &shmem_symlink_inode_operations;
|
||||
inode_nohighmem(inode);
|
||||
memcpy(page_address(page), symname, len);
|
||||
SetPageUptodate(page);
|
||||
set_page_dirty(page);
|
||||
@@ -3044,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
|
||||
static void shmem_destroy_callback(struct rcu_head *head)
|
||||
{
|
||||
struct inode *inode = container_of(head, struct inode, i_rcu);
|
||||
kfree(inode->i_link);
|
||||
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
|
||||
}
|
||||
|
||||
@@ -3064,7 +3151,7 @@ static int shmem_init_inodecache(void)
|
||||
{
|
||||
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
|
||||
sizeof(struct shmem_inode_info),
|
||||
0, SLAB_PANIC, shmem_init_inode);
|
||||
0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
mm/slab.c | 48
@@ -2756,6 +2756,21 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define cache_free_debugcheck(x,objp,z) (objp)
#endif

static struct page *get_first_slab(struct kmem_cache_node *n)
{
	struct page *page;

	page = list_first_entry_or_null(&n->slabs_partial,
			struct page, lru);
	if (!page) {
		n->free_touched = 1;
		page = list_first_entry_or_null(&n->slabs_free,
				struct page, lru);
	}

	return page;
}

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
							bool force_refill)
{
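get_first_slab() above centralises the two-step lookup that the following hunks then reuse: try the partial list first, fall back to the free list and remember that it was touched. A standalone sketch of that pattern (not kernel code); the list helpers are re-derived so it compiles on its own, and struct slab / struct node are reduced stand-ins for the kernel's struct page and struct kmem_cache_node.

/*
 * Standalone sketch of the get_first_slab() lookup pattern.
 */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_empty(head) ((head)->next == (head))
#define list_first_entry_or_null(head, type, member) \
	(list_empty(head) ? NULL : list_entry((head)->next, type, member))

struct slab {				/* stand-in for struct page used as a slab */
	struct list_head lru;
	int id;
};

struct node {				/* stand-in for struct kmem_cache_node */
	struct list_head slabs_partial;
	struct list_head slabs_free;
	int free_touched;
};

static struct slab *get_first_slab(struct node *n)
{
	struct slab *s;

	s = list_first_entry_or_null(&n->slabs_partial, struct slab, lru);
	if (!s) {
		n->free_touched = 1;
		s = list_first_entry_or_null(&n->slabs_free, struct slab, lru);
	}
	return s;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct node n = {
		.slabs_partial = LIST_HEAD_INIT(n.slabs_partial),
		.slabs_free    = LIST_HEAD_INIT(n.slabs_free),
	};
	struct slab a = { .id = 1 };
	struct slab *s;

	list_add_tail(&a.lru, &n.slabs_free);	/* only the free list has a slab */
	s = get_first_slab(&n);
	printf("picked slab %d, free_touched=%d\n", s->id, n.free_touched);
	return 0;
}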
@@ -2791,18 +2806,12 @@ retry:
|
||||
}
|
||||
|
||||
while (batchcount > 0) {
|
||||
struct list_head *entry;
|
||||
struct page *page;
|
||||
/* Get slab alloc is to come from. */
|
||||
entry = n->slabs_partial.next;
|
||||
if (entry == &n->slabs_partial) {
|
||||
n->free_touched = 1;
|
||||
entry = n->slabs_free.next;
|
||||
if (entry == &n->slabs_free)
|
||||
goto must_grow;
|
||||
}
|
||||
page = get_first_slab(n);
|
||||
if (!page)
|
||||
goto must_grow;
|
||||
|
||||
page = list_entry(entry, struct page, lru);
|
||||
check_spinlock_acquired(cachep);
|
||||
|
||||
/*
|
||||
@@ -3085,7 +3094,6 @@ retry:
|
||||
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
|
||||
int nodeid)
|
||||
{
|
||||
struct list_head *entry;
|
||||
struct page *page;
|
||||
struct kmem_cache_node *n;
|
||||
void *obj;
|
||||
@@ -3098,15 +3106,10 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
|
||||
retry:
|
||||
check_irq_off();
|
||||
spin_lock(&n->list_lock);
|
||||
entry = n->slabs_partial.next;
|
||||
if (entry == &n->slabs_partial) {
|
||||
n->free_touched = 1;
|
||||
entry = n->slabs_free.next;
|
||||
if (entry == &n->slabs_free)
|
||||
goto must_grow;
|
||||
}
|
||||
page = get_first_slab(n);
|
||||
if (!page)
|
||||
goto must_grow;
|
||||
|
||||
page = list_entry(entry, struct page, lru);
|
||||
check_spinlock_acquired_node(cachep, nodeid);
|
||||
|
||||
STATS_INC_NODEALLOCS(cachep);
|
||||
@@ -3338,17 +3341,12 @@ free_done:
|
||||
#if STATS
|
||||
{
|
||||
int i = 0;
|
||||
struct list_head *p;
|
||||
struct page *page;
|
||||
|
||||
p = n->slabs_free.next;
|
||||
while (p != &(n->slabs_free)) {
|
||||
struct page *page;
|
||||
|
||||
page = list_entry(p, struct page, lru);
|
||||
list_for_each_entry(page, &n->slabs_free, lru) {
|
||||
BUG_ON(page->active);
|
||||
|
||||
i++;
|
||||
p = p->next;
|
||||
}
|
||||
STATS_SET_FREEABLE(cachep, i);
|
||||
}
|
||||
mm/slab.h | 11
@@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,

#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
			  SLAB_NOTRACK | SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
			  SLAB_TEMPORARY | SLAB_NOTRACK)
			  SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
@@ -172,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);

#ifdef CONFIG_MEMCG_KMEM
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
 * Iterate over all memcg caches of the given root cache. The caller must hold
 * slab_mutex.
@@ -250,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page,

extern void slab_init_memcg_params(struct kmem_cache *);

#else /* !CONFIG_MEMCG_KMEM */
#else /* CONFIG_MEMCG && !CONFIG_SLOB */

#define for_each_memcg_cache(iter, root) \
		for ((void)(iter), (void)(root); 0; )
@@ -291,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG_KMEM */
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */

static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
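SLAB_ACCOUNT joins SLAB_CACHE_FLAGS above and, in the mm/slab_common.c hunk that follows, the SLAB_MERGE_SAME mask, so an accounted cache is never merged with an unaccounted one. A standalone sketch of that merge test (not kernel code); the bit positions are placeholders rather than the real slab.h values, and mergeable() only mirrors the flags comparison done by find_mergeable().

/*
 * Standalone sketch of the "flags in the must-match mask agree" merge test.
 */
#include <stdio.h>

#define SLAB_RECLAIM_ACCOUNT	(1UL << 0)
#define SLAB_CACHE_DMA		(1UL << 1)
#define SLAB_NOTRACK		(1UL << 2)
#define SLAB_ACCOUNT		(1UL << 3)

#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
			 SLAB_NOTRACK | SLAB_ACCOUNT)

static int mergeable(unsigned long a, unsigned long b)
{
	/* caches merge only when the must-match flag bits are identical */
	return ((a ^ b) & SLAB_MERGE_SAME) == 0;
}

int main(void)
{
	unsigned long plain = 0;
	unsigned long accounted = SLAB_ACCOUNT;

	printf("plain + plain:     %d\n", mergeable(plain, plain));	/* 1 */
	printf("plain + accounted: %d\n", mergeable(plain, accounted));	/* 0 */
	return 0;
}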
@@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache;
|
||||
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
|
||||
SLAB_FAILSLAB)
|
||||
|
||||
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
|
||||
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
|
||||
SLAB_NOTRACK | SLAB_ACCOUNT)
|
||||
|
||||
/*
|
||||
* Merge control. If this is set then no merging of slab caches will occur.
|
||||
@@ -127,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
|
||||
return i;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
||||
void slab_init_memcg_params(struct kmem_cache *s)
|
||||
{
|
||||
s->memcg_params.is_root_cache = true;
|
||||
@@ -220,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s,
|
||||
static inline void destroy_memcg_params(struct kmem_cache *s)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
||||
|
||||
/*
|
||||
* Find a mergeable slab cache
|
||||
@@ -476,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
||||
/*
|
||||
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
|
||||
* @memcg: The memory cgroup the new cache is for.
|
||||
@@ -502,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
|
||||
mutex_lock(&slab_mutex);
|
||||
|
||||
/*
|
||||
* The memory cgroup could have been deactivated while the cache
|
||||
* The memory cgroup could have been offlined while the cache
|
||||
* creation work was pending.
|
||||
*/
|
||||
if (!memcg_kmem_is_active(memcg))
|
||||
if (!memcg_kmem_online(memcg))
|
||||
goto out_unlock;
|
||||
|
||||
idx = memcg_cache_id(memcg);
|
||||
@@ -688,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
|
||||
|
||||
void slab_kmem_cache_release(struct kmem_cache *s)
|
||||
{
|
||||
@@ -1122,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
|
||||
int memcg_slab_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
|
||||
mm/slub.c | 14
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
|
||||
*/
|
||||
static __always_inline void slab_lock(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
bit_spin_lock(PG_locked, &page->flags);
|
||||
}
|
||||
|
||||
static __always_inline void slab_unlock(struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
__bit_spin_unlock(PG_locked, &page->flags);
|
||||
}
|
||||
|
||||
@@ -5205,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
|
||||
return -EIO;
|
||||
|
||||
err = attribute->store(s, buf, len);
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
|
||||
struct kmem_cache *c;
|
||||
|
||||
@@ -5240,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
|
||||
|
||||
static void memcg_propagate_slab_attrs(struct kmem_cache *s)
|
||||
{
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#ifdef CONFIG_MEMCG
|
||||
int i;
|
||||
char *buffer = NULL;
|
||||
struct kmem_cache *root_cache;
|
||||
@@ -5326,7 +5328,7 @@ static struct kset *slab_kset;
|
||||
|
||||
static inline struct kset *cache_kset(struct kmem_cache *s)
|
||||
{
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (!is_root_cache(s))
|
||||
return s->memcg_params.root_cache->memcg_kset;
|
||||
#endif
|
||||
@@ -5362,6 +5364,8 @@ static char *create_unique_id(struct kmem_cache *s)
|
||||
*p++ = 'F';
|
||||
if (!(s->flags & SLAB_NOTRACK))
|
||||
*p++ = 't';
|
||||
if (s->flags & SLAB_ACCOUNT)
|
||||
*p++ = 'A';
|
||||
if (p != name + 1)
|
||||
*p++ = '-';
|
||||
p += sprintf(p, "%07d", s->size);
|
||||
@@ -5401,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
|
||||
if (err)
|
||||
goto out_del_kobj;
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (is_root_cache(s)) {
|
||||
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
|
||||
if (!s->memcg_kset) {
|
||||
@@ -5434,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s)
|
||||
*/
|
||||
return;
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#ifdef CONFIG_MEMCG
|
||||
kset_unregister(s->memcg_kset);
|
||||
#endif
|
||||
kobject_uevent(&s->kobj, KOBJ_REMOVE);
|
||||
|
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
}

/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
static void * __meminit alloc_block_buf(unsigned long size, int node)
{
	void *ptr;

@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
	return ptr;
}

static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
	unsigned long allocated = altmap->alloc + altmap->align;

	if (altmap->free > allocated)
		return altmap->free - allocated;
	return 0;
}

/**
 * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
 * @altmap - reserved page pool for the allocation
 * @nr_pfns - size (in pages) of the allocation
 *
 * Allocations are aligned to the size of the request
 */
static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
		unsigned long nr_pfns)
{
	unsigned long pfn = vmem_altmap_next_pfn(altmap);
	unsigned long nr_align;

	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
	nr_align = ALIGN(pfn, nr_align) - pfn;

	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
		return ULONG_MAX;
	altmap->alloc += nr_pfns;
	altmap->align += nr_align;
	return pfn + nr_align;
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
		struct vmem_altmap *altmap)
{
	unsigned long pfn, nr_pfns;
	void *ptr;

	if (size & ~PAGE_MASK) {
		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
				__func__, size);
		return NULL;
	}

	nr_pfns = size >> PAGE_SHIFT;
	pfn = vmem_altmap_alloc(altmap, nr_pfns);
	if (pfn < ULONG_MAX)
		ptr = __va(__pfn_to_phys(pfn));
	else
		ptr = NULL;
	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
			__func__, pfn, altmap->alloc, altmap->align, nr_pfns);

	return ptr;
}

/* need to make sure size is all the same during early stage */
void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
		struct vmem_altmap *altmap)
{
	if (altmap)
		return altmap_alloc_block_buf(size, altmap);
	return alloc_block_buf(size, node);
}

void __meminit vmemmap_verify(pte_t *pte, int node,
				unsigned long start, unsigned long end)
{
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte)) {
		pte_t entry;
		void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
		void *p = alloc_block_buf(PAGE_SIZE, node);
		if (!p)
			return NULL;
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
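vmem_altmap_alloc() above is a small bump allocator over a reserved pfn range: each request is aligned to its own power-of-two-rounded size and the padding is booked separately in ->align. A standalone sketch of that accounting (not kernel code); struct altmap is a reduced stand-in for struct vmem_altmap and the lowest-set-bit trick stands in for find_first_bit().

/*
 * Standalone sketch of the vmem_altmap bump-allocator accounting.
 */
#include <limits.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct altmap {
	unsigned long base_pfn;		/* first pfn of the reserved range */
	unsigned long reserve;		/* pfns set aside before any allocation */
	unsigned long free;		/* pfns available for allocation */
	unsigned long align;		/* pfns lost to alignment so far */
	unsigned long alloc;		/* pfns handed out so far */
};

static unsigned long next_pfn(const struct altmap *m)
{
	return m->base_pfn + m->reserve + m->alloc + m->align;
}

static unsigned long nr_free(const struct altmap *m)
{
	unsigned long used = m->alloc + m->align;

	return m->free > used ? m->free - used : 0;
}

static unsigned long altmap_alloc(struct altmap *m, unsigned long nr_pfns)
{
	unsigned long pfn = next_pfn(m);
	unsigned long nr_align;

	/* lowest set bit of the request size, i.e. its natural alignment */
	nr_align = nr_pfns & -nr_pfns;
	nr_align = ALIGN(pfn, nr_align) - pfn;

	if (nr_pfns + nr_align > nr_free(m))
		return ULONG_MAX;
	m->alloc += nr_pfns;
	m->align += nr_align;
	return pfn + nr_align;
}

int main(void)
{
	struct altmap m = { .base_pfn = 0x1000, .reserve = 2, .free = 64 };

	printf("first:  %#lx\n", altmap_alloc(&m, 8));	/* padded up to an 8-pfn boundary */
	printf("second: %#lx\n", altmap_alloc(&m, 8));
	return 0;
}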
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
	if (!memmap)
		return;

	for (i = 0; i < PAGES_PER_SECTION; i++) {
	for (i = 0; i < nr_pages; i++) {
		if (PageHWPoison(&memmap[i])) {
			atomic_long_sub(1, &num_poisoned_pages);
			ClearPageHWPoison(&memmap[i]);
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
		free_map_bootmem(memmap);
}

void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
		unsigned long map_offset)
{
	struct page *memmap = NULL;
	unsigned long *usemap = NULL, flags;
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
	}
	pgdat_resize_unlock(pgdat, &flags);

	clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
	clear_hwpoisoned_pages(memmap + map_offset,
			PAGES_PER_SECTION - map_offset);
	free_section_usemap(memmap, usemap);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
mm/swap.c | 319
@@ -24,6 +24,7 @@
|
||||
#include <linux/export.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/percpu_counter.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/notifier.h>
|
||||
@@ -45,6 +46,7 @@ int page_cluster;
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
|
||||
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
|
||||
|
||||
/*
|
||||
* This path almost never happens for VM activity - pages are normally
|
||||
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
|
||||
(*dtor)(page);
|
||||
}
|
||||
|
||||
/**
|
||||
* Two special cases here: we could avoid taking compound_lock_irqsave
|
||||
* and could skip the tail refcounting(in _mapcount).
|
||||
*
|
||||
* 1. Hugetlbfs page:
|
||||
*
|
||||
* PageHeadHuge will remain true until the compound page
|
||||
* is released and enters the buddy allocator, and it could
|
||||
* not be split by __split_huge_page_refcount().
|
||||
*
|
||||
* So if we see PageHeadHuge set, and we have the tail page pin,
|
||||
* then we could safely put head page.
|
||||
*
|
||||
* 2. Slab THP page:
|
||||
*
|
||||
* PG_slab is cleared before the slab frees the head page, and
|
||||
* tail pin cannot be the last reference left on the head page,
|
||||
* because the slab code is free to reuse the compound page
|
||||
* after a kfree/kmem_cache_free without having to check if
|
||||
* there's any tail pin left. In turn all tail pins must be always
|
||||
* released while the head is still pinned by the slab code
|
||||
* and so we know PG_slab will be still set too.
|
||||
*
|
||||
* So if we see PageSlab set, and we have the tail page pin,
|
||||
* then we could safely put head page.
|
||||
*/
|
||||
static __always_inline
|
||||
void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
|
||||
{
|
||||
/*
|
||||
* If @page is a THP tail, we must read the tail page
|
||||
* flags after the head page flags. The
|
||||
* __split_huge_page_refcount side enforces write memory barriers
|
||||
* between clearing PageTail and before the head page
|
||||
* can be freed and reallocated.
|
||||
*/
|
||||
smp_rmb();
|
||||
if (likely(PageTail(page))) {
|
||||
/*
|
||||
* __split_huge_page_refcount cannot race
|
||||
* here, see the comment above this function.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
|
||||
if (put_page_testzero(page_head)) {
|
||||
/*
|
||||
* If this is the tail of a slab THP page,
|
||||
* the tail pin must not be the last reference
|
||||
* held on the page, because the PG_slab cannot
|
||||
* be cleared before all tail pins (which skips
|
||||
* the _mapcount tail refcounting) have been
|
||||
* released.
|
||||
*
|
||||
* If this is the tail of a hugetlbfs page,
|
||||
* the tail pin may be the last reference on
|
||||
* the page instead, because PageHeadHuge will
|
||||
* not go away until the compound page enters
|
||||
* the buddy allocator.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
|
||||
__put_compound_page(page_head);
|
||||
}
|
||||
} else
|
||||
/*
|
||||
* __split_huge_page_refcount run before us,
|
||||
* @page was a THP tail. The split @page_head
|
||||
* has been freed and reallocated as slab or
|
||||
* hugetlbfs page of smaller order (only
|
||||
* possible if reallocated as slab on x86).
|
||||
*/
|
||||
if (put_page_testzero(page))
|
||||
__put_single_page(page);
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
void put_refcounted_compound_page(struct page *page_head, struct page *page)
|
||||
{
|
||||
if (likely(page != page_head && get_page_unless_zero(page_head))) {
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* @page_head wasn't a dangling pointer but it may not
|
||||
* be a head page anymore by the time we obtain the
|
||||
* lock. That is ok as long as it can't be freed from
|
||||
* under us.
|
||||
*/
|
||||
flags = compound_lock_irqsave(page_head);
|
||||
if (unlikely(!PageTail(page))) {
|
||||
/* __split_huge_page_refcount run before us */
|
||||
compound_unlock_irqrestore(page_head, flags);
|
||||
if (put_page_testzero(page_head)) {
|
||||
/*
|
||||
* The @page_head may have been freed
|
||||
* and reallocated as a compound page
|
||||
* of smaller order and then freed
|
||||
* again. All we know is that it
|
||||
* cannot have become: a THP page, a
|
||||
* compound page of higher order, a
|
||||
* tail page. That is because we
|
||||
* still hold the refcount of the
|
||||
* split THP tail and page_head was
|
||||
* the THP head before the split.
|
||||
*/
|
||||
if (PageHead(page_head))
|
||||
__put_compound_page(page_head);
|
||||
else
|
||||
__put_single_page(page_head);
|
||||
}
|
||||
out_put_single:
|
||||
if (put_page_testzero(page))
|
||||
__put_single_page(page);
|
||||
return;
|
||||
}
|
||||
VM_BUG_ON_PAGE(page_head != compound_head(page), page);
|
||||
/*
|
||||
* We can release the refcount taken by
|
||||
* get_page_unless_zero() now that
|
||||
* __split_huge_page_refcount() is blocked on the
|
||||
* compound_lock.
|
||||
*/
|
||||
if (put_page_testzero(page_head))
|
||||
VM_BUG_ON_PAGE(1, page_head);
|
||||
/* __split_huge_page_refcount will wait now */
|
||||
VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
|
||||
atomic_dec(&page->_mapcount);
|
||||
VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
|
||||
VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
|
||||
compound_unlock_irqrestore(page_head, flags);
|
||||
|
||||
if (put_page_testzero(page_head)) {
|
||||
if (PageHead(page_head))
|
||||
__put_compound_page(page_head);
|
||||
else
|
||||
__put_single_page(page_head);
|
||||
}
|
||||
} else {
|
||||
/* @page_head is a dangling pointer */
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
goto out_put_single;
|
||||
}
|
||||
}
|
||||
|
||||
static void put_compound_page(struct page *page)
|
||||
{
|
||||
struct page *page_head;
|
||||
|
||||
/*
|
||||
* We see the PageCompound set and PageTail not set, so @page maybe:
|
||||
* 1. hugetlbfs head page, or
|
||||
* 2. THP head page.
|
||||
*/
|
||||
if (likely(!PageTail(page))) {
|
||||
if (put_page_testzero(page)) {
|
||||
/*
|
||||
* By the time all refcounts have been released
|
||||
* split_huge_page cannot run anymore from under us.
|
||||
*/
|
||||
if (PageHead(page))
|
||||
__put_compound_page(page);
|
||||
else
|
||||
__put_single_page(page);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We see the PageCompound set and PageTail set, so @page maybe:
|
||||
* 1. a tail hugetlbfs page, or
|
||||
* 2. a tail THP page, or
|
||||
* 3. a split THP page.
|
||||
*
|
||||
* Case 3 is possible, as we may race with
|
||||
* __split_huge_page_refcount tearing down a THP page.
|
||||
*/
|
||||
page_head = compound_head(page);
|
||||
if (!__compound_tail_refcounted(page_head))
|
||||
put_unrefcounted_compound_page(page_head, page);
|
||||
else
|
||||
put_refcounted_compound_page(page_head, page);
|
||||
}
|
||||
|
||||
void put_page(struct page *page)
|
||||
void __put_page(struct page *page)
|
||||
{
|
||||
if (unlikely(PageCompound(page)))
|
||||
put_compound_page(page);
|
||||
else if (put_page_testzero(page))
|
||||
__put_compound_page(page);
|
||||
else
|
||||
__put_single_page(page);
|
||||
}
|
||||
EXPORT_SYMBOL(put_page);
|
||||
|
||||
/*
|
||||
* This function is exported but must not be called by anything other
|
||||
* than get_page(). It implements the slow path of get_page().
|
||||
*/
|
||||
bool __get_page_tail(struct page *page)
|
||||
{
|
||||
/*
|
||||
* This takes care of get_page() if run on a tail page
|
||||
* returned by one of the get_user_pages/follow_page variants.
|
||||
* get_user_pages/follow_page itself doesn't need the compound
|
||||
* lock because it runs __get_page_tail_foll() under the
|
||||
* proper PT lock that already serializes against
|
||||
* split_huge_page().
|
||||
*/
|
||||
unsigned long flags;
|
||||
bool got;
|
||||
struct page *page_head = compound_head(page);
|
||||
|
||||
/* Ref to put_compound_page() comment. */
|
||||
if (!__compound_tail_refcounted(page_head)) {
|
||||
smp_rmb();
|
||||
if (likely(PageTail(page))) {
|
||||
/*
|
||||
* This is a hugetlbfs page or a slab
|
||||
* page. __split_huge_page_refcount
|
||||
* cannot race here.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
|
||||
__get_page_tail_foll(page, true);
|
||||
return true;
|
||||
} else {
|
||||
/*
|
||||
* __split_huge_page_refcount run
|
||||
* before us, "page" was a THP
|
||||
* tail. The split page_head has been
|
||||
* freed and reallocated as slab or
|
||||
* hugetlbfs page of smaller order
|
||||
* (only possible if reallocated as
|
||||
* slab on x86).
|
||||
*/
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
got = false;
|
||||
if (likely(page != page_head && get_page_unless_zero(page_head))) {
|
||||
/*
|
||||
* page_head wasn't a dangling pointer but it
|
||||
* may not be a head page anymore by the time
|
||||
* we obtain the lock. That is ok as long as it
|
||||
* can't be freed from under us.
|
||||
*/
|
||||
flags = compound_lock_irqsave(page_head);
|
||||
/* here __split_huge_page_refcount won't run anymore */
|
||||
if (likely(PageTail(page))) {
|
||||
__get_page_tail_foll(page, false);
|
||||
got = true;
|
||||
}
|
||||
compound_unlock_irqrestore(page_head, flags);
|
||||
if (unlikely(!got))
|
||||
put_page(page_head);
|
||||
}
|
||||
return got;
|
||||
}
|
||||
EXPORT_SYMBOL(__get_page_tail);
|
||||
EXPORT_SYMBOL(__put_page);
|
||||
|
||||
/**
|
||||
* put_pages_list() - release a list of pages
|
||||
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
|
||||
*/
|
||||
void mark_page_accessed(struct page *page)
|
||||
{
|
||||
page = compound_head(page);
|
||||
if (!PageActive(page) && !PageUnevictable(page) &&
|
||||
PageReferenced(page)) {
|
||||
|
||||
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
|
||||
update_page_reclaim_stat(lruvec, file, 0);
|
||||
}
|
||||
|
||||
|
||||
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
{
|
||||
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
||||
int file = page_is_file_cache(page);
|
||||
int lru = page_lru_base_type(page);
|
||||
|
||||
del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
|
||||
ClearPageActive(page);
|
||||
ClearPageReferenced(page);
|
||||
add_page_to_lru_list(page, lruvec, lru);
|
||||
|
||||
__count_vm_event(PGDEACTIVATE);
|
||||
update_page_reclaim_stat(lruvec, file, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Drain pages out of the cpu's pagevecs.
|
||||
* Either "cpu" is the current CPU, and preemption has already been
|
||||
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
|
||||
if (pagevec_count(pvec))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
|
||||
|
||||
pvec = &per_cpu(lru_deactivate_pvecs, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
|
||||
|
||||
activate_page_drain(cpu);
|
||||
}
|
||||
|
||||
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* deactivate_page - deactivate a page
|
||||
* @page: page to deactivate
|
||||
*
|
||||
* deactivate_page() moves @page to the inactive list if @page was on the active
|
||||
* list and was not an unevictable page. This is done to accelerate the reclaim
|
||||
* of @page.
|
||||
*/
|
||||
void deactivate_page(struct page *page)
|
||||
{
|
||||
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
||||
struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
|
||||
|
||||
page_cache_get(page);
|
||||
if (!pagevec_add(pvec, page))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
|
||||
put_cpu_var(lru_deactivate_pvecs);
|
||||
}
|
||||
}
|
||||
|
||||
void lru_add_drain(void)
|
||||
{
|
||||
lru_add_drain_cpu(get_cpu());
|
||||
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
|
||||
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
|
||||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
|
||||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
|
||||
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
|
||||
need_activate_page_drain(cpu)) {
|
||||
INIT_WORK(work, lru_add_drain_per_cpu);
|
||||
schedule_work_on(cpu, work);
|
||||
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *page = pages[i];
|
||||
|
||||
if (unlikely(PageCompound(page))) {
|
||||
if (zone) {
|
||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||
zone = NULL;
|
||||
}
|
||||
put_compound_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure the IRQ-safe lock-holding time does not get
|
||||
* excessive with a continuous string of pages from the
|
||||
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
|
||||
zone = NULL;
|
||||
}
|
||||
|
||||
page = compound_head(page);
|
||||
if (!put_page_testzero(page))
|
||||
continue;
|
||||
|
||||
if (PageCompound(page)) {
|
||||
if (zone) {
|
||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||
zone = NULL;
|
||||
}
|
||||
__put_compound_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (PageLRU(page)) {
|
||||
struct zone *pagezone = page_zone(page);
|
||||
|
||||
|
@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
	if (!entry.val)
		return 0;

	if (mem_cgroup_try_charge_swap(page, entry)) {
		swapcache_free(entry);
		return 0;
	}

	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page_to_list(page, list))) {
			swapcache_free(entry);
@@ -185,13 +190,12 @@ int add_to_swap(struct page *page, struct list_head *list)
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache and mark it dirty
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);

	if (!err) {	/* Success */
		SetPageDirty(page);
	if (!err) {
		return 1;
	} else {	/* -ENOMEM radix-tree allocation failure */
		/*
@@ -353,7 +357,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		__SetPageLocked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
@@ -367,7 +371,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		}
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
|
@@ -165,8 +165,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
|
||||
int found_extent = 0;
|
||||
|
||||
while (nr_pages) {
|
||||
struct list_head *lh;
|
||||
|
||||
if (se->start_page <= start_page &&
|
||||
start_page < se->start_page + se->nr_pages) {
|
||||
pgoff_t offset = start_page - se->start_page;
|
||||
@@ -188,8 +186,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
|
||||
break;
|
||||
}
|
||||
|
||||
lh = se->list.next;
|
||||
se = list_entry(lh, struct swap_extent, list);
|
||||
se = list_next_entry(se, list);
|
||||
}
|
||||
}
|
||||
|
||||
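Several hunks in this file replace the open-coded "lh = se->list.next; se = list_entry(lh, ...)" sequence with list_next_entry() and list_first_entry(). A standalone sketch of those helpers (not kernel code); the macros are re-derived from list_entry(), struct swap_extent is a reduced stand-in, and typeof is the gcc/clang extension the kernel relies on.

/*
 * Standalone sketch of list_first_entry()/list_next_entry().
 */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_first_entry(head, type, member) \
	list_entry((head)->next, type, member)
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)	/* gcc/clang typeof */

struct swap_extent {			/* reduced stand-in */
	struct list_head list;
	unsigned long start_page;
};

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct list_head extents = { &extents, &extents };
	struct swap_extent a = { .start_page = 0 }, b = { .start_page = 64 };
	struct swap_extent *se;

	list_add_tail(&a.list, &extents);
	list_add_tail(&b.list, &extents);

	se = list_first_entry(&extents, struct swap_extent, list);
	printf("first extent starts at %lu\n", se->start_page);

	/* old style: struct list_head *lh = se->list.next; se = list_entry(lh, ...); */
	se = list_next_entry(se, list);
	printf("next extent starts at %lu\n", se->start_page);
	return 0;
}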
@@ -788,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
|
||||
count--;
|
||||
}
|
||||
|
||||
if (!count)
|
||||
mem_cgroup_uncharge_swap(entry);
|
||||
|
||||
usage = count | has_cache;
|
||||
p->swap_map[offset] = usage;
|
||||
|
||||
/* free if no reference */
|
||||
if (!usage) {
|
||||
mem_cgroup_uncharge_swap(entry);
|
||||
dec_cluster_info_page(p, p->cluster_info, offset);
|
||||
if (offset < p->lowest_bit)
|
||||
p->lowest_bit = offset;
|
||||
@@ -903,7 +898,7 @@ int swp_swapcount(swp_entry_t entry)
|
||||
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
|
||||
|
||||
do {
|
||||
page = list_entry(page->lru.next, struct page, lru);
|
||||
page = list_next_entry(page, lru);
|
||||
map = kmap_atomic(page);
|
||||
tmp_count = map[offset];
|
||||
kunmap_atomic(map);
|
||||
@@ -929,6 +924,9 @@ int reuse_swap_page(struct page *page)
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
if (unlikely(PageKsm(page)))
|
||||
return 0;
|
||||
/* The page is part of THP and cannot be reused */
|
||||
if (PageTransCompound(page))
|
||||
return 0;
|
||||
count = page_mapcount(page);
|
||||
if (count <= 1 && PageSwapCache(page)) {
|
||||
count += page_swapcount(page);
|
||||
@@ -1008,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry)
|
||||
* Also recheck PageSwapCache now page is locked (above).
|
||||
*/
|
||||
if (PageSwapCache(page) && !PageWriteback(page) &&
|
||||
(!page_mapped(page) || vm_swap_full())) {
|
||||
(!page_mapped(page) || mem_cgroup_swap_full(page))) {
|
||||
delete_from_swap_cache(page);
|
||||
SetPageDirty(page);
|
||||
}
|
||||
@@ -1111,19 +1109,9 @@ unsigned int count_swap_pages(int type, int free)
|
||||
}
|
||||
#endif /* CONFIG_HIBERNATION */
|
||||
|
||||
static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
|
||||
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
|
||||
{
|
||||
#ifdef CONFIG_MEM_SOFT_DIRTY
|
||||
/*
|
||||
* When pte keeps soft dirty bit the pte generated
|
||||
* from swap entry does not has it, still it's same
|
||||
* pte from logical point of view.
|
||||
*/
|
||||
pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
|
||||
return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
|
||||
#else
|
||||
return pte_same(pte, swp_pte);
|
||||
#endif
|
||||
return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
|
||||
}
|
||||
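pte_same_as_swp() above normalises the live pte by clearing the swap soft-dirty bit, instead of comparing it against both a clean and a soft-dirtied copy of the expected swap pte. A standalone sketch of why the two checks agree (not kernel code); ptes are modelled as plain bitmasks, the bit position is an arbitrary stand-in, and the expected swap pte is assumed to have soft-dirty clear, as one built by swp_entry_to_pte() does.

/*
 * Standalone sketch: old maybe_same_pte() vs new pte_same_as_swp().
 */
#include <assert.h>
#include <stdio.h>

typedef unsigned long pte_t;

#define PTE_SWP_SOFT_DIRTY (1UL << 7)	/* stand-in for the arch-specific bit */

static pte_t pte_swp_mksoft_dirty(pte_t pte)     { return pte | PTE_SWP_SOFT_DIRTY; }
static pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte & ~PTE_SWP_SOFT_DIRTY; }
static int pte_same(pte_t a, pte_t b)            { return a == b; }

/* old form: compare against both variants of the expected swap pte */
static int maybe_same_pte(pte_t pte, pte_t swp_pte)
{
	return pte_same(pte, swp_pte) ||
	       pte_same(pte, pte_swp_mksoft_dirty(swp_pte));
}

/* new form: strip the soft-dirty bit from the live pte instead */
static int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
}

int main(void)
{
	pte_t swp = 0x1234;	/* swap entry encoded as a pte, soft-dirty clear */
	pte_t candidates[] = { swp, pte_swp_mksoft_dirty(swp), 0x9999 };

	for (int i = 0; i < 3; i++)
		assert(maybe_same_pte(candidates[i], swp) ==
		       pte_same_as_swp(candidates[i], swp));
	puts("old and new checks agree");
	return 0;
}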
|
||||
/*
|
||||
@@ -1145,14 +1133,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
if (unlikely(!page))
|
||||
return -ENOMEM;
|
||||
|
||||
if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
|
||||
if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
|
||||
&memcg, false)) {
|
||||
ret = -ENOMEM;
|
||||
goto out_nolock;
|
||||
}
|
||||
|
||||
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
||||
if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
|
||||
mem_cgroup_cancel_charge(page, memcg);
|
||||
if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
|
||||
mem_cgroup_cancel_charge(page, memcg, false);
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
@@ -1163,11 +1152,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
set_pte_at(vma->vm_mm, addr, pte,
|
||||
pte_mkold(mk_pte(page, vma->vm_page_prot)));
|
||||
if (page == swapcache) {
|
||||
page_add_anon_rmap(page, vma, addr);
|
||||
mem_cgroup_commit_charge(page, memcg, true);
|
||||
page_add_anon_rmap(page, vma, addr, false);
|
||||
mem_cgroup_commit_charge(page, memcg, true, false);
|
||||
} else { /* ksm created a completely new copy */
|
||||
page_add_new_anon_rmap(page, vma, addr);
|
||||
mem_cgroup_commit_charge(page, memcg, false);
|
||||
page_add_new_anon_rmap(page, vma, addr, false);
|
||||
mem_cgroup_commit_charge(page, memcg, false, false);
|
||||
lru_cache_add_active_or_unevictable(page, vma);
|
||||
}
|
||||
swap_free(entry);
|
||||
@@ -1209,7 +1198,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
* swapoff spends a _lot_ of time in this loop!
|
||||
* Test inline before going to call unuse_pte.
|
||||
*/
|
||||
if (unlikely(maybe_same_pte(*pte, swp_pte))) {
|
||||
if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
|
||||
pte_unmap(pte);
|
||||
ret = unuse_pte(vma, pmd, addr, entry, page);
|
||||
if (ret)
|
||||
@@ -1633,14 +1622,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
|
||||
se = start_se;
|
||||
|
||||
for ( ; ; ) {
|
||||
struct list_head *lh;
|
||||
|
||||
if (se->start_page <= offset &&
|
||||
offset < (se->start_page + se->nr_pages)) {
|
||||
return se->start_block + (offset - se->start_page);
|
||||
}
|
||||
lh = se->list.next;
|
||||
se = list_entry(lh, struct swap_extent, list);
|
||||
se = list_next_entry(se, list);
|
||||
sis->curr_swap_extent = se;
|
||||
BUG_ON(se == start_se); /* It *must* be present */
|
||||
}
|
||||
@@ -1664,7 +1650,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
|
||||
while (!list_empty(&sis->first_swap_extent.list)) {
|
||||
struct swap_extent *se;
|
||||
|
||||
se = list_entry(sis->first_swap_extent.list.next,
|
||||
se = list_first_entry(&sis->first_swap_extent.list,
|
||||
struct swap_extent, list);
|
||||
list_del(&se->list);
|
||||
kfree(se);
|
||||
@@ -1970,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
||||
set_blocksize(bdev, old_block_size);
|
||||
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
||||
} else {
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
inode->i_flags &= ~S_SWAPFILE;
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
}
|
||||
filp_close(swap_file, NULL);
|
||||
|
||||
@@ -2197,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
|
||||
p->flags |= SWP_BLKDEV;
|
||||
} else if (S_ISREG(inode->i_mode)) {
|
||||
p->bdev = inode->i_sb->s_bdev;
|
||||
mutex_lock(&inode->i_mutex);
|
||||
inode_lock(inode);
|
||||
if (IS_SWAPFILE(inode))
|
||||
return -EBUSY;
|
||||
} else
|
||||
@@ -2430,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
|
||||
mapping = swap_file->f_mapping;
|
||||
inode = mapping->host;
|
||||
|
||||
/* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
|
||||
/* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
|
||||
error = claim_swapfile(p, inode);
|
||||
if (unlikely(error))
|
||||
goto bad_swap;
|
||||
@@ -2575,7 +2561,7 @@ bad_swap:
|
||||
vfree(cluster_info);
|
||||
if (swap_file) {
|
||||
if (inode && S_ISREG(inode->i_mode)) {
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
inode = NULL;
|
||||
}
|
||||
filp_close(swap_file, NULL);
|
||||
@@ -2588,7 +2574,7 @@ out:
|
||||
if (name)
|
||||
putname(name);
|
||||
if (inode && S_ISREG(inode->i_mode))
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
inode_unlock(inode);
|
||||
return error;
|
||||
}
|
||||
|
||||
@@ -2959,11 +2945,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
|
||||
struct page *head;
|
||||
head = vmalloc_to_page(si->swap_map + offset);
|
||||
if (page_private(head)) {
|
||||
struct list_head *this, *next;
|
||||
list_for_each_safe(this, next, &head->lru) {
|
||||
struct page *page;
|
||||
page = list_entry(this, struct page, lru);
|
||||
list_del(this);
|
||||
struct page *page, *next;
|
||||
|
||||
list_for_each_entry_safe(page, next, &head->lru, lru) {
|
||||
list_del(&page->lru);
|
||||
__free_page(page);
|
||||
}
|
||||
}
|
||||
|
@@ -9,6 +9,7 @@
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
|
||||
return;
|
||||
|
||||
spin_lock_irq(&mapping->tree_lock);
|
||||
/*
|
||||
* Regular page slots are stabilized by the page lock even
|
||||
* without the tree itself locked. These unlocked entries
|
||||
* need verification under the tree lock.
|
||||
*/
|
||||
if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
|
||||
goto unlock;
|
||||
if (*slot != entry)
|
||||
goto unlock;
|
||||
radix_tree_replace_slot(slot, NULL);
|
||||
mapping->nrshadows--;
|
||||
if (!node)
|
||||
goto unlock;
|
||||
workingset_node_shadows_dec(node);
|
||||
/*
|
||||
* Don't track node without shadow entries.
|
||||
*
|
||||
* Avoid acquiring the list_lru lock if already untracked.
|
||||
* The list_empty() test is safe as node->private_list is
|
||||
* protected by mapping->tree_lock.
|
||||
*/
|
||||
if (!workingset_node_shadows(node) &&
|
||||
!list_empty(&node->private_list))
|
||||
list_lru_del(&workingset_shadow_nodes, &node->private_list);
|
||||
__radix_tree_delete_node(&mapping->page_tree, node);
|
||||
|
||||
if (dax_mapping(mapping)) {
|
||||
if (radix_tree_delete_item(&mapping->page_tree, index, entry))
|
||||
mapping->nrexceptional--;
|
||||
} else {
|
||||
/*
|
||||
* Regular page slots are stabilized by the page lock even
|
||||
* without the tree itself locked. These unlocked entries
|
||||
* need verification under the tree lock.
|
||||
*/
|
||||
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
|
||||
&slot))
|
||||
goto unlock;
|
||||
if (*slot != entry)
|
||||
goto unlock;
|
||||
radix_tree_replace_slot(slot, NULL);
|
||||
mapping->nrexceptional--;
|
||||
if (!node)
|
||||
goto unlock;
|
||||
workingset_node_shadows_dec(node);
|
||||
/*
|
||||
* Don't track node without shadow entries.
|
||||
*
|
||||
* Avoid acquiring the list_lru lock if already untracked.
|
||||
* The list_empty() test is safe as node->private_list is
|
||||
* protected by mapping->tree_lock.
|
||||
*/
|
||||
if (!workingset_node_shadows(node) &&
|
||||
!list_empty(&node->private_list))
|
||||
list_lru_del(&workingset_shadow_nodes,
|
||||
&node->private_list);
|
||||
__radix_tree_delete_node(&mapping->page_tree, node);
|
||||
}
|
||||
unlock:
|
||||
spin_unlock_irq(&mapping->tree_lock);
|
||||
}
|
||||
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
|
||||
int i;
|
||||
|
||||
cleancache_invalidate_inode(mapping);
|
||||
if (mapping->nrpages == 0 && mapping->nrshadows == 0)
|
||||
if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
|
||||
return;
|
||||
|
||||
/* Offsets within partial pages */
|
||||
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
|
||||
*/
|
||||
void truncate_inode_pages_final(struct address_space *mapping)
|
||||
{
|
||||
unsigned long nrshadows;
|
||||
unsigned long nrexceptional;
|
||||
unsigned long nrpages;
|
||||
|
||||
/*
|
||||
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
|
||||
|
||||
/*
|
||||
* When reclaim installs eviction entries, it increases
|
||||
* nrshadows first, then decreases nrpages. Make sure we see
|
||||
* nrexceptional first, then decreases nrpages. Make sure we see
|
||||
* this in the right order or we might miss an entry.
|
||||
*/
|
||||
nrpages = mapping->nrpages;
|
||||
smp_rmb();
|
||||
nrshadows = mapping->nrshadows;
|
||||
nrexceptional = mapping->nrexceptional;
|
||||
|
||||
if (nrpages || nrshadows) {
|
||||
if (nrpages || nrexceptional) {
|
||||
/*
|
||||
* As truncation uses a lockless tree lookup, cycle
|
||||
* the tree lock to make sure any ongoing tree
|
||||
|
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr);
	mem_cgroup_commit_charge(page, memcg, false);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg);
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	page_cache_release(page);
	goto out;
mm/util.c | 71
@@ -176,6 +176,37 @@ char *strndup_user(const char __user *s, long n)
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Returns an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	/*
	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
	 * cause pagefault, which makes it pointless to use GFP_NOFS
	 * or GFP_ATOMIC.
	 */
	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev, struct rb_node *rb_parent)
{
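A hedged usage sketch of the boilerplate memdup_user_nul() is meant to replace in ->write() handlers that parse a short user string. The handler name and the final "apply val" step are hypothetical; the helpers themselves (memdup_user_nul, IS_ERR/PTR_ERR, strstrip, kstrtoul, kfree) are existing kernel APIs, and the fragment assumes kernel headers rather than a standalone userspace build.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	unsigned long val;
	char *kbuf;
	int ret;

	kbuf = memdup_user_nul(ubuf, count);	/* bounded copy plus '\0' */
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	ret = kstrtoul(strstrip(kbuf), 0, &val);
	kfree(kbuf);
	if (ret)
		return ret;

	/* ... apply val to whatever this file controls ... */
	return count;
}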
@@ -355,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page)
|
||||
|
||||
struct address_space *page_mapping(struct page *page)
|
||||
{
|
||||
unsigned long mapping;
|
||||
struct address_space *mapping;
|
||||
|
||||
page = compound_head(page);
|
||||
|
||||
/* This happens if someone calls flush_dcache_page on slab page */
|
||||
if (unlikely(PageSlab(page)))
|
||||
@@ -368,12 +401,26 @@ struct address_space *page_mapping(struct page *page)
|
||||
return swap_address_space(entry);
|
||||
}
|
||||
|
||||
mapping = (unsigned long)page->mapping;
|
||||
if (mapping & PAGE_MAPPING_FLAGS)
|
||||
mapping = page->mapping;
|
||||
if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
|
||||
return NULL;
|
||||
return page->mapping;
|
||||
return mapping;
|
||||
}
|
||||
|
||||
/* Slow path of page_mapcount() for compound pages */
|
||||
int __page_mapcount(struct page *page)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = atomic_read(&page->_mapcount) + 1;
|
||||
page = compound_head(page);
|
||||
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
|
||||
if (PageDoubleMap(page))
|
||||
ret--;
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__page_mapcount);
|
||||
|
||||
int overcommit_ratio_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp,
|
||||
loff_t *ppos)
|
||||
@@ -429,17 +476,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
||||
int res = 0;
|
||||
unsigned int len;
|
||||
struct mm_struct *mm = get_task_mm(task);
|
||||
unsigned long arg_start, arg_end, env_start, env_end;
|
||||
if (!mm)
|
||||
goto out;
|
||||
if (!mm->arg_end)
|
||||
goto out_mm; /* Shh! No looking before we're done */
|
||||
|
||||
len = mm->arg_end - mm->arg_start;
|
||||
down_read(&mm->mmap_sem);
|
||||
arg_start = mm->arg_start;
|
||||
arg_end = mm->arg_end;
|
||||
env_start = mm->env_start;
|
||||
env_end = mm->env_end;
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
len = arg_end - arg_start;
|
||||
|
||||
if (len > buflen)
|
||||
len = buflen;
|
||||
|
||||
res = access_process_vm(task, mm->arg_start, buffer, len, 0);
|
||||
res = access_process_vm(task, arg_start, buffer, len, 0);
|
||||
|
||||
/*
|
||||
* If the nul at the end of args has been overwritten, then
|
||||
@@ -450,10 +505,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
||||
if (len < res) {
|
||||
res = len;
|
||||
} else {
|
||||
len = mm->env_end - mm->env_start;
|
||||
len = env_end - env_start;
|
||||
if (len > buflen - res)
|
||||
len = buflen - res;
|
||||
res += access_process_vm(task, mm->env_start,
|
||||
res += access_process_vm(task, env_start,
|
||||
buffer+res, len, 0);
|
||||
res = strnlen(buffer, res);
|
||||
}
|
||||
|
27
mm/vmalloc.c
27
mm/vmalloc.c
@@ -441,8 +441,7 @@ nocache:
|
||||
if (list_is_last(&first->list, &vmap_area_list))
|
||||
goto found;
|
||||
|
||||
first = list_entry(first->list.next,
|
||||
struct vmap_area, list);
|
||||
first = list_next_entry(first, list);
|
||||
}
|
||||
|
||||
found:
|
||||
@@ -456,7 +455,7 @@ found:
|
||||
free_vmap_cache = &va->rb_node;
|
||||
spin_unlock(&vmap_area_lock);
|
||||
|
||||
BUG_ON(va->va_start & (align-1));
|
||||
BUG_ON(!IS_ALIGNED(va->va_start, align));
|
||||
BUG_ON(va->va_start < vstart);
|
||||
BUG_ON(va->va_end > vend);
|
||||
|
||||
@@ -1087,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
|
||||
BUG_ON(!addr);
|
||||
BUG_ON(addr < VMALLOC_START);
|
||||
BUG_ON(addr > VMALLOC_END);
|
||||
BUG_ON(addr & (PAGE_SIZE-1));
|
||||
BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
|
||||
|
||||
debug_check_no_locks_freed(mem, size);
|
||||
vmap_debug_free_range(addr, addr+size);
|
||||
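The two hunks above swap open-coded "& (align - 1)" and "& (PAGE_SIZE - 1)" masks for IS_ALIGNED(). A standalone sketch (not kernel code) with the macro re-derived, showing the two forms agree for power-of-two alignments.

/*
 * Standalone sketch: IS_ALIGNED() vs the open-coded mask test.
 */
#include <assert.h>
#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

int main(void)
{
	unsigned long addrs[] = { 0x1000, 0x1fff, 0x2000, 0x2004 };
	unsigned long align = 0x1000;	/* PAGE_SIZE stand-in */

	for (int i = 0; i < 4; i++) {
		int old_check = !(addrs[i] & (align - 1));	/* !(addr & (PAGE_SIZE-1)) */
		int new_check = IS_ALIGNED(addrs[i], align);

		assert(old_check == new_check);
		printf("%#lx aligned: %d\n", addrs[i], new_check);
	}
	return 0;
}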
@@ -1477,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
|
||||
struct page *page = area->pages[i];
|
||||
|
||||
BUG_ON(!page);
|
||||
__free_page(page);
|
||||
__free_kmem_pages(page, 0);
|
||||
}
|
||||
|
||||
if (area->flags & VM_VPAGES)
|
||||
vfree(area->pages);
|
||||
else
|
||||
kfree(area->pages);
|
||||
kvfree(area->pages);
|
||||
}
|
||||
|
||||
kfree(area);
|
||||
@@ -1593,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
|
||||
if (array_size > PAGE_SIZE) {
|
||||
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
|
||||
PAGE_KERNEL, node, area->caller);
|
||||
area->flags |= VM_VPAGES;
|
||||
} else {
|
||||
pages = kmalloc_node(array_size, nested_gfp, node);
|
||||
}
|
||||
@@ -1608,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
|
||||
struct page *page;
|
||||
|
||||
if (node == NUMA_NO_NODE)
|
||||
page = alloc_page(alloc_mask);
|
||||
page = alloc_kmem_pages(alloc_mask, order);
|
||||
else
|
||||
page = alloc_pages_node(node, alloc_mask, order);
|
||||
page = alloc_kmem_pages_node(node, alloc_mask, order);
|
||||
|
||||
if (unlikely(!page)) {
|
||||
/* Successfully allocated i pages, free them in __vunmap() */
|
||||
@@ -2559,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
|
||||
struct vmap_area *va;
|
||||
|
||||
spin_lock(&vmap_area_lock);
|
||||
va = list_entry((&vmap_area_list)->next, typeof(*va), list);
|
||||
va = list_first_entry(&vmap_area_list, typeof(*va), list);
|
||||
while (n > 0 && &va->list != &vmap_area_list) {
|
||||
n--;
|
||||
va = list_entry(va->list.next, typeof(*va), list);
|
||||
va = list_next_entry(va, list);
|
||||
}
|
||||
if (!n && &va->list != &vmap_area_list)
|
||||
return va;
|
||||
@@ -2576,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
struct vmap_area *va = p, *next;
|
||||
|
||||
++*pos;
|
||||
next = list_entry(va->list.next, typeof(*va), list);
|
||||
next = list_next_entry(va, list);
|
||||
if (&next->list != &vmap_area_list)
|
||||
return next;
|
||||
|
||||
@@ -2651,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p)
|
||||
if (v->flags & VM_USERMAP)
|
||||
seq_puts(m, " user");
|
||||
|
||||
if (v->flags & VM_VPAGES)
|
||||
if (is_vmalloc_addr(v->pages))
|
||||
seq_puts(m, " vpages");
|
||||
|
||||
show_numa_info(m, v);
|
||||
|
@@ -137,14 +137,11 @@ struct vmpressure_event {
|
||||
};
|
||||
|
||||
static bool vmpressure_event(struct vmpressure *vmpr,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
enum vmpressure_levels level)
|
||||
{
|
||||
struct vmpressure_event *ev;
|
||||
enum vmpressure_levels level;
|
||||
bool signalled = false;
|
||||
|
||||
level = vmpressure_calc_level(scanned, reclaimed);
|
||||
|
||||
mutex_lock(&vmpr->events_lock);
|
||||
|
||||
list_for_each_entry(ev, &vmpr->events, node) {
|
||||
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
enum vmpressure_levels level;

spin_lock(&vmpr->sr_lock);
/*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
scanned = vmpr->scanned;
scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
return;
}

reclaimed = vmpr->reclaimed;
vmpr->scanned = 0;
vmpr->reclaimed = 0;
reclaimed = vmpr->tree_reclaimed;
vmpr->tree_scanned = 0;
vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);

level = vmpressure_calc_level(scanned, reclaimed);

do {
if (vmpressure_event(vmpr, scanned, reclaimed))
if (vmpressure_event(vmpr, level))
break;
/*
* If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
* @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
* If @tree is set, vmpressure is in traditional userspace reporting
* mode: @memcg is considered the pressure root and userspace is
* notified of the entire subtree's reclaim efficiency.
*
* If @tree is not set, reclaim efficiency is recorded for @memcg, and
* only in-kernel users are notified.
*
* This function does not return any value.
*/
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned)
return;

spin_lock(&vmpr->sr_lock);
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
scanned = vmpr->scanned;
spin_unlock(&vmpr->sr_lock);
if (tree) {
spin_lock(&vmpr->sr_lock);
vmpr->tree_scanned += scanned;
vmpr->tree_reclaimed += reclaimed;
scanned = vmpr->scanned;
spin_unlock(&vmpr->sr_lock);

if (scanned < vmpressure_win)
return;
schedule_work(&vmpr->work);
if (scanned < vmpressure_win)
return;
schedule_work(&vmpr->work);
} else {
enum vmpressure_levels level;

/* For now, no users for root-level efficiency */
if (!memcg || memcg == root_mem_cgroup)
return;

spin_lock(&vmpr->sr_lock);
scanned = vmpr->scanned += scanned;
reclaimed = vmpr->reclaimed += reclaimed;
if (scanned < vmpressure_win) {
spin_unlock(&vmpr->sr_lock);
return;
}
vmpr->scanned = vmpr->reclaimed = 0;
spin_unlock(&vmpr->sr_lock);

level = vmpressure_calc_level(scanned, reclaimed);

if (level > VMPRESSURE_LOW) {
/*
* Let the socket buffer allocator know that
* we are having trouble reclaiming LRU pages.
*
* For hysteresis keep the pressure state
* asserted for a second in which subsequent
* pressure events can occur.
*/
memcg->socket_pressure = jiffies + HZ;
}
}
}

/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
vmpressure(gfp, memcg, vmpressure_win, 0);
vmpressure(gfp, memcg, true, vmpressure_win, 0);
}

/**
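The hunk above arms a one-second hysteresis window by stamping memcg->socket_pressure with jiffies + HZ. A minimal sketch of how a consumer could honour that window follows; the helper name is made up for illustration, only jiffies/time_before() are taken from the kernel proper, and the rest is not part of this diff:

/*
 * Sketch only: treat @memcg as under socket pressure while the
 * one-second window armed by vmpressure() above is still open.
 */
static inline bool memcg_socket_pressure_open(struct mem_cgroup *memcg)
{
	return time_before(jiffies, memcg->socket_pressure);
}

The two callers of the new vmpressure() signature, with @tree set and cleared, appear in the mm/vmscan.c hunks further down.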
93
mm/vmscan.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -106,8 +107,6 @@ struct scan_control {
unsigned long nr_reclaimed;
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
@@ -197,11 +196,13 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
unsigned long nr;

nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
zone_page_state(zone, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_ISOLATED_FILE);

if (get_nr_swap_pages() > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
zone_page_state(zone, NR_INACTIVE_ANON);
zone_page_state(zone, NR_INACTIVE_ANON) +
zone_page_state(zone, NR_ISOLATED_ANON);

return nr;
}
@@ -411,7 +412,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;

if (memcg && !memcg_kmem_is_active(memcg))
if (memcg && !memcg_kmem_online(memcg))
return 0;

if (nr_scanned == 0)
@@ -594,7 +595,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
trace_mm_vmscan_writepage(page);
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
*
* We also don't store shadows for DAX mappings because the
* only page cache pages found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
* same page_tree.
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping))
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
__delete_from_page_cache(page, shadow, memcg);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
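The new !dax_mapping(mapping) test above is what keeps shadow entries out of DAX page trees. For orientation, a sketch of what that predicate amounts to in this kernel generation; the helper name here is illustrative and its exact definition in include/linux/dax.h is an assumption, not part of this diff:

/*
 * Sketch, assuming dax_mapping() simply asks whether the host inode
 * of the address_space carries the S_DAX flag.
 */
static inline bool dax_mapping_sketch(struct address_space *mapping)
{
	return mapping->host && IS_DAX(mapping->host);
}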
@@ -906,6 +913,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
bool lazyfree = false;
int ret = SWAP_SUCCESS;

cond_resched();

@@ -1049,6 +1058,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
lazyfree = true;
may_enter_fs = 1;

/* Adding to swap updated mapping */
@@ -1060,14 +1070,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page,
ttu_flags|TTU_BATCH_FLUSH)) {
switch (ret = try_to_unmap(page, lazyfree ?
(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
(ttu_flags | TTU_BATCH_FLUSH))) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_MLOCK:
goto cull_mlocked;
case SWAP_LZFREE:
goto lazyfree;
case SWAP_SUCCESS:
; /* try to free the page below */
}
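The lazyfree handling is spread over several hunks; pulling the pieces of this diff into one comment-style summary for orientation (a paraphrase of the hunks above and below, not new kernel code):

/*
 * Condensed lazyfree path in shrink_page_list():
 *   1. anon page is added to swap, lazyfree = true
 *   2. try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE)
 *   3. a clean page comes back as SWAP_LZFREE -> goto lazyfree
 *   4. lazyfree: __remove_mapping() drops it without writeback
 *   5. free_it: count_vm_event(PGLAZYFREED), exported as
 *      "pglazyfreed" in the vmstat_text[] hunk in mm/vmstat.c below
 */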
@@ -1174,6 +1187,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}

lazyfree:
if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;

@@ -1184,8 +1198,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
__clear_page_locked(page);
__ClearPageLocked(page);
free_it:
if (ret == SWAP_LZFREE)
count_vm_event(PGLAZYFREED);

nr_reclaimed++;

/*
@@ -1204,7 +1221,7 @@ cull_mlocked:

activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && vm_swap_full())
if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
@@ -1426,6 +1443,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;

VM_BUG_ON_PAGE(!page_count(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);

if (PageLRU(page)) {
struct zone *zone = page_zone(page);
@@ -1691,11 +1709,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
sc->priority,
trace_shrink_flags(file));
trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
sc->priority, file);
return nr_reclaimed;
}

@@ -1958,10 +1973,11 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, int swappiness,
static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *nr,
unsigned long *lru_pages)
{
int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
@@ -1988,14 +2004,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
if (current_is_kswapd()) {
if (!zone_reclaimable(zone))
force_scan = true;
if (!mem_cgroup_lruvec_online(lruvec))
if (!mem_cgroup_online(memcg))
force_scan = true;
}
if (!global_reclaim(sc))
force_scan = true;

/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2046,10 +2062,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
}

/*
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
* If there is enough inactive page cache, i.e. if the size of the
* inactive list is greater than that of the active list *and* the
* inactive list actually has some pages to scan on this priority, we
* do not reclaim anything from the anonymous working set right now.
* Without the second condition we could end up never scanning an
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec)) {
if (!inactive_file_is_low(lruvec) &&
get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
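To make the second condition concrete, a short worked example of the shift test added above (the numbers are illustrative only; DEF_PRIORITY is 12 in this era):

/*
 * At priority 12 an inactive file list of 2048 pages gives
 * 2048 >> 12 == 0: the test fails and the anon LRUs stay eligible.
 * At priority 3 the same list gives 2048 >> 3 == 256: the test
 * passes and reclaim stays file-only.
 */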
@@ -2179,9 +2201,10 @@ static inline void init_tlb_ubc(void)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct scan_control *sc, unsigned long *lru_pages)
static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *lru_pages)
{
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2191,7 +2214,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;

get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
get_scan_count(lruvec, memcg, sc, nr, lru_pages);

/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2393,9 +2416,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
struct lruvec *lruvec;
int swappiness;

if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
@@ -2403,11 +2425,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}

lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;

shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
shrink_zone_memcg(zone, memcg, sc, &lru_pages);
zone_lru_pages += lru_pages;

if (memcg && is_classzone)
@@ -2415,6 +2436,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg, sc->nr_scanned - scanned,
lru_pages);

/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);

/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2446,7 +2472,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
reclaim_state->reclaimed_slab = 0;
}

vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);

@@ -2871,8 +2898,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
int swappiness = mem_cgroup_swappiness(memcg);
unsigned long lru_pages;

sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2889,7 +2914,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
shrink_zone_memcg(zone, memcg, &sc, &lru_pages);

trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
86
mm/vmstat.c
@@ -460,7 +460,7 @@ static int fold_diff(int *diff)
*
* The function returns the number of global counters updated.
*/
static int refresh_cpu_vm_stats(void)
static int refresh_cpu_vm_stats(bool do_pagesets)
{
struct zone *zone;
int i;
@@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void)
#endif
}
}
cond_resched();
#ifdef CONFIG_NUMA
/*
* Deal with draining the remote pageset of this
* processor
*
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
if (!__this_cpu_read(p->expire) ||
if (do_pagesets) {
cond_resched();
/*
* Deal with draining the remote pageset of this
* processor
*
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
if (!__this_cpu_read(p->expire) ||
!__this_cpu_read(p->pcp.count))
continue;
continue;

/*
* We never drain zones local to this processor.
*/
if (zone_to_nid(zone) == numa_node_id()) {
__this_cpu_write(p->expire, 0);
continue;
}
/*
* We never drain zones local to this processor.
*/
if (zone_to_nid(zone) == numa_node_id()) {
__this_cpu_write(p->expire, 0);
continue;
}

if (__this_cpu_dec_return(p->expire))
continue;
if (__this_cpu_dec_return(p->expire))
continue;

if (__this_cpu_read(p->pcp.count)) {
drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
changes++;
if (__this_cpu_read(p->pcp.count)) {
drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
changes++;
}
}
#endif
}
@@ -781,6 +783,7 @@ const char * const vmstat_text[] = {

"pgfault",
"pgmajfault",
"pglazyfreed",

TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -842,7 +845,9 @@ const char * const vmstat_text[] = {
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_split",
"thp_split_page",
"thp_split_page_failed",
"thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
@@ -1386,7 +1391,7 @@ static cpumask_var_t cpu_stat_off;

static void vmstat_update(struct work_struct *w)
{
if (refresh_cpu_vm_stats()) {
if (refresh_cpu_vm_stats(true)) {
/*
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
@@ -1403,20 +1408,27 @@ static void vmstat_update(struct work_struct *w)
* Defer the checking for differentials to the
* shepherd thread on a different processor.
*/
int r;
/*
* Shepherd work thread does not race since it never
* changes the bit if its zero but the cpu
* online / off line code may race if
* worker threads are still allowed during
* shutdown / startup.
*/
r = cpumask_test_and_set_cpu(smp_processor_id(),
cpu_stat_off);
VM_BUG_ON(r);
cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
}
}

/*
* Switch off vmstat processing and then fold all the remaining differentials
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
void quiet_vmstat(void)
{
if (system_state != SYSTEM_RUNNING)
return;

do {
if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
cancel_delayed_work(this_cpu_ptr(&vmstat_work));

} while (refresh_cpu_vm_stats(false));
}

/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
@@ -1449,7 +1461,7 @@ static bool need_update(int cpu)
*/
static void vmstat_shepherd(struct work_struct *w);

static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);

static void vmstat_shepherd(struct work_struct *w)
{
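Two small but easy-to-miss changes sit in the vmstat hunks above: the shepherd is now declared with DECLARE_DEFERRABLE_WORK, so an idle CPU is not woken just to check for counter drift, and refresh_cpu_vm_stats() gained a do_pagesets switch so quiet_vmstat() can fold per-cpu differentials without also draining remote pagesets. Condensed from this diff, the two call sites pair up as:

	refresh_cpu_vm_stats(true);	/* vmstat_update(): periodic work, also drains remote pagesets */
	refresh_cpu_vm_stats(false);	/* quiet_vmstat(): NOHZ quiesce, folds diffs only */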
mm/workingset.c
@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
node->slots[i] = NULL;
BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
BUG_ON(!mapping->nrshadows);
mapping->nrshadows--;
BUG_ON(!mapping->nrexceptional);
mapping->nrexceptional--;
}
}
BUG_ON(node->count);
@@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
|
||||
spin_unlock(&pool->lock);
|
||||
}
|
||||
|
||||
#define list_tail_entry(ptr, type, member) \
|
||||
list_entry((ptr)->prev, type, member)
|
||||
|
||||
/**
|
||||
* zbud_reclaim_page() - evicts allocations from a pool page and frees it
|
||||
* @pool: pool from which a page will attempt to be evicted
|
||||
@@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
|
||||
return -EINVAL;
|
||||
}
|
||||
for (i = 0; i < retries; i++) {
|
||||
zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
|
||||
zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
|
||||
list_del(&zhdr->lru);
|
||||
list_del(&zhdr->buddy);
|
||||
/* Protect zbud page against free */
|
||||
|
mm/zsmalloc.c
@@ -213,10 +213,10 @@ struct size_class {
int size;
unsigned int index;

/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
struct zs_size_stat stats;

/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
bool huge;
};
@@ -309,7 +309,12 @@ static void free_handle(struct zs_pool *pool, unsigned long handle)

static void record_obj(unsigned long handle, unsigned long obj)
{
*(unsigned long *)handle = obj;
/*
* lsb of @obj represents handle lock while other bits
* represent object value the handle is pointing so
* updating shouldn't do store tearing.
*/
WRITE_ONCE(*(unsigned long *)handle, obj);
}

/* zpool driver */
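Both the WRITE_ONCE() in record_obj() above and the pin-bit handling in the migrate_zspage() hunk below lean on the same convention: bit 0 of the word behind a handle is HANDLE_PIN_BIT, taken as a lock via pin_tag(), and the remaining bits carry the obj value. A sketch of that assumption follows; the bit_spin_lock-based helper reflects how the surrounding zsmalloc code is generally structured, but it is not shown in this diff:

/*
 * Assumed handle layout:
 *   bit 0          HANDLE_PIN_BIT - lock bit, taken via pin_tag()
 *   bits 1..N-1    encoded obj value
 * Publishing the whole word with WRITE_ONCE() keeps a concurrent
 * pin_tag()/zs_free() from seeing a torn, half-written value.
 */
static void pin_tag_sketch(unsigned long handle)
{
	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
}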
@@ -1635,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj = obj_malloc(d_page, class, handle);
zs_object_copy(free_obj, used_obj, class);
index++;
/*
* record_obj updates handle's value to free_obj and it will
* invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
* breaks synchronization using pin_tag(e,g, zs_free) so
* let's keep the lock bit.
*/
free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);